Index: src/java/org/apache/lucene/collation/CollationKeyFilter.java =================================================================== --- src/java/org/apache/lucene/collation/CollationKeyFilter.java (revision 893891) +++ src/java/org/apache/lucene/collation/CollationKeyFilter.java (working copy) @@ -24,8 +24,6 @@ import org.apache.lucene.util.IndexableBinaryStringTools; import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; import java.text.Collator; @@ -94,15 +92,14 @@ char[] termBuffer = termAtt.termBuffer(); String termText = new String(termBuffer, 0, termAtt.termLength()); byte[] collationKey = collator.getCollationKey(termText).toByteArray(); - ByteBuffer collationKeyBuf = ByteBuffer.wrap(collationKey); int encodedLength - = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf); + = IndexableBinaryStringTools.getEncodedLength(collationKey.length); if (encodedLength > termBuffer.length) { termAtt.resizeTermBuffer(encodedLength); } termAtt.setTermLength(encodedLength); - CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer()); - IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer); + IndexableBinaryStringTools.encode(collationKey, 0, collationKey.length, + termAtt.termBuffer(), 0, encodedLength); return true; } else { return false; Index: src/java/org/apache/lucene/util/IndexableBinaryStringTools.java =================================================================== --- src/java/org/apache/lucene/util/IndexableBinaryStringTools.java (revision 893891) +++ src/java/org/apache/lucene/util/IndexableBinaryStringTools.java (working copy) @@ -72,39 +72,62 @@ * @return The number of chars required to encode the given byte sequence * @throws IllegalArgumentException If the given ByteBuffer is not backed by an array */ - public static int getEncodedLength(ByteBuffer original) + public static int getEncodedLength(ByteBuffer original) throws IllegalArgumentException { if (original.hasArray()) { - // Use 
long for intermediaries to protect against overflow - long length = (long)(original.limit() - original.arrayOffset()); - return (int)((length * 8L + 14L) / 15L) + 1; + return getEncodedLength(original.limit() - original.arrayOffset()); } else { throw new IllegalArgumentException("original argument must have a backing array"); } } + + /** + * Returns the number of chars required to encode the given number of bytes. + * + * @param numBytes The number of bytes to be encoded. + * @return The number of chars required to encode the number of bytes. + */ + public static int getEncodedLength(int numBytes) { + // Use long for intermediaries to protect against overflow + final long length = (long)(numBytes); + return (int)((length * 8L + 14L) / 15L) + 1; + } /** * Returns the number of bytes required to decode the given char sequence. * - * @param encoded The char sequence to be encoded. Must be backed by an array. + * @param encoded The char sequence to be decoded. Must be backed by an array. * @return The number of bytes required to decode the given char sequence * @throws IllegalArgumentException If the given CharBuffer is not backed by an array */ public static int getDecodedLength(CharBuffer encoded) throws IllegalArgumentException { if (encoded.hasArray()) { - int numChars = encoded.limit() - encoded.arrayOffset() - 1; - if (numChars <= 0) { - return 0; - } else { - int numFullBytesInFinalChar = encoded.charAt(encoded.limit() - 1); - int numEncodedChars = numChars - 1; - return (numEncodedChars * 15 + 7) / 8 + numFullBytesInFinalChar; - } + return getDecodedLength(encoded.array(), encoded.arrayOffset(), + encoded.limit() - encoded.arrayOffset()); } else { throw new IllegalArgumentException("encoded argument must have a backing array"); } } + + /** + * Returns the number of bytes required to decode the given char sequence. 
+ * + * @param encoded char sequence to be decoded + * @param offset initial offset + * @param length number of characters + * @return The number of bytes required to decode the given char sequence + */ + public static int getDecodedLength(char[] encoded, int offset, int length) { + final int numChars = length - 1; + if (numChars <= 0) { + return 0; + } else { + int numFullBytesInFinalChar = encoded[offset + length - 1]; + int numEncodedChars = numChars - 1; + return (numEncodedChars * 15 + 7) / 8 + numFullBytesInFinalChar; + } + } /** * Encodes the input byte sequence into the output char sequence. Before @@ -119,64 +142,83 @@ */ public static void encode(ByteBuffer input, CharBuffer output) { if (input.hasArray() && output.hasArray()) { - byte[] inputArray = input.array(); - int inputOffset = input.arrayOffset(); - int inputLength = input.limit() - inputOffset; - char[] outputArray = output.array(); - int outputOffset = output.arrayOffset(); - int outputLength = getEncodedLength(input); - output.limit(outputOffset + outputLength); // Set output final pos + 1 + final int inputOffset = input.arrayOffset(); + final int inputLength = input.limit() - inputOffset; + final int outputOffset = output.arrayOffset(); + final int outputLength = getEncodedLength(inputLength); + output.limit(outputLength + outputOffset); output.position(0); - if (inputLength > 0) { - int inputByteNum = inputOffset; - int caseNum = 0; - int outputCharNum = outputOffset; - CodingCase codingCase; - for ( ; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength ; - ++outputCharNum ) { - codingCase = CODING_CASES[caseNum]; - if (2 == codingCase.numBytes) { - outputArray[outputCharNum] - = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) - + (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) - & codingCase.finalMask) - & (short)0x7FFF); - } else { // numBytes is 3 - outputArray[outputCharNum] - = (char)(((inputArray[inputByteNum] & 0xFF) << 
codingCase.initialShift) - + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift) - + (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) - & codingCase.finalMask) - & (short)0x7FFF); - } - inputByteNum += codingCase.advanceBytes; - if (++caseNum == CODING_CASES.length) { - caseNum = 0; - } - } - // Produce final char (if any) and trailing count chars. + encode(input.array(), inputOffset, inputLength, output.array(), + outputOffset, outputLength); + } else { + throw new IllegalArgumentException("Arguments must have backing arrays"); + } + } + + /** + * Encodes the input byte sequence into the output char sequence. Before + * calling this method, ensure that the output array has sufficient + * capacity by calling {@link #getEncodedLength(int)}. + * + * @param inputArray byte sequence to be encoded + * @param inputOffset initial offset into inputArray + * @param inputLength number of bytes in inputArray + * @param outputArray char sequence to store encoded result + * @param outputOffset initial offset into outputArray + * @param outputLength length of output, should be + * at least getEncodedLength(inputLength) + */ + public static void encode(byte[] inputArray, int inputOffset, int inputLength, + char[] outputArray, int outputOffset, int outputLength) { + // TODO: maybe we should throw index out of bounds if this is exceeded? 
+ assert(outputLength >= getEncodedLength(inputLength)); + if (inputLength > 0) { + int inputByteNum = inputOffset; + int caseNum = 0; + int outputCharNum = outputOffset; + CodingCase codingCase; + for ( ; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength ; + ++outputCharNum ) { codingCase = CODING_CASES[caseNum]; - - if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3 - outputArray[outputCharNum++] - = (char)((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) - + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) + if (2 == codingCase.numBytes) { + outputArray[outputCharNum] + = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + + (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) + & codingCase.finalMask) & (short)0x7FFF); - // Add trailing char containing the number of full bytes in final char - outputArray[outputCharNum++] = (char)1; - } else if (inputByteNum < inputLength) { - outputArray[outputCharNum++] + } else { // numBytes is 3 + outputArray[outputCharNum] = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) - & (short)0x7FFF); - // Add trailing char containing the number of full bytes in final char - outputArray[outputCharNum++] = caseNum == 0 ? (char)1 : (char)0; - } else { // No left over bits - last char is completely filled. - // Add trailing char containing the number of full bytes in final char - outputArray[outputCharNum++] = (char)1; + + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift) + + (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) + & codingCase.finalMask) + & (short)0x7FFF); } + inputByteNum += codingCase.advanceBytes; + if (++caseNum == CODING_CASES.length) { + caseNum = 0; + } } - } else { - throw new IllegalArgumentException("Arguments must have backing arrays"); + // Produce final char (if any) and trailing count chars. 
+ codingCase = CODING_CASES[caseNum]; + + if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3 + outputArray[outputCharNum++] + = (char)((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) + & (short)0x7FFF); + // Add trailing char containing the number of full bytes in final char + outputArray[outputCharNum++] = (char)1; + } else if (inputByteNum < inputLength) { + outputArray[outputCharNum++] + = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + & (short)0x7FFF); + // Add trailing char containing the number of full bytes in final char + outputArray[outputCharNum++] = caseNum == 0 ? (char)1 : (char)0; + } else { // No left over bits - last char is completely filled. + // Add trailing char containing the number of full bytes in final char + outputArray[outputCharNum++] = (char)1; + } } } @@ -193,65 +235,91 @@ */ public static void decode(CharBuffer input, ByteBuffer output) { if (input.hasArray() && output.hasArray()) { - int numInputChars = input.limit() - input.arrayOffset() - 1; - int numOutputBytes = getDecodedLength(input); - output.limit(numOutputBytes + output.arrayOffset()); // Set output final pos + 1 + final int inputOffset = input.arrayOffset(); + final int inputLength = input.limit() - inputOffset; + final int outputOffset = output.arrayOffset(); + final int outputLength = getDecodedLength(input.array(), inputOffset, + inputLength); + output.limit(outputLength + outputOffset); output.position(0); - byte[] outputArray = output.array(); - char[] inputArray = input.array(); - if (numOutputBytes > 0) { - int caseNum = 0; - int outputByteNum = output.arrayOffset(); - int inputCharNum = input.arrayOffset(); - short inputChar; - CodingCase codingCase; - for ( ; inputCharNum < numInputChars - 1 ; ++inputCharNum) { - codingCase = CODING_CASES[caseNum]; - inputChar = (short)inputArray[inputCharNum]; - if (2 == codingCase.numBytes) { - if (0 
== caseNum) { - outputArray[outputByteNum] = (byte)(inputChar >>> codingCase.initialShift); - } else { - outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); - } - outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.finalMask) - << codingCase.finalShift); - } else { // numBytes is 3 + decode(input.array(), inputOffset, inputLength, output.array(), + outputOffset, outputLength); + } else { + throw new IllegalArgumentException("Arguments must have backing arrays"); + } + } + + /** + * Decodes the input char sequence into the output byte sequence. Before + * calling this method, ensure that the output array has sufficient + * capacity by calling {@link #getDecodedLength(char[], int, int)}. + * + * @param inputArray char sequence to be decoded + * @param inputOffset initial offset into inputArray + * @param inputLength number of chars in inputArray + * @param outputArray byte sequence to store decoded result + * @param outputOffset initial offset into outputArray + * @param outputLength length of output, should be + * at least getDecodedLength(inputArray, inputOffset, inputLength) + */ + public static void decode(char[] inputArray, int inputOffset, int inputLength, + byte[] outputArray, int outputOffset, int outputLength) { + // TODO: maybe we should throw index out of bounds if this is exceeded?
+ assert(outputLength >= + getDecodedLength(inputArray, inputOffset, inputLength)); + int numInputChars = inputLength - 1; + int numOutputBytes = outputLength; + + if (numOutputBytes > 0) { + int caseNum = 0; + int outputByteNum = outputOffset; + int inputCharNum = inputOffset; + short inputChar; + CodingCase codingCase; + for ( ; inputCharNum < numInputChars - 1 ; ++inputCharNum) { + codingCase = CODING_CASES[caseNum]; + inputChar = (short)inputArray[inputCharNum]; + if (2 == codingCase.numBytes) { + if (0 == caseNum) { + outputArray[outputByteNum] = (byte)(inputChar >>> codingCase.initialShift); + } else { outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); - outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask) - >>> codingCase.middleShift); - outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask) - << codingCase.finalShift); } - outputByteNum += codingCase.advanceBytes; - if (++caseNum == CODING_CASES.length) { - caseNum = 0; - } + outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.finalMask) + << codingCase.finalShift); + } else { // numBytes is 3 + outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); + outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask) + >>> codingCase.middleShift); + outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask) + << codingCase.finalShift); } - // Handle final char - inputChar = (short)inputArray[inputCharNum]; - codingCase = CODING_CASES[caseNum]; - if (0 == caseNum) { - outputArray[outputByteNum] = 0; + outputByteNum += codingCase.advanceBytes; + if (++caseNum == CODING_CASES.length) { + caseNum = 0; } - outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); - int bytesLeft = numOutputBytes - outputByteNum; - if (bytesLeft > 1) { - if (2 == codingCase.numBytes) { - outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.finalMask) - >>> 
codingCase.finalShift); - } else { // numBytes is 3 - outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask) - >>> codingCase.middleShift); - if (bytesLeft > 2) { - outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask) - << codingCase.finalShift); - } + } + // Handle final char + inputChar = (short)inputArray[inputCharNum]; + codingCase = CODING_CASES[caseNum]; + if (0 == caseNum) { + outputArray[outputByteNum] = 0; + } + outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); + int bytesLeft = numOutputBytes - outputByteNum; + if (bytesLeft > 1) { + if (2 == codingCase.numBytes) { + outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.finalMask) + >>> codingCase.finalShift); + } else { // numBytes is 3 + outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask) + >>> codingCase.middleShift); + if (bytesLeft > 2) { + outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask) + << codingCase.finalShift); } } } - } else { - throw new IllegalArgumentException("Arguments must have backing arrays"); } } Index: contrib/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java =================================================================== --- contrib/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (revision 893891) +++ contrib/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (working copy) @@ -23,13 +23,10 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.IndexableBinaryStringTools; import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; /** @@ -92,15 +89,14 @@ char[] termBuffer = termAtt.termBuffer(); String termText = new String(termBuffer, 0, termAtt.termLength()); 
collator.getRawCollationKey(termText, reusableKey); - ByteBuffer collationKeyBuf = ByteBuffer.wrap(reusableKey.bytes, 0, reusableKey.size); int encodedLength - = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf); + = IndexableBinaryStringTools.getEncodedLength(reusableKey.size); if (encodedLength > termBuffer.length) { termAtt.resizeTermBuffer(encodedLength); } termAtt.setTermLength(encodedLength); - CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer()); - IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer); + IndexableBinaryStringTools.encode(reusableKey.bytes, 0, reusableKey.size, + termAtt.termBuffer(), 0, encodedLength); return true; } else { return false;