Index: src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java =================================================================== --- src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java (revision 0) +++ src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java (revision 0) @@ -0,0 +1,189 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import junit.framework.TestCase; + +import java.util.Random; +import java.nio.CharBuffer; +import java.nio.ByteBuffer; + +public class TestIndexableBinaryStringTools extends TestCase { + private static final int NUM_RANDOM_TESTS = 20000; + private static final int MAX_RANDOM_BINARY_LENGTH = 300; + + public void testSingleBinaryRoundTrip() { + byte[] binary = new byte[] + { (byte)0x23, (byte)0x98, (byte)0x13, (byte)0xE4, (byte)0x76, (byte)0x41, + (byte)0xB2, (byte)0xC9, (byte)0x7F, (byte)0x0A, (byte)0xA6, (byte)0xD8 }; + + ByteBuffer binaryBuf = ByteBuffer.wrap(binary); + CharBuffer encoded = IndexableBinaryStringTools.encode(binaryBuf); + ByteBuffer decoded = IndexableBinaryStringTools.decode(encoded); + assertEquals("Round trip decode/decode returned different results:" + + System.getProperty("line.separator") + + "original: " + binaryDump(binaryBuf) + + System.getProperty("line.separator") + + " encoded: " + charArrayDump(encoded) + + System.getProperty("line.separator") + + " decoded: " + binaryDump(decoded), + binaryBuf, decoded); + } + + public void testEncodedSortability() { + Random random = new Random(33447); // Fixed seed for replicable test + byte[] originalArray1 = new byte[MAX_RANDOM_BINARY_LENGTH]; + ByteBuffer originalBuf1 = ByteBuffer.wrap(originalArray1); + char[] originalString1 = new char[MAX_RANDOM_BINARY_LENGTH]; + CharBuffer originalStringBuf1 = CharBuffer.wrap(originalString1); + char[] encoded1 = new char[IndexableBinaryStringTools.getEncodedLength(originalBuf1)]; + CharBuffer encodedBuf1 = CharBuffer.wrap(encoded1); + byte[] original2 = new byte[MAX_RANDOM_BINARY_LENGTH]; + ByteBuffer originalBuf2 = ByteBuffer.wrap(original2); + char[] originalString2 = new char[MAX_RANDOM_BINARY_LENGTH]; + CharBuffer originalStringBuf2 = CharBuffer.wrap(originalString2); + char[] encoded2 = new char[IndexableBinaryStringTools.getEncodedLength(originalBuf2)]; + CharBuffer encodedBuf2 = CharBuffer.wrap(encoded2); + for (int testNum = 0 ; testNum < 
NUM_RANDOM_TESTS ; ++testNum) { + int numBytes1 = random.nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1 + originalBuf1.limit(numBytes1); + originalStringBuf1.limit(numBytes1); + + for (int byteNum = 0 ; byteNum < numBytes1 ; ++byteNum) { + int randomInt = random.nextInt(0x100); + originalArray1[byteNum] = (byte) randomInt; + originalString1[byteNum] = (char)randomInt; + } + + int numBytes2 = random.nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1 + originalBuf2.limit(numBytes2); + originalStringBuf2.limit(numBytes2); + for (int byteNum = 0 ; byteNum < numBytes2 ; ++byteNum) { + int randomInt = random.nextInt(0x100); + original2[byteNum] = (byte)randomInt; + originalString2[byteNum] = (char)randomInt; + } + int originalComparison = originalStringBuf1.compareTo(originalStringBuf2); + originalComparison = originalComparison < 0 ? -1 : originalComparison > 0 ? 1 : 0; + + IndexableBinaryStringTools.encode(originalBuf1, encodedBuf1); + IndexableBinaryStringTools.encode(originalBuf2, encodedBuf2); + + int encodedComparison = encodedBuf1.compareTo(encodedBuf2); + encodedComparison = encodedComparison < 0 ? -1 : encodedComparison > 0 ? 
1 : 0; + + assertEquals("Test #" + (testNum + 1) + + ": Original bytes and encoded chars compare differently:" + + System.getProperty("line.separator") + + " binary 1: " + binaryDump(originalBuf1) + + System.getProperty("line.separator") + + " binary 2: " + binaryDump(originalBuf2) + + System.getProperty("line.separator") + + "encoded 1: " + charArrayDump(encodedBuf1) + + System.getProperty("line.separator") + + "encoded 2: " + charArrayDump(encodedBuf2) + + System.getProperty("line.separator"), + originalComparison, encodedComparison); + } + } + + public void testEmptyInput() { + byte[] binary = new byte[0]; + CharBuffer encoded = IndexableBinaryStringTools.encode(ByteBuffer.wrap(binary)); + ByteBuffer decoded = IndexableBinaryStringTools.decode(encoded); + assertNotNull("decode() returned null", decoded); + assertEquals("decoded empty input was not empty", decoded.limit(), 0); + } + + public void testAllNullInput() { + byte[] binary = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + ByteBuffer binaryBuf = ByteBuffer.wrap(binary); + CharBuffer encoded = IndexableBinaryStringTools.encode(binaryBuf); + assertNotNull("encode() returned null", encoded); + ByteBuffer decodedBuf = IndexableBinaryStringTools.decode(encoded); + assertNotNull("decode() returned null", decodedBuf); + assertEquals("Round trip decode/decode returned different results:" + + System.getProperty("line.separator") + + " original: " + binaryDump(binaryBuf) + + System.getProperty("line.separator") + + "decodedBuf: " + binaryDump(decodedBuf), + binaryBuf, decodedBuf); + } + + public void testRandomBinaryRoundTrip() { + Random random = new Random(1093733); // Fixed seed for replicable test + byte[] binary = new byte[MAX_RANDOM_BINARY_LENGTH]; + ByteBuffer binaryBuf = ByteBuffer.wrap(binary); + char[] encoded = new char[IndexableBinaryStringTools.getEncodedLength(binaryBuf)]; + CharBuffer encodedBuf = CharBuffer.wrap(encoded); + byte[] decoded = new byte[MAX_RANDOM_BINARY_LENGTH]; + ByteBuffer decodedBuf = 
ByteBuffer.wrap(decoded); + for (int testNum = 0 ; testNum < NUM_RANDOM_TESTS ; ++testNum) { + int numBytes = random.nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1 ; // Min == 1 + binaryBuf.limit(numBytes); + for (int byteNum = 0 ; byteNum < numBytes ; ++byteNum) { + binary[byteNum] = (byte)random.nextInt(0x100); + } + IndexableBinaryStringTools.encode(binaryBuf, encodedBuf); + IndexableBinaryStringTools.decode(encodedBuf, decodedBuf); + assertEquals("Test #" + (testNum + 1) + + ": Round trip decode/decode returned different results:" + + System.getProperty("line.separator") + + " original: " + binaryDump(binaryBuf) + + System.getProperty("line.separator") + + "encodedBuf: " + charArrayDump(encodedBuf) + + System.getProperty("line.separator") + + "decodedBuf: " + binaryDump(decodedBuf), + binaryBuf, decodedBuf); + } + } + + public String binaryDump(ByteBuffer binaryBuf) { + StringBuffer buf = new StringBuffer(); + int numBytes = binaryBuf.limit() - binaryBuf.arrayOffset(); + byte[] binary = binaryBuf.array(); + for (int byteNum = 0 ; byteNum < numBytes ; ++byteNum) { + String hex = Integer.toHexString((int)binary[byteNum] & 0xFF); + if (hex.length() == 1) { + buf.append('0'); + } + buf.append(hex.toUpperCase()); + if (byteNum < numBytes - 1) { + buf.append(' '); + } + } + return buf.toString(); + } + + public String charArrayDump(CharBuffer charBuf) { + StringBuffer buf = new StringBuffer(); + int numBytes = charBuf.limit() - charBuf.arrayOffset(); + char[] charArray = charBuf.array(); + for (int charNum = 0 ; charNum < numBytes ; ++charNum) { + String hex = Integer.toHexString((int)charArray[charNum]); + for (int digit = 0 ; digit < 4 - hex.length() ; ++digit) { + buf.append('0'); + } + buf.append(hex.toUpperCase()); + if (charNum < numBytes - 1) { + buf.append(' '); + } + } + return buf.toString(); + } +} Index: src/java/org/apache/lucene/util/IndexableBinaryStringTools.java =================================================================== --- 
src/java/org/apache/lucene/util/IndexableBinaryStringTools.java (revision 0) +++ src/java/org/apache/lucene/util/IndexableBinaryStringTools.java (revision 0) @@ -0,0 +1,311 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.nio.CharBuffer; +import java.nio.ByteBuffer; + +/** + * Provides support for converting byte sequences to Strings and back again. + * The resulting Strings preserve the original byte sequences' sort order. + * + * The Strings are constructed using a Base 8000h encoding of the original + * binary data - each char of an encoded String represents a 15-bit chunk + * from the byte sequence. Base 8000h was chosen because it allows for all + * lower 15 bits of char to be used without restriction; the surrogate range + * [U+D800-U+DFFF] does not represent valid chars, and would require + * complicated handling to avoid them and allow use of char's high bit. + * + * Although unset bits are used as padding in the final char, the original + * byte sequence could contain trailing bytes with no set bits (null bytes): + * padding is indistinguishable from valid information. 
To overcome this + * problem, a char is appended, indicating the number of encoded bytes in the + * final content char. + * + * This class's operations are defined over CharBuffers and ByteBuffers, to + * allow for wrapped arrays to be reused, reducing memory allocation costs for + * repeated operations. Note that this class calls array() and arrayOffset() + * on the CharBuffers and ByteBuffers it uses, so only wrapped arrays may be + * used. This class interprets the arrayOffset() and limit() values returned by + * its input buffers as beginning and end+1 positions on the wrapped array, + * respectively; similarly, on the output buffer, arrayOffset() is the first + * position written to, and limit() is set to one past the final output array + * position. + */ +public class IndexableBinaryStringTools { + private static final CodingCase[] CODING_CASES = { + // CodingCase(int initialShift, int finalShift) + new CodingCase( 7, 1 ), + // CodingCase(int initialShift, int middleShift, int finalShift) + new CodingCase(14, 6, 2), + new CodingCase(13, 5, 3), + new CodingCase(12, 4, 4), + new CodingCase(11, 3, 5), + new CodingCase(10, 2, 6), + new CodingCase( 9, 1, 7), + new CodingCase( 8, 0 ) + }; + + /** + * Returns the number of chars required to encode the given byte sequence. + * + * @param original The byte sequence to be encoded. Must be backed by an array. 
+ * @return The number of chars required to encode the given byte sequence + * @throws IllegalArgumentException If the given ByteBuffer is not backed by an array + */ + public static int getEncodedLength(ByteBuffer original) + throws IllegalArgumentException { + if (original.hasArray()) { + // Use long for intermediaries to protect against overflow + long length = (long)(original.limit() - original.arrayOffset()); + return (int)((length * 8L + 14L) / 15L) + 1; + } else { + throw new IllegalArgumentException("original argument must have a backing array"); + } + } + + /** + * Returns the number of bytes required to decode the given char sequence. + * + * @param encoded The char sequence to be decoded. Must be backed by an array. + * @return The number of bytes required to decode the given char sequence + * @throws IllegalArgumentException If the given CharBuffer is not backed by an array + */ + public static int getDecodedLength(CharBuffer encoded) + throws IllegalArgumentException { + if (encoded.hasArray()) { + int numChars = encoded.limit() - encoded.arrayOffset() - 1; + if (numChars <= 0) { + return 0; + } else { + int numFullBytesInFinalChar = encoded.charAt(encoded.limit() - 1); + int numEncodedChars = numChars - 1; + return (numEncodedChars * 15 + 7) / 8 + numFullBytesInFinalChar; + } + } else { + throw new IllegalArgumentException("encoded argument must have a backing array"); + } + } + + /** + * Encodes the input byte sequence into the output char sequence. Before + * calling this method, ensure that the output CharBuffer has sufficient + * capacity by calling {@link #getEncodedLength(java.nio.ByteBuffer)}. + * + * @param input The byte sequence to encode + * @param output Where the char sequence encoding result will go. The limit + * is set to one past the position of the final char. 
+ * @throws IllegalArgumentException If either the input or the output buffer + * is not backed by an array + */ + public static void encode(ByteBuffer input, CharBuffer output) { + if (input.hasArray() && output.hasArray()) { + byte[] inputArray = input.array(); + int inputOffset = input.arrayOffset(); + int inputLength = input.limit() - inputOffset; + char[] outputArray = output.array(); + int outputOffset = output.arrayOffset(); + int outputLength = getEncodedLength(input); + output.limit(outputOffset + outputLength); // Set output final pos + 1 + output.position(0); + if (inputLength > 0) { + int inputByteNum = inputOffset; + int caseNum = 0; + int outputCharNum = outputOffset; + CodingCase codingCase; + for ( ; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength ; + ++outputCharNum ) { + codingCase = CODING_CASES[caseNum]; + if (2 == codingCase.numBytes) { + outputArray[outputCharNum] + = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + + (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) + & codingCase.finalMask) + & (short)0x7FFF); + } else { // numBytes is 3 + outputArray[outputCharNum] + = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift) + + (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) + & codingCase.finalMask) + & (short)0x7FFF); + } + inputByteNum += codingCase.advanceBytes; + if (++caseNum == CODING_CASES.length) { + caseNum = 0; + } + } + // Produce final char (if any) and trailing count chars. 
+ codingCase = CODING_CASES[caseNum]; + + if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3 + outputArray[outputCharNum++] + = (char)((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) + & (short)0x7FFF); + // Add trailing char containing the number of full bytes in final char + outputArray[outputCharNum++] = (char)1; + } else if (inputByteNum < inputLength) { + outputArray[outputCharNum++] + = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + & (short)0x7FFF); + // Add trailing char containing the number of full bytes in final char + outputArray[outputCharNum++] = caseNum == 0 ? (char)1 : (char)0; + } else { // No left over bits - last char is completely filled. + // Add trailing char containing the number of full bytes in final char + outputArray[outputCharNum++] = (char)1; + } + } + } else { + throw new IllegalArgumentException("Arguments must have backing arrays"); + } + } + + /** + * Decodes the input char sequence into the output byte sequence. Before + * calling this method, ensure that the output ByteBuffer has sufficient + * capacity by calling {@link #getDecodedLength(java.nio.CharBuffer)}. + * + * @param input The char sequence to decode + * @param output Where the byte sequence decoding result will go. The limit + * is set to one past the position of the final char. 
+ * @throws IllegalArgumentException If either the input or the output buffer + * is not backed by an array + */ + public static void decode(CharBuffer input, ByteBuffer output) { + if (input.hasArray() && output.hasArray()) { + int numInputChars = input.limit() - input.arrayOffset() - 1; + int numOutputBytes = getDecodedLength(input); + output.limit(numOutputBytes + output.arrayOffset()); // Set output final pos + 1 + output.position(0); + byte[] outputArray = output.array(); + char[] inputArray = input.array(); + if (numOutputBytes > 0) { + int caseNum = 0; + int outputByteNum = output.arrayOffset(); + int inputCharNum = input.arrayOffset(); + short inputChar; + CodingCase codingCase; + for ( ; inputCharNum < numInputChars - 1 ; ++inputCharNum) { + codingCase = CODING_CASES[caseNum]; + inputChar = (short)inputArray[inputCharNum]; + if (2 == codingCase.numBytes) { + if (0 == caseNum) { + outputArray[outputByteNum] = (byte)(inputChar >>> codingCase.initialShift); + } else { + outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); + } + outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.finalMask) + << codingCase.finalShift); + } else { // numBytes is 3 + outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); + outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask) + >>> codingCase.middleShift); + outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask) + << codingCase.finalShift); + } + outputByteNum += codingCase.advanceBytes; + if (++caseNum == CODING_CASES.length) { + caseNum = 0; + } + } + // Handle final char + inputChar = (short)inputArray[inputCharNum]; + codingCase = CODING_CASES[caseNum]; + if (0 == caseNum) { + outputArray[outputByteNum] = 0; + } + outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); + int bytesLeft = numOutputBytes - outputByteNum; + if (bytesLeft > 1) { + if (2 == codingCase.numBytes) { + outputArray[outputByteNum + 1] 
= (byte)((inputChar & codingCase.finalMask) + >>> codingCase.finalShift); + } else { // numBytes is 3 + outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask) + >>> codingCase.middleShift); + if (bytesLeft > 2) { + outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask) + << codingCase.finalShift); + } + } + } + } + } else { + throw new IllegalArgumentException("Arguments must have backing arrays"); + } + } + + /** + * Decodes the given char sequence, which must have been encoded by + * {@link #encode(java.nio.ByteBuffer)} or + * {@link #encode(java.nio.ByteBuffer, java.nio.CharBuffer)}. + * + * @param input The char sequence to decode + * @return A byte sequence containing the decoding result. The limit + * is set to one past the position of the final char. + * @throws IllegalArgumentException If the input buffer is not backed by an + * array + */ + public static ByteBuffer decode(CharBuffer input) { + byte[] outputArray = new byte[getDecodedLength(input)]; + ByteBuffer output = ByteBuffer.wrap(outputArray); + decode(input, output); + return output; + } + + /** + * Encodes the input byte sequence. + * + * @param input The byte sequence to encode + * @return A char sequence containing the encoding result. The limit is set + * to one past the position of the final char. 
+ * @throws IllegalArgumentException If the input buffer is not backed by an + * array + */ + public static CharBuffer encode(ByteBuffer input) { + char[] outputArray = new char[getEncodedLength(input)]; + CharBuffer output = CharBuffer.wrap(outputArray); + encode(input, output); + return output; + } + + static class CodingCase { + int numBytes, initialShift, middleShift, finalShift, advanceBytes = 2; + short middleMask, finalMask; + + CodingCase(int initialShift, int middleShift, int finalShift) { + this.numBytes = 3; + this.initialShift = initialShift; + this.middleShift = middleShift; + this.finalShift = finalShift; + this.finalMask = (short)((short)0xFF >>> finalShift); + this.middleMask = (short)((short)0xFF << middleShift); + } + + CodingCase(int initialShift, int finalShift) { + this.numBytes = 2; + this.initialShift = initialShift; + this.finalShift = finalShift; + this.finalMask = (short)((short)0xFF >>> finalShift); + if (finalShift != 0) { + advanceBytes = 1; + } + } + } +} Index: src/site/src/documentation/content/xdocs/site.xml =================================================================== --- src/site/src/documentation/content/xdocs/site.xml (revision 709647) +++ src/site/src/documentation/content/xdocs/site.xml (working copy) @@ -53,6 +53,7 @@ + @@ -100,6 +101,7 @@ + Index: build.xml =================================================================== --- build.xml (revision 709647) +++ build.xml (working copy) @@ -239,6 +239,7 @@ + @@ -269,6 +270,7 @@ + Index: contrib/collation/pom.xml.template =================================================================== --- contrib/collation/pom.xml.template (revision 0) +++ contrib/collation/pom.xml.template (revision 0) @@ -0,0 +1,47 @@ + + + + 4.0.0 + + org.apache.lucene + lucene-contrib + @version@ + + org.apache.lucene + lucene-collation + + Lucene CollationKeyFilter/Analyzer & ICUCollationKeyFilter/Analyzer + + @version@ + + CollationKeyFilter, ICUCollationKeyFilter, CollationKeyAnalyzer, and + 
ICUCollationKeyAnalyzer - converts tokens into indexable collation keys + + jar + + + com.ibm.icu + icu4j + ${icu-version} + + + Index: contrib/collation/lib/ICU-LICENSE.txt =================================================================== --- contrib/collation/lib/ICU-LICENSE.txt (revision 0) +++ contrib/collation/lib/ICU-LICENSE.txt (revision 0) @@ -0,0 +1,33 @@ +ICU License - ICU 1.8.1 and later + +COPYRIGHT AND PERMISSION NOTICE + +Copyright (c) 1995-2008 International Business Machines Corporation and others + +All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, and/or sell copies of the +Software, and to permit persons to whom the Software is furnished to do so, +provided that the above copyright notice(s) and this permission notice appear +in all copies of the Software and that both the above copyright notice(s) and +this permission notice appear in supporting documentation. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE +LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER +IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +Except as contained in this notice, the name of a copyright holder shall not +be used in advertising or otherwise to promote the sale, use or other +dealings in this Software without prior written authorization of the +copyright holder. 
+ +All trademarks and registered trademarks mentioned herein are the property of +their respective owners. Index: contrib/collation/src/test/org/apache/lucene/collation/CollationTestBase.java =================================================================== --- contrib/collation/src/test/org/apache/lucene/collation/CollationTestBase.java (revision 0) +++ contrib/collation/src/test/org/apache/lucene/collation/CollationTestBase.java (revision 0) @@ -0,0 +1,310 @@ +package org.apache.lucene.collation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +import junit.framework.TestCase; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.PerFieldAnalyzerWrapper; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.RangeFilter; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.RangeQuery; +import org.apache.lucene.search.ConstantScoreRangeQuery; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Document; +import org.apache.lucene.util.IndexableBinaryStringTools; +import org.apache.lucene.queryParser.analyzing.AnalyzingQueryParser; + +import java.text.Collator; +import java.util.Locale; +import java.io.IOException; +import java.nio.CharBuffer; +import java.nio.ByteBuffer; + + +public class CollationTestBase extends TestCase { + + protected String firstRangeBeginningOriginal = "\u062F"; + protected String firstRangeEndOriginal = "\u0698"; + + protected String secondRangeBeginningOriginal = "\u0633"; + protected String secondRangeEndOriginal = "\u0638"; + + /** + * Convenience method to perform the same function as CollationKeyFilter. 
+ * + * @param keyBits the result from + * collator.getCollationKey(original).toByteArray() + * @return The encoded collation key for the original String + */ + protected String encodeCollationKey(byte[] keyBits) { + ByteBuffer begBuf = ByteBuffer.wrap(keyBits); + // Ensure that the backing char[] array is large enough to hold the encoded + // Binary String + char[] encodedBegArray + = new char[IndexableBinaryStringTools.getEncodedLength(begBuf)]; + CharBuffer encodedBegBuf = CharBuffer.wrap(encodedBegArray); + IndexableBinaryStringTools.encode(begBuf, encodedBegBuf); + return new String(encodedBegArray); + } + + public void testFarsiQueryParserCollating(Analyzer analyzer) throws Exception { + + RAMDirectory ramDir = new RAMDirectory(); + IndexWriter writer = new IndexWriter + (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + doc.add(new Field("content", "\u0633\u0627\u0628", + Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + writer.close(); + IndexSearcher is = new IndexSearcher(ramDir); + + AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer); + aqp.setLowercaseExpandedTerms(false); + + // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi + // orders the U+0698 character before the U+0633 character, so the single + // index Term below should NOT be returned by a ConstantScoreRangeQuery + // with a Farsi Collator (or an Arabic one for the case when Farsi is not + // supported). 
+ + // Test ConstantScoreRangeQuery + aqp.setUseOldRangeQuery(false); + ScoreDoc[] result + = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs; + assertEquals("The index Term should not be included.", 0, result.length); + + result = is.search(aqp.parse("[ \u0633 TO \u0638 ]"), null, 1000).scoreDocs; + assertEquals("The index Term should be included.", 1, result.length); + + // Test RangeQuery + aqp.setUseOldRangeQuery(true); + result = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs; + assertEquals("The index Term should not be included.", 0, result.length); + + result = is.search(aqp.parse("[ \u0633 TO \u0638 ]"), null, 1000).scoreDocs; + assertEquals("The index Term should be included.", 1, result.length); + + is.close(); + } + + + public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg, + String firstEnd, String secondBeg, + String secondEnd) throws Exception { + RAMDirectory ramDir = new RAMDirectory(); + IndexWriter writer = new IndexWriter + (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + doc.add(new Field("content", "\u0633\u0627\u0628", + Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("body", "body", + Field.Store.YES, Field.Index.NOT_ANALYZED)); + writer.addDocument(doc); + writer.close(); + IndexSearcher searcher = new IndexSearcher(ramDir); + Query query = new TermQuery(new Term("body","body")); + + // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi + // orders the U+0698 character before the U+0633 character, so the single + // index Term below should NOT be returned by a RangeFilter with a Farsi + // Collator (or an Arabic one for the case when Farsi searcher not + // supported). 
+ ScoreDoc[] result = searcher.search + (query, new RangeFilter("content", firstBeg, firstEnd, true, true), 1).scoreDocs; + assertEquals("The index Term should not be included.", 0, result.length); + + result = searcher.search + (query, new RangeFilter("content", secondBeg, secondEnd, true, true), 1).scoreDocs; + assertEquals("The index Term should be included.", 1, result.length); + + searcher.close(); + } + + public void testFarsiRangeQueryCollating(Analyzer analyzer, String firstBeg, + String firstEnd, String secondBeg, + String secondEnd) throws Exception { + RAMDirectory ramDir = new RAMDirectory(); + IndexWriter writer = new IndexWriter + (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + + // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi + // orders the U+0698 character before the U+0633 character, so the single + // index Term below should NOT be returned by a RangeQuery with a Farsi + // Collator (or an Arabic one for the case when Farsi is not supported). 
+ doc.add(new Field("content", "\u0633\u0627\u0628", + Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + writer.close(); + IndexSearcher searcher = new IndexSearcher(ramDir); + + Query query = new RangeQuery(new Term("content", firstBeg), + new Term("content", firstEnd), true); + ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals("The index Term should not be included.", 0, hits.length); + + query = new RangeQuery(new Term("content", secondBeg), + new Term("content", secondEnd), true); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals("The index Term should be included.", 1, hits.length); + searcher.close(); + } + + public void testFarsiConstantScoreRangeQuery + (Analyzer analyzer, String firstBeg, String firstEnd, + String secondBeg, String secondEnd) throws Exception { + + RAMDirectory farsiIndex = new RAMDirectory(); + IndexWriter writer = new IndexWriter + (farsiIndex, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + doc.add(new Field("content", "\u0633\u0627\u0628", + Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("body", "body", + Field.Store.YES, Field.Index.NOT_ANALYZED)); + writer.addDocument(doc); + writer.close(); + + IndexReader reader = IndexReader.open(farsiIndex); + IndexSearcher search = new IndexSearcher(reader); + + // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi + // orders the U+0698 character before the U+0633 character, so the single + // index Term below should NOT be returned by a ConstantScoreRangeQuery + // with a Farsi Collator (or an Arabic one for the case when Farsi is + // not supported). 
+ Query csrq + = new ConstantScoreRangeQuery("content", firstBeg, firstEnd, true, true); + ScoreDoc[] result = search.search(csrq, null, 1000).scoreDocs; + assertEquals("The index Term should not be included.", 0, result.length); + + csrq = new ConstantScoreRangeQuery + ("content", secondBeg, secondEnd, true, true); + result = search.search(csrq, null, 1000).scoreDocs; + assertEquals("The index Term should be included.", 1, result.length); + search.close(); + } + + // Test using various international locales with accented characters (which + // sort differently depending on locale) + // + // Copied (and slightly modified) from + // org.apache.lucene.search.TestSort.testInternationalSort() + // + public void testCollationKeySort(Analyzer usAnalyzer, + Analyzer franceAnalyzer, + Analyzer swedenAnalyzer, + Analyzer denmarkAnalyzer, + String usResult) throws Exception { + RAMDirectory indexStore = new RAMDirectory(); + PerFieldAnalyzerWrapper analyzer + = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer()); + analyzer.addAnalyzer("US", usAnalyzer); + analyzer.addAnalyzer("France", franceAnalyzer); + analyzer.addAnalyzer("Sweden", swedenAnalyzer); + analyzer.addAnalyzer("Denmark", denmarkAnalyzer); + IndexWriter writer = new IndexWriter + (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); + + // document data: + // the tracer field is used to determine which document was hit + String[][] sortData = new String[][] { + // tracer contents US France Sweden (sv_SE) Denmark (da_DK) + { "A", "x", "p\u00EAche", "p\u00EAche", "p\u00EAche", "p\u00EAche" }, + { "B", "y", "HAT", "HAT", "HAT", "HAT" }, + { "C", "x", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9" }, + { "D", "y", "HUT", "HUT", "HUT", "HUT" }, + { "E", "x", "peach", "peach", "peach", "peach" }, + { "F", "y", "H\u00C5T", "H\u00C5T", "H\u00C5T", "H\u00C5T" }, + { "G", "x", "sin", "sin", "sin", "sin" }, + { "H", "y", "H\u00D8T", "H\u00D8T", "H\u00D8T", "H\u00D8T" }, + { "I", 
"x", "s\u00EDn", "s\u00EDn", "s\u00EDn", "s\u00EDn" }, + { "J", "y", "HOT", "HOT", "HOT", "HOT" }, + }; + + for (int i = 0 ; i < sortData.length ; ++i) { + Document doc = new Document(); + doc.add(new Field("tracer", sortData[i][0], + Field.Store.YES, Field.Index.NO)); + doc.add(new Field("contents", sortData[i][1], + Field.Store.NO, Field.Index.ANALYZED)); + if (sortData[i][2] != null) + doc.add(new Field("US", sortData[i][2], + Field.Store.NO, Field.Index.ANALYZED)); + if (sortData[i][3] != null) + doc.add(new Field("France", sortData[i][3], + Field.Store.NO, Field.Index.ANALYZED)); + if (sortData[i][4] != null) + doc.add(new Field("Sweden", sortData[i][4], + Field.Store.NO, Field.Index.ANALYZED)); + if (sortData[i][5] != null) + doc.add(new Field("Denmark", sortData[i][5], + Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + } + writer.optimize(); + writer.close(); + Searcher searcher = new IndexSearcher(indexStore); + + Sort sort = new Sort(); + Query queryX = new TermQuery(new Term ("contents", "x")); + Query queryY = new TermQuery(new Term ("contents", "y")); + + sort.setSort(new SortField("US", SortField.STRING)); + assertMatches(searcher, queryY, sort, usResult); + + sort.setSort(new SortField("France", SortField.STRING)); + assertMatches(searcher, queryX, sort, "EACGI"); + + sort.setSort(new SortField("Sweden", SortField.STRING)); + assertMatches(searcher, queryY, sort, "BJDFH"); + + sort.setSort(new SortField("Denmark", SortField.STRING)); + assertMatches(searcher, queryY, sort, "BJDHF"); + } + + // Make sure the documents returned by the search match the expected list + // Copied from TestSort.java + private void assertMatches(Searcher searcher, Query query, Sort sort, + String expectedResult) throws IOException { + ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs; + StringBuffer buff = new StringBuffer(10); + int n = result.length; + for (int i = 0 ; i < n ; ++i) { + Document doc = searcher.doc(result[i].doc); + 
String[] v = doc.getValues("tracer"); + for (int j = 0 ; j < v.length ; ++j) { + buff.append(v[j]); + } + } + assertEquals(expectedResult, buff.toString()); + } +} Index: contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java =================================================================== --- contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (revision 0) +++ contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (revision 0) @@ -0,0 +1,82 @@ +package org.apache.lucene.collation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import org.apache.lucene.analysis.Analyzer; + +import java.text.Collator; +import java.util.Locale; + + +public class TestCollationKeyAnalyzer extends CollationTestBase { + + // Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in + // RuleBasedCollator. However, the Arabic Locale seems to order the Farsi + // characters properly. 
+ private Collator collator = Collator.getInstance(new Locale("ar")); + private Analyzer analyzer = new CollationKeyAnalyzer(collator); + + private String firstRangeBeginning = encodeCollationKey + (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()); + private String firstRangeEnd = encodeCollationKey + (collator.getCollationKey(firstRangeEndOriginal).toByteArray()); + private String secondRangeBeginning = encodeCollationKey + (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()); + private String secondRangeEnd = encodeCollationKey + (collator.getCollationKey(secondRangeEndOriginal).toByteArray()); + + + public void testFarsiQueryParserCollating() throws Exception { + testFarsiQueryParserCollating(analyzer); + } + + public void testFarsiRangeFilterCollating() throws Exception { + testFarsiRangeFilterCollating + (analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + public void testFarsiRangeQueryCollating() throws Exception { + testFarsiRangeQueryCollating + (analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + public void testFarsiConstantScoreRangeQuery() throws Exception { + testFarsiConstantScoreRangeQuery + (analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + public void testCollationKeySort() throws Exception { + Analyzer usAnalyzer + = new CollationKeyAnalyzer(Collator.getInstance(Locale.US)); + Analyzer franceAnalyzer + = new CollationKeyAnalyzer(Collator.getInstance(Locale.FRANCE)); + Analyzer swedenAnalyzer + = new CollationKeyAnalyzer(Collator.getInstance(new Locale("sv", "se"))); + Analyzer denmarkAnalyzer + = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk"))); + + // The ICU Collator and java.text.Collator implementations differ in their + // orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US. 
+ testCollationKeySort + (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJDH"); + } +} Index: contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java =================================================================== --- contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java (revision 0) +++ contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java (revision 0) @@ -0,0 +1,99 @@ +package org.apache.lucene.collation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.KeywordTokenizer; + +import java.text.Collator; +import java.util.Locale; +import java.io.Reader; + + +public class TestCollationKeyFilter extends CollationTestBase { + + // Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in + // RuleBasedCollator. However, the Arabic Locale seems to order the Farsi + // characters properly. 
+ private Collator collator = Collator.getInstance(new Locale("ar")); + private Analyzer analyzer = new TestAnalyzer(collator); + + private String firstRangeBeginning = encodeCollationKey + (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()); + private String firstRangeEnd = encodeCollationKey + (collator.getCollationKey(firstRangeEndOriginal).toByteArray()); + private String secondRangeBeginning = encodeCollationKey + (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()); + private String secondRangeEnd = encodeCollationKey + (collator.getCollationKey(secondRangeEndOriginal).toByteArray()); + + + public class TestAnalyzer extends Analyzer { + private Collator collator; + + TestAnalyzer(Collator collator) { + this.collator = collator; + } + + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new KeywordTokenizer(reader); + result = new CollationKeyFilter(result, collator); + return result; + } + } + + public void testFarsiQueryParserCollating() throws Exception { + testFarsiQueryParserCollating(analyzer); + } + + + public void testFarsiRangeFilterCollating() throws Exception { + testFarsiRangeFilterCollating + (analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + public void testFarsiRangeQueryCollating() throws Exception { + testFarsiRangeQueryCollating + (analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + public void testFarsiConstantScoreRangeQuery() throws Exception { + testFarsiConstantScoreRangeQuery + (analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + public void testCollationKeySort() throws Exception { + Analyzer usAnalyzer = new TestAnalyzer(Collator.getInstance(Locale.US)); + Analyzer franceAnalyzer + = new TestAnalyzer(Collator.getInstance(Locale.FRANCE)); + Analyzer swedenAnalyzer + = new TestAnalyzer(Collator.getInstance(new Locale("sv", "se"))); + 
Analyzer denmarkAnalyzer + = new TestAnalyzer(Collator.getInstance(new Locale("da", "dk"))); + + // The ICU Collator and java.text.Collator implementations differ in their + // orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US. + testCollationKeySort + (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJDH"); + } +} Index: contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java =================================================================== --- contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java (revision 0) +++ contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java (revision 0) @@ -0,0 +1,86 @@ +package org.apache.lucene.collation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +import com.ibm.icu.text.Collator; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.KeywordTokenizer; + +import java.io.Reader; +import java.util.Locale; + + +public class TestICUCollationKeyAnalyzer extends CollationTestBase { + + private Collator collator = Collator.getInstance(new Locale("fa")); + private Analyzer analyzer = new ICUCollationKeyAnalyzer(collator); + + private String firstRangeBeginning = encodeCollationKey + (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()); + private String firstRangeEnd = encodeCollationKey + (collator.getCollationKey(firstRangeEndOriginal).toByteArray()); + private String secondRangeBeginning = encodeCollationKey + (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()); + private String secondRangeEnd = encodeCollationKey + (collator.getCollationKey(secondRangeEndOriginal).toByteArray()); + + + public void testFarsiQueryParserCollating() throws Exception { + testFarsiQueryParserCollating(analyzer); + } + + public void testFarsiRangeFilterCollating() throws Exception { + testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + public void testFarsiRangeQueryCollating() throws Exception { + testFarsiRangeQueryCollating(analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + public void testFarsiConstantScoreRangeQuery() throws Exception { + testFarsiConstantScoreRangeQuery + (analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + // Test using various international locales with accented characters (which + // sort differently depending on locale) + // + // Copied (and slightly modified) from + // org.apache.lucene.search.TestSort.testInternationalSort() + // + public void testCollationKeySort() throws Exception { + Analyzer usAnalyzer = new 
ICUCollationKeyAnalyzer + (Collator.getInstance(Locale.US)); + Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer + (Collator.getInstance(Locale.FRANCE)); + Analyzer swedenAnalyzer = new ICUCollationKeyAnalyzer + (Collator.getInstance(new Locale("sv", "se"))); + Analyzer denmarkAnalyzer = new ICUCollationKeyAnalyzer + (Collator.getInstance(new Locale("da", "dk"))); + + // The ICU Collator and java.text.Collator implementations differ in their + // orderings - "BFJHD" is the ordering for the ICU Collator for Locale.US. + testCollationKeySort + (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJHD"); + } +} Index: contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java =================================================================== --- contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java (revision 0) +++ contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java (revision 0) @@ -0,0 +1,100 @@ +package org.apache.lucene.collation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +import com.ibm.icu.text.Collator; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.KeywordTokenizer; + +import java.io.Reader; +import java.util.Locale; + + +public class TestICUCollationKeyFilter extends CollationTestBase { + + private Collator collator = Collator.getInstance(new Locale("fa")); + private Analyzer analyzer = new TestAnalyzer(collator); + + private String firstRangeBeginning = encodeCollationKey + (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()); + private String firstRangeEnd = encodeCollationKey + (collator.getCollationKey(firstRangeEndOriginal).toByteArray()); + private String secondRangeBeginning = encodeCollationKey + (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()); + private String secondRangeEnd = encodeCollationKey + (collator.getCollationKey(secondRangeEndOriginal).toByteArray()); + + + public class TestAnalyzer extends Analyzer { + private Collator collator; + + TestAnalyzer(Collator collator) { + this.collator = collator; + } + + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new KeywordTokenizer(reader); + result = new ICUCollationKeyFilter(result, collator); + return result; + } + } + + public void testFarsiQueryParserCollating() throws Exception { + testFarsiQueryParserCollating(analyzer); + } + + + public void testFarsiRangeFilterCollating() throws Exception { + testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + public void testFarsiRangeQueryCollating() throws Exception { + testFarsiRangeQueryCollating(analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + public void testFarsiConstantScoreRangeQuery() throws Exception { + testFarsiConstantScoreRangeQuery + (analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + 
+ // Test using various international locales with accented characters (which + // sort differently depending on locale) + // + // Copied (and slightly modified) from + // org.apache.lucene.search.TestSort.testInternationalSort() + // + public void testCollationKeySort() throws Exception { + Analyzer usAnalyzer = new TestAnalyzer(Collator.getInstance(Locale.US)); + Analyzer franceAnalyzer + = new TestAnalyzer(Collator.getInstance(Locale.FRANCE)); + Analyzer swedenAnalyzer + = new TestAnalyzer(Collator.getInstance(new Locale("sv", "se"))); + Analyzer denmarkAnalyzer + = new TestAnalyzer(Collator.getInstance(new Locale("da", "dk"))); + + // The ICU Collator and java.text.Collator implementations differ in their + // orderings - "BFJHD" is the ordering for the ICU Collator for Locale.US. + testCollationKeySort + (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJHD"); + } +} Index: contrib/collation/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java =================================================================== --- contrib/collation/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java (revision 0) +++ contrib/collation/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java (revision 0) @@ -0,0 +1,105 @@ +package org.apache.lucene.collation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.KeywordTokenizer; +import org.apache.lucene.analysis.Tokenizer; + +import java.text.Collator; +import java.io.Reader; +import java.io.IOException; + +/** + *

+ * Filters {@link KeywordTokenizer} with {@link CollationKeyFilter}. + *

+ *

+ * Converts the token into its {@link java.text.CollationKey}, and then + * encodes the CollationKey with + * {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow + * it to be stored as an index term. + *

+ *

+ * WARNING: Make sure you use exactly the same Collator at + * index and query time -- CollationKeys are only comparable when produced by + * the same Collator. Since {@link java.text.RuleBasedCollator}s are not + * independently versioned, it is unsafe to search against stored + * CollationKeys unless the following are exactly the same (best practice is + * to store this information with the index and check that they remain the + * same at query time): + *

+ *
+ * <ol>
+ *   <li>JVM vendor</li>
+ *   <li>JVM version, including patch version</li>
+ *   <li>
+ *     The language (and country and variant, if specified) of the Locale
+ *     used when constructing the collator via
+ *     {@link Collator#getInstance(java.util.Locale)}.
+ *   </li>
+ *   <li>
+ *     The collation strength used - see {@link Collator#setStrength(int)}
+ *   </li>
+ * </ol>
+ *

+ * NB 1: {@link ICUCollationKeyAnalyzer} uses ICU4J's Collator, which makes + * its version available, thus allowing collation to be versioned + * independently from the JVM. + *

+ *

+ * NB 2: CollationKeys generated by java.text.Collators are not compatible
+ * with those generated by ICU Collators. Specifically, if you use
+ * CollationKeyAnalyzer to generate index terms, do not use
+ * ICUCollationKeyAnalyzer on the query side, or vice versa.
+ *

+ */ +public class CollationKeyAnalyzer extends Analyzer { + private Collator collator; + + CollationKeyAnalyzer(Collator collator) { + this.collator = collator; + } + + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new KeywordTokenizer(reader); + result = new CollationKeyFilter(result, collator); + return result; + } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + } + + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + + SavedStreams streams = (SavedStreams)getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new KeywordTokenizer(reader); + streams.result = new CollationKeyFilter(streams.source, collator); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } +} Index: contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java =================================================================== --- contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java (revision 0) +++ contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java (revision 0) @@ -0,0 +1,102 @@ +package org.apache.lucene.collation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.util.IndexableBinaryStringTools; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.text.Collator; + + +/** + *

+ * Converts each token into its {@link java.text.CollationKey}, and then + * encodes the CollationKey with {@link IndexableBinaryStringTools}, to allow + * it to be stored as an index term. + *

+ *

+ * WARNING: Make sure you use exactly the same Collator at + * index and query time -- CollationKeys are only comparable when produced by + * the same Collator. Since {@link java.text.RuleBasedCollator}s are not + * independently versioned, it is unsafe to search against stored + * CollationKeys unless the following are exactly the same (best practice is + * to store this information with the index and check that they remain the + * same at query time): + *

+ *
+ * <ol>
+ *   <li>JVM vendor</li>
+ *   <li>JVM version, including patch version</li>
+ *   <li>
+ *     The language (and country and variant, if specified) of the Locale
+ *     used when constructing the collator via
+ *     {@link Collator#getInstance(java.util.Locale)}.
+ *   </li>
+ *   <li>
+ *     The collation strength used - see {@link Collator#setStrength(int)}
+ *   </li>
+ * </ol>
+ *

+ * NB 1: {@link ICUCollationKeyFilter} uses ICU4J's Collator, which makes its + * version available, thus allowing collation to be versioned independently + * from the JVM. + *

+ *

+ * NB 2: CollationKeys generated by java.text.Collators are not compatible
+ * with those generated by ICU Collators. Specifically, if you use
+ * CollationKeyFilter to generate index terms, do not use
+ * {@link ICUCollationKeyFilter} on the query side, or vice versa.
+ *

+ */ +public class CollationKeyFilter extends TokenFilter { + private Collator collator = null; + + /** + * @param input Source token stream + * @param collator CollationKey generator + */ + public CollationKeyFilter(TokenStream input, Collator collator) { + super(input); + this.collator = collator; + } + + public final Token next(final Token reusableToken) throws IOException { + assert reusableToken != null; + Token nextToken = input.next(reusableToken); + if (nextToken != null) { + char[] termBuffer = nextToken.termBuffer(); + String termText = new String(termBuffer, 0, nextToken.termLength()); + byte[] collationKey = collator.getCollationKey(termText).toByteArray(); + ByteBuffer collationKeyBuf = ByteBuffer.wrap(collationKey); + int encodedLength + = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf); + if (encodedLength > termBuffer.length) { + nextToken.resizeTermBuffer(encodedLength); + } + nextToken.setTermLength(encodedLength); + CharBuffer wrappedTermBuffer = CharBuffer.wrap(nextToken.termBuffer()); + IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer); + } + return nextToken; + } +} Index: contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java =================================================================== --- contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java (revision 0) +++ contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java (revision 0) @@ -0,0 +1,96 @@ +package org.apache.lucene.collation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import com.ibm.icu.text.Collator; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.KeywordTokenizer; +import org.apache.lucene.analysis.Tokenizer; + +import java.io.Reader; +import java.io.IOException; + + +/** + *

+ * Filters {@link KeywordTokenizer} with {@link ICUCollationKeyFilter}. + *

+ * Converts the token into its {@link com.ibm.icu.text.CollationKey}, and + * then encodes the CollationKey with + * {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to + * be stored as an index term. + *

+ *

+ * WARNING: Make sure you use exactly the same Collator at + * index and query time -- CollationKeys are only comparable when produced by + * the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are + * independently versioned, so it is safe to search against stored + * CollationKeys if the following are exactly the same (best practice is + * to store this information with the index and check that they remain the + * same at query time): + *

+ *
+ * <ol>
+ *   <li>
+ *     Collator version - see {@link Collator#getVersion()}
+ *   </li>
+ *   <li>
+ *     The collation strength used - see {@link Collator#setStrength(int)}
+ *   </li>
+ * </ol>
+ *

+ * NB: CollationKeys generated by ICU Collators are not compatible with those + * generated by java.text.Collators. Specifically, if you use + * ICUCollationKeyAnalyzer to generate index terms, do not use + * {@link CollationKeyAnalyzer} on the query side, or vice versa. + *

+ */ +public class ICUCollationKeyAnalyzer extends Analyzer { + private Collator collator; + + ICUCollationKeyAnalyzer(Collator collator) { + this.collator = collator; + } + + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new KeywordTokenizer(reader); + result = new ICUCollationKeyFilter(result, collator); + return result; + } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + } + + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + + SavedStreams streams = (SavedStreams)getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new KeywordTokenizer(reader); + streams.result = new ICUCollationKeyFilter(streams.source, collator); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } +} Index: contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java =================================================================== --- contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (revision 0) +++ contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (revision 0) @@ -0,0 +1,94 @@ +package org.apache.lucene.collation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import com.ibm.icu.text.Collator; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.util.IndexableBinaryStringTools; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; + + +/** + *

+ * Converts each token into its {@link com.ibm.icu.text.CollationKey}, and + * then encodes the CollationKey with {@link IndexableBinaryStringTools}, to + * allow it to be stored as an index term. + *

+ *

+ * WARNING: Make sure you use exactly the same Collator at + * index and query time -- CollationKeys are only comparable when produced by + * the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are + * independently versioned, so it is safe to search against stored + * CollationKeys if the following are exactly the same (best practice is + * to store this information with the index and check that they remain the + * same at query time): + *

+ * <ol>
+ *   <li>
+ *     Collator version - see {@link Collator#getVersion()}
+ *   </li>
+ *   <li>
+ *     The collation strength used - see {@link Collator#setStrength(int)}
+ *   </li>
+ * </ol>

+ * NB: CollationKeys generated by ICU Collators are not compatible with those + * generated by java.text.Collators. Specifically, if you use + * ICUCollationKeyFilter to generate index terms, do not use + * {@link CollationKeyFilter} on the query side, or vice versa. + *

+ */ +public class ICUCollationKeyFilter extends TokenFilter { + private Collator collator = null; + + /** + * + * @param input Source token stream + * @param collator CollationKey generator + */ + public ICUCollationKeyFilter(TokenStream input, Collator collator) { + super(input); + this.collator = collator; + } + + public final Token next(final Token reusableToken) throws IOException { + assert reusableToken != null; + Token nextToken = input.next(reusableToken); + if (nextToken != null) { + char[] termBuffer = nextToken.termBuffer(); + String termText = new String(termBuffer, 0, nextToken.termLength()); + byte[] collationKey = collator.getCollationKey(termText).toByteArray(); + ByteBuffer collationKeyBuf = ByteBuffer.wrap(collationKey); + int encodedLength + = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf); + if (encodedLength > termBuffer.length) { + nextToken.resizeTermBuffer(encodedLength); + } + nextToken.setTermLength(encodedLength); + CharBuffer wrappedTermBuffer = CharBuffer.wrap(nextToken.termBuffer()); + IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer); + } + return nextToken; + } +} Index: contrib/collation/src/java/overview.html =================================================================== --- contrib/collation/src/java/overview.html (revision 0) +++ contrib/collation/src/java/overview.html (revision 0) @@ -0,0 +1,10 @@ + + + + Apache Lucene CollationKeyFilter/Analyzer and + ICUCollationKeyFilter/Analyzer + + + + + \ No newline at end of file Index: contrib/collation/build.xml =================================================================== --- contrib/collation/build.xml (revision 0) +++ contrib/collation/build.xml (revision 0) @@ -0,0 +1,37 @@ + + + + + + + + CollationKeyFilter, ICUCollationKeyFilter, CollationKeyAnalyzer, and + ICUCollationKeyAnalyzer - converts tokens into indexable collation keys + + + + + + + + + + Index: lucene-contrib-pom.xml.template 
=================================================================== --- lucene-contrib-pom.xml.template (revision 709647) +++ lucene-contrib-pom.xml.template (working copy) @@ -44,5 +44,6 @@ 3.1 1.7.0 1.4 + 4.0