Index: modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java =================================================================== --- modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java (revision 965632) +++ modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java (working copy) @@ -59,8 +59,10 @@ // // Copied (and slightly modified) from // org.apache.lucene.search.TestSort.testInternationalSort() - // - public void testCollationKeySort() throws Exception { + // + /** @deprecated remove this when ICUCollationKeyFilter is removed */ + @Deprecated + public void testCollationKeySortBackwards() throws Exception { Analyzer usAnalyzer = new ICUCollationKeyAnalyzer (Collator.getInstance(Locale.US)); Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer @@ -75,4 +77,20 @@ testCollationKeySort (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJHD"); } + + public void testCollationKeySort() throws Exception { + Analyzer usAnalyzer + = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.US)); + Analyzer franceAnalyzer + = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE)); + Analyzer swedenAnalyzer + = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se"))); + Analyzer denmarkAnalyzer + = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk"))); + + // The ICU Collator and java.text.Collator implementations differ in their + // orderings - "BFJHD" is the ordering for ICU Collator for Locale.US. 
+ testCollationKeySort + (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJHD"); + } } Index: modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java =================================================================== --- modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java (revision 965632) +++ modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java (working copy) @@ -26,7 +26,8 @@ import java.io.Reader; import java.util.Locale; - +/** @deprecated remove this when ICUCollationKeyFilter is removed */ +@Deprecated public class TestICUCollationKeyFilter extends CollationTestBase { private Collator collator = Collator.getInstance(new Locale("fa")); Index: modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java =================================================================== --- modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java (revision 0) +++ modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java (revision 0) @@ -0,0 +1,96 @@ +package org.apache.lucene.collation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.collation.tokenattributes.ICUCollatedTermAttributeImpl; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeSource; + +import com.ibm.icu.text.Collator; + +/** + *

+ * Converts each token into its {@link com.ibm.icu.text.CollationKey}, and + * then encodes bytes as an index term. + *

+ *

+ * WARNING: Make sure you use exactly the same Collator at + * index and query time -- CollationKeys are only comparable when produced by + * the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are + * independently versioned, so it is safe to search against stored + * CollationKeys if the following are exactly the same (best practice is + * to store this information with the index and check that they remain the + * same at query time): + *

+ *
    + *
  1. + * Collator version - see {@link Collator#getVersion()} + *
  2. + *
  3. + * The collation strength used - see {@link Collator#setStrength(int)} + *
  4. + *
+ *

+ * CollationKeys generated by ICU Collators are not compatible with those + * generated by java.text.Collators. Specifically, if you use + * ICUCollationAttributeFactory to generate index terms, do not use + * {@link CollationAttributeFactory} on the query side, or vice versa. + *

+ *

+ * ICUCollationAttributeFactory is significantly faster and generates significantly + * shorter keys than CollationAttributeFactory. See + * http://site.icu-project.org/charts/collation-icu4j-sun for key + * generation timing and key length comparisons between ICU4J and + * java.text.Collator over several languages. + *

+ */ +public class ICUCollationAttributeFactory extends AttributeSource.AttributeFactory { + private final Collator collator; + private final AttributeSource.AttributeFactory delegate; + + /** + * Create an ICUCollationAttributeFactory, using + * {@link AttributeSource.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY} as the + * factory for all other attributes. + * @param collator CollationKey generator + */ + public ICUCollationAttributeFactory(Collator collator) { + this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, collator); + } + + /** + * Create an ICUCollationAttributeFactory, using the supplied Attribute + * Factory as the factory for all other attributes. + * @param delegate Attribute Factory + * @param collator CollationKey generator + */ + public ICUCollationAttributeFactory(AttributeSource.AttributeFactory delegate, Collator collator) { + this.delegate = delegate; + this.collator = collator; + } + + @Override + public AttributeImpl createAttributeInstance( + Class attClass) { + return attClass.isAssignableFrom(ICUCollatedTermAttributeImpl.class) + ? 
new ICUCollatedTermAttributeImpl(collator) + : delegate.createAttributeInstance(attClass); + } +} Property changes on: modules\analysis\icu\src\java\org\apache\lucene\collation\ICUCollationAttributeFactory.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java =================================================================== --- modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java (revision 965632) +++ modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java (working copy) @@ -19,13 +19,12 @@ import com.ibm.icu.text.Collator; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.apache.lucene.util.IndexableBinaryStringTools; // javadoc @link +import org.apache.lucene.util.Version; import java.io.Reader; -import java.io.IOException; /** @@ -33,8 +32,8 @@ * Filters {@link KeywordTokenizer} with {@link ICUCollationKeyFilter}. *

* Converts the token into its {@link com.ibm.icu.text.CollationKey}, and - * then encodes the CollationKey with - * {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to + * then encodes the CollationKey either directly or with + * {@link IndexableBinaryStringTools} (see below), to allow it to * be stored as an index term. *

*

@@ -68,39 +67,49 @@ * generation timing and key length comparisons between ICU4J and * java.text.Collator over several languages. *

+ * + *

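You must specify the required {@link Version} + * compatibility when creating ICUCollationKeyAnalyzer: with {@link Version#LUCENE_40} or later, collation keys are encoded directly as bytes; with earlier versions they are encoded with {@link IndexableBinaryStringTools}. + *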

*/ -public final class ICUCollationKeyAnalyzer extends Analyzer { - private Collator collator; - - public ICUCollationKeyAnalyzer(Collator collator) { +public final class ICUCollationKeyAnalyzer extends ReusableAnalyzerBase { + private final Collator collator; + private final ICUCollationAttributeFactory factory; + private final Version matchVersion; + + /** + * Create a new ICUCollationKeyAnalyzer, using the specified collator. + * + * @param matchVersion See
above + * @param collator CollationKey generator + */ + public ICUCollationKeyAnalyzer(Version matchVersion, Collator collator) { + this.matchVersion = matchVersion; this.collator = collator; + this.factory = new ICUCollationAttributeFactory(collator); } - - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream result = new KeywordTokenizer(reader); - result = new ICUCollationKeyFilter(result, collator); - return result; - } - private class SavedStreams { - Tokenizer source; - TokenStream result; + /** + * @deprecated Use {@link ICUCollationKeyAnalyzer#ICUCollationKeyAnalyzer(Version, Collator)} + * and specify a version instead. This ctor will be removed in Lucene 5.0 + */ + @Deprecated + public ICUCollationKeyAnalyzer(Collator collator) { + this(Version.LUCENE_31, collator); } - + @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) - throws IOException { - - SavedStreams streams = (SavedStreams)getPreviousTokenStream(); - if (streams == null) { - streams = new SavedStreams(); - streams.source = new KeywordTokenizer(reader); - streams.result = new ICUCollationKeyFilter(streams.source, collator); - setPreviousTokenStream(streams); + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + if (matchVersion.onOrAfter(Version.LUCENE_40)) { + KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE); + return new TokenStreamComponents(tokenizer, tokenizer); } else { - streams.source.reset(reader); + KeywordTokenizer tokenizer = new KeywordTokenizer(reader); + return new TokenStreamComponents(tokenizer, new ICUCollationKeyFilter(tokenizer, collator)); } - return streams.result; } } Index: modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java =================================================================== --- 
modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (revision 965632) +++ modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (working copy) @@ -66,7 +66,10 @@ * generation timing and key length comparisons between ICU4J and * java.text.Collator over several languages. *

+ * @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes + * terms directly as bytes. This filter will be removed in Lucene 5.0 */ +@Deprecated public final class ICUCollationKeyFilter extends TokenFilter { private Collator collator = null; private RawCollationKey reusableKey = new RawCollationKey(); Index: modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java =================================================================== --- modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java (revision 0) +++ modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java (revision 0) @@ -0,0 +1,50 @@ +package org.apache.lucene.collation.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl; +import org.apache.lucene.util.BytesRef; + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RawCollationKey; + +/** + * Extension of {@link CharTermAttributeImpl} that encodes the term + * text as a binary Unicode collation key instead of as UTF-8 bytes. 
+ */ +public class ICUCollatedTermAttributeImpl extends CharTermAttributeImpl { + private final Collator collator; + private RawCollationKey key = new RawCollationKey(); + + /** + * Create a new ICUCollatedTermAttributeImpl + * @param collator Collation key generator + */ + public ICUCollatedTermAttributeImpl(Collator collator) { + this.collator = collator; + } + + @Override + public int toBytesRef(BytesRef target) { + collator.getRawCollationKey(toString(), key); + target.bytes = key.bytes; + target.offset = 0; + target.length = key.size; + return target.hashCode(); + } +} Property changes on: modules\analysis\icu\src\java\org\apache\lucene\collation\tokenattributes\ICUCollatedTermAttributeImpl.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (revision 965632) +++ modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (working copy) @@ -59,7 +59,9 @@ secondRangeBeginning, secondRangeEnd); } - public void testCollationKeySort() throws Exception { + /** @deprecated remove this when CollationKeyFilter is removed */ + @Deprecated + public void testCollationKeySortBackwards() throws Exception { Analyzer usAnalyzer = new CollationKeyAnalyzer(Collator.getInstance(Locale.US)); Analyzer franceAnalyzer @@ -74,4 +76,20 @@ testCollationKeySort (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJDH"); } + + public void testCollationKeySort() throws Exception { + Analyzer usAnalyzer + = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.US)); + Analyzer franceAnalyzer + = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE)); + Analyzer swedenAnalyzer + = new 
CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se"))); + Analyzer denmarkAnalyzer + = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk"))); + + // The ICU Collator and java.text.Collator implementations differ in their + // orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US. + testCollationKeySort + (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJDH"); + } } Index: modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java (revision 965632) +++ modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java (working copy) @@ -26,7 +26,8 @@ import java.util.Locale; import java.io.Reader; - +/** @deprecated remove this when CollationKeyFilter is removed */ +@Deprecated public class TestCollationKeyFilter extends CollationTestBase { // Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in Index: modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (revision 965632) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (working copy) @@ -29,8 +29,8 @@ * Emits the entire input as a single token. 
*/ public final class KeywordTokenizer extends Tokenizer { - - private static final int DEFAULT_BUFFER_SIZE = 256; + /** Default read buffer size */ + public static final int DEFAULT_BUFFER_SIZE = 256; private boolean done = false; private int finalOffset; Index: modules/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java (revision 0) @@ -0,0 +1,103 @@ +package org.apache.lucene.collation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.Collator; + +import org.apache.lucene.collation.tokenattributes.CollatedTermAttributeImpl; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeSource; + +/** + *

+ * Converts each token into its {@link java.text.CollationKey}, and then + * encodes the bytes as an index term. + *

+ *

+ * WARNING: Make sure you use exactly the same Collator at + * index and query time -- CollationKeys are only comparable when produced by + * the same Collator. Since {@link java.text.RuleBasedCollator}s are not + * independently versioned, it is unsafe to search against stored + * CollationKeys unless the following are exactly the same (best practice is + * to store this information with the index and check that they remain the + * same at query time): + *

+ *
    + *
  1. JVM vendor
  2. + *
  3. JVM version, including patch version
  4. + *
  5. + * The language (and country and variant, if specified) of the Locale + * used when constructing the collator via + * {@link Collator#getInstance(java.util.Locale)}. + *
  6. + *
  7. + * The collation strength used - see {@link Collator#setStrength(int)} + *
  8. + *
+ *

+ * The ICUCollationAttributeFactory in Lucene's ICU analysis + * module uses ICU4J's Collator, which makes its + * version available, thus allowing collation to be versioned independently + * from the JVM. ICUCollationAttributeFactory is also significantly faster and + * generates significantly shorter keys than CollationAttributeFactory. See + * http://site.icu-project.org/charts/collation-icu4j-sun for key + * generation timing and key length comparisons between ICU4J and + * java.text.Collator over several languages. + *

+ *

+ * CollationKeys generated by java.text.Collators are not compatible + * with those generated by ICU Collators. Specifically, if you use + * CollationAttributeFactory to generate index terms, do not use + * ICUCollationAttributeFactory on the query side, or vice versa. + *

+ */ +public class CollationAttributeFactory extends AttributeSource.AttributeFactory { + private final Collator collator; + private final AttributeSource.AttributeFactory delegate; + + /** + * Create a CollationAttributeFactory, using + * {@link AttributeSource.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY} as the + * factory for all other attributes. + * @param collator CollationKey generator + */ + public CollationAttributeFactory(Collator collator) { + this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, collator); + } + + /** + * Create a CollationAttributeFactory, using the supplied Attribute Factory + * as the factory for all other attributes. + * @param delegate Attribute Factory + * @param collator CollationKey generator + */ + public CollationAttributeFactory(AttributeSource.AttributeFactory delegate, Collator collator) { + this.delegate = delegate; + this.collator = collator; + } + + @Override + public AttributeImpl createAttributeInstance( + Class attClass) { + return attClass.isAssignableFrom(CollatedTermAttributeImpl.class) + ? 
new CollatedTermAttributeImpl(collator) + : delegate.createAttributeInstance(attClass); + } +} Property changes on: modules\analysis\common\src\java\org\apache\lucene\collation\CollationAttributeFactory.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java (revision 965632) +++ modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java (working copy) @@ -18,14 +18,13 @@ */ -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.apache.lucene.util.IndexableBinaryStringTools; // javadoc @link +import org.apache.lucene.util.Version; import java.text.Collator; import java.io.Reader; -import java.io.IOException; /** *

@@ -33,8 +32,8 @@ *

*

* Converts the token into its {@link java.text.CollationKey}, and then - * encodes the CollationKey with - * {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow + * encodes the CollationKey either directly or with + * {@link IndexableBinaryStringTools} (see below), to allow * it to be stored as an index term. *

*

@@ -75,39 +74,49 @@ * CollationKeyAnalyzer to generate index terms, do not use * ICUCollationKeyAnalyzer on the query side, or vice versa. *

+ * + *

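You must specify the required {@link Version} + * compatibility when creating CollationKeyAnalyzer: with {@link Version#LUCENE_40} or later, collation keys are encoded directly as bytes; with earlier versions they are encoded with {@link IndexableBinaryStringTools}. + *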

*/ -public final class CollationKeyAnalyzer extends Analyzer { - private Collator collator; - - public CollationKeyAnalyzer(Collator collator) { +public final class CollationKeyAnalyzer extends ReusableAnalyzerBase { + private final Collator collator; + private final CollationAttributeFactory factory; + private final Version matchVersion; + + /** + * Create a new CollationKeyAnalyzer, using the specified collator. + * + * @param matchVersion See
above + * @param collator CollationKey generator + */ + public CollationKeyAnalyzer(Version matchVersion, Collator collator) { + this.matchVersion = matchVersion; this.collator = collator; + this.factory = new CollationAttributeFactory(collator); } - - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream result = new KeywordTokenizer(reader); - result = new CollationKeyFilter(result, collator); - return result; - } - private class SavedStreams { - Tokenizer source; - TokenStream result; + /** + * @deprecated Use {@link CollationKeyAnalyzer#CollationKeyAnalyzer(Version, Collator)} + * and specify a version instead. This ctor will be removed in Lucene 5.0 + */ + @Deprecated + public CollationKeyAnalyzer(Collator collator) { + this(Version.LUCENE_31, collator); } - + @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) - throws IOException { - - SavedStreams streams = (SavedStreams)getPreviousTokenStream(); - if (streams == null) { - streams = new SavedStreams(); - streams.source = new KeywordTokenizer(reader); - streams.result = new CollationKeyFilter(streams.source, collator); - setPreviousTokenStream(streams); + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + if (matchVersion.onOrAfter(Version.LUCENE_40)) { + KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE); + return new TokenStreamComponents(tokenizer, tokenizer); } else { - streams.source.reset(reader); + KeywordTokenizer tokenizer = new KeywordTokenizer(reader); + return new TokenStreamComponents(tokenizer, new CollationKeyFilter(tokenizer, collator)); } - return streams.result; } } Index: modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java (revision 965632) +++ 
modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java (working copy) @@ -71,7 +71,10 @@ * CollationKeyFilter to generate index terms, do not use * ICUCollationKeyFilter on the query side, or vice versa. *

+ * @deprecated Use {@link CollationAttributeFactory} instead, which encodes + * terms directly as bytes. This filter will be removed in Lucene 5.0 */ +@Deprecated public final class CollationKeyFilter extends TokenFilter { private final Collator collator; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); Index: modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java (revision 0) @@ -0,0 +1,48 @@ +package org.apache.lucene.collation.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.Collator; + +import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl; +import org.apache.lucene.util.BytesRef; + +/** + * Extension of {@link CharTermAttributeImpl} that encodes the term + * text as a binary Unicode collation key instead of as UTF-8 bytes. 
+ */ +public class CollatedTermAttributeImpl extends CharTermAttributeImpl { + private final Collator collator; + + /** + * Create a new CollatedTermAttributeImpl + * @param collator Collation key generator + */ + public CollatedTermAttributeImpl(Collator collator) { + this.collator = collator; + } + + @Override + public int toBytesRef(BytesRef target) { + target.bytes = collator.getCollationKey(toString()).toByteArray(); + target.offset = 0; + target.length = target.bytes.length; + return target.hashCode(); + } + +} Property changes on: modules\analysis\common\src\java\org\apache\lucene\collation\tokenattributes\CollatedTermAttributeImpl.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java =================================================================== --- lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java (revision 965632) +++ lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java (working copy) @@ -21,6 +21,8 @@ import java.nio.CharBuffer; import java.nio.ByteBuffer; +/** @deprecated Remove this test class when IndexableBinaryStringTools is removed */ +@Deprecated public class TestIndexableBinaryStringTools extends LuceneTestCase { private static final int NUM_RANDOM_TESTS = 2000*_TestUtil.getRandomMultiplier(); private static final int MAX_RANDOM_BINARY_LENGTH = 300*_TestUtil.getRandomMultiplier(); Index: lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (revision 965632) +++ lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (working copy) @@ -77,7 +77,7 @@ } // *** TermToBytesRefAttribute interface *** - public final int toBytesRef(BytesRef target) { + 
public int toBytesRef(BytesRef target) { return UnicodeUtil.UTF16toUTF8WithHash(termBuffer, 0, termLength, target); } Index: lucene/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java =================================================================== --- lucene/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java (revision 965632) +++ lucene/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java (working copy) @@ -19,6 +19,7 @@ import java.nio.CharBuffer; import java.nio.ByteBuffer; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadoc @link /** * Provides support for converting byte sequences to Strings and back again. @@ -52,7 +53,10 @@ * will not correctly interpret buffers returned by {@link ByteBuffer#slice}. * * @lucene.experimental + * @deprecated Implement {@link TermToBytesRefAttribute} and store bytes directly + * instead. This class will be removed in Lucene 5.0 */ +@Deprecated public class IndexableBinaryStringTools { private static final CodingCase[] CODING_CASES = {