Index: modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
===================================================================
--- modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java (revision 965632)
+++ modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java (working copy)
@@ -59,8 +59,10 @@
//
// Copied (and slightly modified) from
// org.apache.lucene.search.TestSort.testInternationalSort()
- //
- public void testCollationKeySort() throws Exception {
+ //
+ /** @deprecated remove this when ICUCollationKeyFilter is removed */
+ @Deprecated
+ public void testCollationKeySortBackwards() throws Exception {
Analyzer usAnalyzer = new ICUCollationKeyAnalyzer
(Collator.getInstance(Locale.US));
Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer
@@ -75,4 +77,20 @@
testCollationKeySort
(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJHD");
}
+
+  public void testCollationKeySort() throws Exception {
+    // Version-aware constructor, one analyzer per locale under test.
+    Collator us = Collator.getInstance(Locale.US);
+    Analyzer usAnalyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, us);
+    Collator france = Collator.getInstance(Locale.FRANCE);
+    Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, france);
+    Collator sweden = Collator.getInstance(new Locale("sv", "se"));
+    Analyzer swedenAnalyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, sweden);
+    Collator denmark = Collator.getInstance(new Locale("da", "dk"));
+    Analyzer denmarkAnalyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, denmark);
+
+    // ICU and java.text collators do not order identically; "BFJHD" is the
+    // ordering the ICU Collator produces for Locale.US.
+    testCollationKeySort(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJHD");
+  }
}
Index: modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
===================================================================
--- modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java (revision 965632)
+++ modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java (working copy)
@@ -26,7 +26,8 @@
import java.io.Reader;
import java.util.Locale;
-
+/** @deprecated remove this when ICUCollationKeyFilter is removed */
+@Deprecated
public class TestICUCollationKeyFilter extends CollationTestBase {
private Collator collator = Collator.getInstance(new Locale("fa"));
Index: modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java
===================================================================
--- modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java (revision 0)
+++ modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java (revision 0)
@@ -0,0 +1,96 @@
+package org.apache.lucene.collation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.collation.tokenattributes.ICUCollatedTermAttributeImpl;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeSource;
+
+import com.ibm.icu.text.Collator;
+
+/**
+ *
+ * Converts each token into its {@link com.ibm.icu.text.CollationKey}, and
+ * then encodes bytes as an index term.
+ *
+ *
+ * WARNING: Make sure you use exactly the same Collator at
+ * index and query time -- CollationKeys are only comparable when produced by
+ * the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
+ * independently versioned, so it is safe to search against stored
+ * CollationKeys if the following are exactly the same (best practice is
+ * to store this information with the index and check that they remain the
+ * same at query time):
+ *
+ * <ul>
+ *   <li>
+ *     Collator version - see {@link Collator#getVersion()}
+ *   </li>
+ *   <li>
+ *     The collation strength used - see {@link Collator#setStrength(int)}
+ *   </li>
+ * </ul>
+ *
+ * CollationKeys generated by ICU Collators are not compatible with those
+ * generated by java.text.Collators. Specifically, if you use
+ * ICUCollationAttributeFactory to generate index terms, do not use
+ * {@link CollationAttributeFactory} on the query side, or vice versa.
+ *
+ *
+ * ICUCollationAttributeFactory is significantly faster and generates significantly
+ * shorter keys than CollationAttributeFactory. See
+ * http://site.icu-project.org/charts/collation-icu4j-sun for key
+ * generation timing and key length comparisons between ICU4J and
+ * java.text.Collator over several languages.
+ *
+ */
+public class ICUCollationAttributeFactory extends AttributeSource.AttributeFactory {
+  private final Collator collator;
+  private final AttributeSource.AttributeFactory delegate;
+
+  /**
+   * Create an ICUCollationAttributeFactory, using
+   * {@link AttributeSource.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY} as the
+   * factory for all other attributes.
+   * @param collator CollationKey generator
+   */
+  public ICUCollationAttributeFactory(Collator collator) {
+    this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, collator);
+  }
+
+  /**
+   * Create an ICUCollationAttributeFactory, using the supplied Attribute
+   * Factory as the factory for all other attributes.
+   * @param delegate Attribute Factory
+   * @param collator CollationKey generator
+   */
+  public ICUCollationAttributeFactory(AttributeSource.AttributeFactory delegate, Collator collator) {
+    this.delegate = delegate;
+    this.collator = collator;
+  }
+
+  /** Supplies the collating term attribute where it implements {@code attClass}; otherwise asks the delegate. */
+  @Override
+  public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
+    return attClass.isAssignableFrom(ICUCollatedTermAttributeImpl.class)
+        ? new ICUCollatedTermAttributeImpl(collator)
+        : delegate.createAttributeInstance(attClass);
+  }
+}
Property changes on: modules\analysis\icu\src\java\org\apache\lucene\collation\ICUCollationAttributeFactory.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java
===================================================================
--- modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java (revision 965632)
+++ modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java (working copy)
@@ -19,13 +19,12 @@
import com.ibm.icu.text.Collator;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.IndexableBinaryStringTools; // javadoc @link
+import org.apache.lucene.util.Version;
import java.io.Reader;
-import java.io.IOException;
/**
@@ -33,8 +32,8 @@
* Filters {@link KeywordTokenizer} with {@link ICUCollationKeyFilter}.
*
* Converts the token into its {@link com.ibm.icu.text.CollationKey}, and
- * then encodes the CollationKey with
- * {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to
+ * then encodes the CollationKey either directly or with
+ * {@link IndexableBinaryStringTools} (see below), to allow it to
* be stored as an index term.
*
*
@@ -68,39 +67,49 @@
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
*
+ *
+ * You must specify the required {@link Version}
+ * compatibility when creating ICUCollationKeyAnalyzer:
+ *
+ * - As of 4.0, Collation Keys are directly encoded as bytes. Previous
+ * versions will encode the bytes with {@link IndexableBinaryStringTools}.
+ *
*/
-public final class ICUCollationKeyAnalyzer extends Analyzer {
- private Collator collator;
-
- public ICUCollationKeyAnalyzer(Collator collator) {
+public final class ICUCollationKeyAnalyzer extends ReusableAnalyzerBase {
+ private final Collator collator;
+ private final ICUCollationAttributeFactory factory;
+ private final Version matchVersion;
+
+ /**
+ * Create a new ICUCollationKeyAnalyzer, using the specified collator.
+ *
+ * @param matchVersion See above
+ * @param collator CollationKey generator
+ */
+ public ICUCollationKeyAnalyzer(Version matchVersion, Collator collator) {
+ this.matchVersion = matchVersion;
this.collator = collator;
+ this.factory = new ICUCollationAttributeFactory(collator);
}
-
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new KeywordTokenizer(reader);
- result = new ICUCollationKeyFilter(result, collator);
- return result;
- }
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
+ /**
+ * @deprecated Use {@link ICUCollationKeyAnalyzer#ICUCollationKeyAnalyzer(Version, Collator)}
+ * and specify a version instead. This ctor will be removed in Lucene 5.0
+ */
+ @Deprecated
+ public ICUCollationKeyAnalyzer(Collator collator) {
+ this(Version.LUCENE_31, collator);
}
-
+
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
-
- SavedStreams streams = (SavedStreams)getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new KeywordTokenizer(reader);
- streams.result = new ICUCollationKeyFilter(streams.source, collator);
- setPreviousTokenStream(streams);
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ if (matchVersion.onOrAfter(Version.LUCENE_40)) {
+ KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ return new TokenStreamComponents(tokenizer, tokenizer);
} else {
- streams.source.reset(reader);
+ KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new ICUCollationKeyFilter(tokenizer, collator));
}
- return streams.result;
}
}
Index: modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java
===================================================================
--- modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (revision 965632)
+++ modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (working copy)
@@ -66,7 +66,10 @@
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
*
+ * @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes
+ * terms directly as bytes. This filter will be removed in Lucene 5.0
*/
+@Deprecated
public final class ICUCollationKeyFilter extends TokenFilter {
private Collator collator = null;
private RawCollationKey reusableKey = new RawCollationKey();
Index: modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java
===================================================================
--- modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java (revision 0)
+++ modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java (revision 0)
@@ -0,0 +1,50 @@
+package org.apache.lucene.collation.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
+import org.apache.lucene.util.BytesRef;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RawCollationKey;
+
+/**
+ * Extension of {@link CharTermAttributeImpl} that encodes the term
+ * text as a binary Unicode collation key instead of as UTF-8 bytes.
+ */
+public class ICUCollatedTermAttributeImpl extends CharTermAttributeImpl {
+  private final Collator collator;
+  private final RawCollationKey rawKey = new RawCollationKey();
+
+  /** @param collator Collation key generator */
+  public ICUCollatedTermAttributeImpl(Collator collator) {
+    this.collator = collator;
+  }
+
+  /**
+   * Points {@code target} at the term's collation key bytes (shared, not copied) and returns its hashCode().
+   */
+  @Override
+  public int toBytesRef(BytesRef target) {
+    collator.getRawCollationKey(toString(), rawKey);
+    target.offset = 0;
+    target.length = rawKey.size;
+    target.bytes = rawKey.bytes;
+    return target.hashCode();
+  }
+}
Property changes on: modules\analysis\icu\src\java\org\apache\lucene\collation\tokenattributes\ICUCollatedTermAttributeImpl.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (revision 965632)
+++ modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (working copy)
@@ -59,7 +59,9 @@
secondRangeBeginning, secondRangeEnd);
}
- public void testCollationKeySort() throws Exception {
+ /** @deprecated remove this when CollationKeyFilter is removed */
+ @Deprecated
+ public void testCollationKeySortBackwards() throws Exception {
Analyzer usAnalyzer
= new CollationKeyAnalyzer(Collator.getInstance(Locale.US));
Analyzer franceAnalyzer
@@ -74,4 +76,20 @@
testCollationKeySort
(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJDH");
}
+
+  public void testCollationKeySort() throws Exception {
+    // Version-aware constructor, one analyzer per locale under test.
+    Collator us = Collator.getInstance(Locale.US);
+    Analyzer usAnalyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, us);
+    Collator france = Collator.getInstance(Locale.FRANCE);
+    Analyzer franceAnalyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, france);
+    Collator sweden = Collator.getInstance(new Locale("sv", "se"));
+    Analyzer swedenAnalyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, sweden);
+    Collator denmark = Collator.getInstance(new Locale("da", "dk"));
+    Analyzer denmarkAnalyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, denmark);
+
+    // ICU and java.text collators do not order identically; "BFJDH" is the
+    // ordering java.text.Collator produces for Locale.US.
+    testCollationKeySort(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJDH");
+  }
}
Index: modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java (revision 965632)
+++ modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java (working copy)
@@ -26,7 +26,8 @@
import java.util.Locale;
import java.io.Reader;
-
+/** @deprecated remove this when CollationKeyFilter is removed */
+@Deprecated
public class TestCollationKeyFilter extends CollationTestBase {
// Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (revision 965632)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (working copy)
@@ -29,8 +29,8 @@
* Emits the entire input as a single token.
*/
public final class KeywordTokenizer extends Tokenizer {
-
- private static final int DEFAULT_BUFFER_SIZE = 256;
+ /** Default read buffer size */
+ public static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done = false;
private int finalOffset;
Index: modules/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java (revision 0)
@@ -0,0 +1,103 @@
+package org.apache.lucene.collation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.Collator;
+
+import org.apache.lucene.collation.tokenattributes.CollatedTermAttributeImpl;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ *
+ * Converts each token into its {@link java.text.CollationKey}, and then
+ * encodes the bytes as an index term.
+ *
+ *
+ * WARNING: Make sure you use exactly the same Collator at
+ * index and query time -- CollationKeys are only comparable when produced by
+ * the same Collator. Since {@link java.text.RuleBasedCollator}s are not
+ * independently versioned, it is unsafe to search against stored
+ * CollationKeys unless the following are exactly the same (best practice is
+ * to store this information with the index and check that they remain the
+ * same at query time):
+ *
+ * <ul>
+ *   <li>JVM vendor</li>
+ *   <li>JVM version, including patch version</li>
+ *   <li>
+ *     The language (and country and variant, if specified) of the Locale
+ *     used when constructing the collator via
+ *     {@link Collator#getInstance(java.util.Locale)}.
+ *   </li>
+ *   <li>
+ *     The collation strength used - see {@link Collator#setStrength(int)}
+ *   </li>
+ * </ul>
+ *
+ * The ICUCollationAttributeFactory in the icu package of Lucene's
+ * contrib area uses ICU4J's Collator, which makes its
+ * version available, thus allowing collation to be versioned independently
+ * from the JVM. ICUCollationAttributeFactory is also significantly faster and
+ * generates significantly shorter keys than CollationAttributeFactory. See
+ * http://site.icu-project.org/charts/collation-icu4j-sun for key
+ * generation timing and key length comparisons between ICU4J and
+ * java.text.Collator over several languages.
+ *
+ *
+ * CollationKeys generated by java.text.Collators are not compatible
+ * with those generated by ICU Collators. Specifically, if you use
+ * CollationAttributeFactory to generate index terms, do not use
+ * ICUCollationAttributeFactory on the query side, or vice versa.
+ *
+ */
+public class CollationAttributeFactory extends AttributeSource.AttributeFactory {
+  private final Collator collator;
+  private final AttributeSource.AttributeFactory delegate;
+
+  /**
+   * Create a CollationAttributeFactory, using
+   * {@link AttributeSource.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY} as the
+   * factory for all other attributes.
+   * @param collator CollationKey generator
+   */
+  public CollationAttributeFactory(Collator collator) {
+    this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, collator);
+  }
+
+  /**
+   * Create a CollationAttributeFactory, using the supplied Attribute Factory
+   * as the factory for all other attributes.
+   * @param delegate Attribute Factory
+   * @param collator CollationKey generator
+   */
+  public CollationAttributeFactory(AttributeSource.AttributeFactory delegate, Collator collator) {
+    this.delegate = delegate;
+    this.collator = collator;
+  }
+
+  /** Supplies the collating term attribute where it implements {@code attClass}; otherwise asks the delegate. */
+  @Override
+  public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
+    return attClass.isAssignableFrom(CollatedTermAttributeImpl.class)
+        ? new CollatedTermAttributeImpl(collator)
+        : delegate.createAttributeInstance(attClass);
+  }
+}
Property changes on: modules\analysis\common\src\java\org\apache\lucene\collation\CollationAttributeFactory.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java (revision 965632)
+++ modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java (working copy)
@@ -18,14 +18,13 @@
*/
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.IndexableBinaryStringTools; // javadoc @link
+import org.apache.lucene.util.Version;
import java.text.Collator;
import java.io.Reader;
-import java.io.IOException;
/**
*
@@ -33,8 +32,8 @@
*
*
* Converts the token into its {@link java.text.CollationKey}, and then
- * encodes the CollationKey with
- * {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow
+ * encodes the CollationKey either directly or with
+ * {@link IndexableBinaryStringTools} (see below), to allow
* it to be stored as an index term.
*
*
@@ -75,39 +74,49 @@
* CollationKeyAnalyzer to generate index terms, do not use
* ICUCollationKeyAnalyzer on the query side, or vice versa.
*
+ *
+ * You must specify the required {@link Version}
+ * compatibility when creating CollationKeyAnalyzer:
+ *
+ * - As of 4.0, Collation Keys are directly encoded as bytes. Previous
+ * versions will encode the bytes with {@link IndexableBinaryStringTools}.
+ *
*/
-public final class CollationKeyAnalyzer extends Analyzer {
- private Collator collator;
-
- public CollationKeyAnalyzer(Collator collator) {
+public final class CollationKeyAnalyzer extends ReusableAnalyzerBase {
+ private final Collator collator;
+ private final CollationAttributeFactory factory;
+ private final Version matchVersion;
+
+ /**
+ * Create a new CollationKeyAnalyzer, using the specified collator.
+ *
+ * @param matchVersion See above
+ * @param collator CollationKey generator
+ */
+ public CollationKeyAnalyzer(Version matchVersion, Collator collator) {
+ this.matchVersion = matchVersion;
this.collator = collator;
+ this.factory = new CollationAttributeFactory(collator);
}
-
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new KeywordTokenizer(reader);
- result = new CollationKeyFilter(result, collator);
- return result;
- }
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
+ /**
+ * @deprecated Use {@link CollationKeyAnalyzer#CollationKeyAnalyzer(Version, Collator)}
+ * and specify a version instead. This ctor will be removed in Lucene 5.0
+ */
+ @Deprecated
+ public CollationKeyAnalyzer(Collator collator) {
+ this(Version.LUCENE_31, collator);
}
-
+
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
-
- SavedStreams streams = (SavedStreams)getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new KeywordTokenizer(reader);
- streams.result = new CollationKeyFilter(streams.source, collator);
- setPreviousTokenStream(streams);
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ if (matchVersion.onOrAfter(Version.LUCENE_40)) {
+ KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ return new TokenStreamComponents(tokenizer, tokenizer);
} else {
- streams.source.reset(reader);
+ KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new CollationKeyFilter(tokenizer, collator));
}
- return streams.result;
}
}
Index: modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java (revision 965632)
+++ modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java (working copy)
@@ -71,7 +71,10 @@
* CollationKeyFilter to generate index terms, do not use
* ICUCollationKeyFilter on the query side, or vice versa.
*
+ * @deprecated Use {@link CollationAttributeFactory} instead, which encodes
+ * terms directly as bytes. This filter will be removed in Lucene 5.0
*/
+@Deprecated
public final class CollationKeyFilter extends TokenFilter {
private final Collator collator;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
Index: modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java (revision 0)
@@ -0,0 +1,48 @@
+package org.apache.lucene.collation.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.Collator;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Extension of {@link CharTermAttributeImpl} that encodes the term
+ * text as a binary Unicode collation key instead of as UTF-8 bytes.
+ */
+public class CollatedTermAttributeImpl extends CharTermAttributeImpl {
+  private final Collator collator;
+
+  /** @param collator Collation key generator */
+  public CollatedTermAttributeImpl(Collator collator) {
+    this.collator = collator;
+  }
+
+  /**
+   * Puts the byte form of the term's collation key into {@code target}; returns {@code target.hashCode()}.
+   */
+  @Override
+  public int toBytesRef(BytesRef target) {
+    final byte[] keyBytes = collator.getCollationKey(toString()).toByteArray();
+    target.length = keyBytes.length;
+    target.offset = 0;
+    target.bytes = keyBytes;
+    return target.hashCode();
+  }
+}
Property changes on: modules\analysis\common\src\java\org\apache\lucene\collation\tokenattributes\CollatedTermAttributeImpl.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java (revision 965632)
+++ lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java (working copy)
@@ -21,6 +21,8 @@
import java.nio.CharBuffer;
import java.nio.ByteBuffer;
+/** @deprecated Remove this test class when IndexableBinaryStringTools is removed */
+@Deprecated
public class TestIndexableBinaryStringTools extends LuceneTestCase {
private static final int NUM_RANDOM_TESTS = 2000*_TestUtil.getRandomMultiplier();
private static final int MAX_RANDOM_BINARY_LENGTH = 300*_TestUtil.getRandomMultiplier();
Index: lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (revision 965632)
+++ lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (working copy)
@@ -77,7 +77,7 @@
}
// *** TermToBytesRefAttribute interface ***
- public final int toBytesRef(BytesRef target) {
+ public int toBytesRef(BytesRef target) {
return UnicodeUtil.UTF16toUTF8WithHash(termBuffer, 0, termLength, target);
}
Index: lucene/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java (revision 965632)
+++ lucene/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java (working copy)
@@ -19,6 +19,7 @@
import java.nio.CharBuffer;
import java.nio.ByteBuffer;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadoc @link
/**
* Provides support for converting byte sequences to Strings and back again.
@@ -52,7 +53,10 @@
* will not correctly interpret buffers returned by {@link ByteBuffer#slice}.
*
* @lucene.experimental
+ * @deprecated Implement {@link TermToBytesRefAttribute} and store bytes directly
+ * instead. This class will be removed in Lucene 5.0
*/
+@Deprecated
public class IndexableBinaryStringTools {
private static final CodingCase[] CODING_CASES = {