Index: solr/src/test/org/apache/solr/analysis/TestHindiFilters.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestHindiFilters.java (revision 1032556) +++ solr/src/test/org/apache/solr/analysis/TestHindiFilters.java (working copy) @@ -28,23 +28,11 @@ */ public class TestHindiFilters extends BaseTokenTestCase { /** - * Test IndicTokenizerFactory - */ - public void testTokenizer() throws Exception { - Reader reader = new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।"); - IndicTokenizerFactory factory = new IndicTokenizerFactory(); - factory.init(DEFAULT_VERSION_PARAM); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" }); - } - - /** * Test IndicNormalizationFilterFactory */ public void testIndicNormalizer() throws Exception { Reader reader = new StringReader("ত্‍ अाैर"); - IndicTokenizerFactory factory = new IndicTokenizerFactory(); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); IndicNormalizationFilterFactory filterFactory = new IndicNormalizationFilterFactory(); factory.init(DEFAULT_VERSION_PARAM); filterFactory.init(DEFAULT_VERSION_PARAM); @@ -58,7 +46,7 @@ */ public void testHindiNormalizer() throws Exception { Reader reader = new StringReader("क़िताब"); - IndicTokenizerFactory factory = new IndicTokenizerFactory(); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory(); HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory(); factory.init(DEFAULT_VERSION_PARAM); @@ -74,7 +62,7 @@ */ public void testStemmer() throws Exception { Reader reader = new StringReader("किताबें"); - IndicTokenizerFactory factory = new IndicTokenizerFactory(); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); 
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory(); HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory(); HindiStemFilterFactory stemFactory = new HindiStemFilterFactory(); Index: solr/src/java/org/apache/solr/analysis/IndicTokenizerFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/IndicTokenizerFactory.java (revision 1032556) +++ solr/src/java/org/apache/solr/analysis/IndicTokenizerFactory.java (working copy) @@ -1,31 +0,0 @@ -package org.apache.solr.analysis; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.in.IndicTokenizer; - -/** Factory for {@link IndicTokenizer} */ -public class IndicTokenizerFactory extends BaseTokenizerFactory { - public Tokenizer create(Reader input) { - assureMatchVersion(); - return new IndicTokenizer(luceneMatchVersion, input); - } -} Index: modules/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java (revision 1032556) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java (working copy) @@ -1,44 +0,0 @@ -package org.apache.lucene.analysis.in; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; - -/** - * Test IndicTokenizer - */ -public class TestIndicTokenizer extends BaseTokenStreamTestCase { - /** Test tokenizing Indic vowels, signs, and punctuation */ - public void testBasics() throws IOException { - TokenStream ts = new IndicTokenizer(TEST_VERSION_CURRENT, - new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।")); - assertTokenStreamContents(ts, - new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" }); - } - - /** Test that words with format chars such as ZWJ are kept */ - public void testFormat() throws Exception { - TokenStream ts = new IndicTokenizer(TEST_VERSION_CURRENT, - new StringReader("शार्‍मा शार्‍मा")); - assertTokenStreamContents(ts, new String[] { "शार्‍मा", "शार्‍मा" }); - } -} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java (revision 0) @@ -0,0 +1,47 @@ +package org.apache.lucene.analysis.fa; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.CharStream; +import org.apache.lucene.analysis.charfilter.CharFilter; + +/** + * CharFilter that replaces instances of Zero-width non-joiner with an + * ordinary space. + */ +public class PersianCharFilter extends CharFilter { + + public PersianCharFilter(CharStream in) { + super(in); + } + + public int read(char[] cbuf, int off, int len) throws IOException { + final int charsRead = super.read(cbuf, off, len); + if (charsRead > 0) { + final int end = off + charsRead; + while (off < end) { + if (cbuf[off] == '\u200C') + cbuf[off] = ' '; + off++; + } + } + return charsRead; + } +} Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\fa\PersianCharFilter.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (revision 1032556) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (working copy) @@ -24,12 +24,15 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharReader; +import org.apache.lucene.analysis.CharStream; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.ar.ArabicLetterTokenizer; import 
org.apache.lucene.analysis.ar.ArabicNormalizationFilter; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.util.Version; @@ -140,14 +143,19 @@ * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents} - * built from a {@link ArabicLetterTokenizer} filtered with + * built from a {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}, * {@link PersianNormalizationFilter} and Persian Stop words */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader); + final Tokenizer source; + if (matchVersion.onOrAfter(Version.LUCENE_31)) { + source = new StandardTokenizer(matchVersion, new PersianCharFilter(CharReader.get(reader))); + } else { + source = new ArabicLetterTokenizer(matchVersion, reader); + } TokenStream result = new LowerCaseFilter(matchVersion, source); result = new ArabicNormalizationFilter(result); /* additional persian-specific normalization */ @@ -156,6 +164,12 @@ * the order here is important: the stopword list is normalized with the * above! */ - return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords)); + return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords)) { + public Reader charStream(Reader reader) { + return matchVersion.onOrAfter(Version.LUCENE_31) ? 
+ new PersianCharFilter(CharReader.get(reader)) : + reader; + } + }; } } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (revision 1032556) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (working copy) @@ -27,6 +27,7 @@ import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; @@ -167,7 +168,7 @@ * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents} - * built from an {@link ArabicLetterTokenizer} filtered with + * built from a {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link StopFilter}, * {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter} * if a stem exclusion set is provided and {@link ArabicStemFilter}. @@ -175,7 +176,8 @@ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader); + final Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31) ? + new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader); TokenStream result = new LowerCaseFilter(matchVersion, source); // the order here is important: the stopword list is not normalized!
result = new StopFilter( matchVersion, result, stopwords); Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java (revision 1032556) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.core.LetterTokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; @@ -38,7 +39,9 @@ * detect token characters. See {@link #isTokenChar(int)} and * {@link #normalize(int)} for details. * + * @deprecated Use {@link StandardTokenizer} instead. */ +@Deprecated public class ArabicLetterTokenizer extends LetterTokenizer { Index: modules/analysis/common/src/java/org/apache/lucene/analysis/util/ReusableAnalyzerBase.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/util/ReusableAnalyzerBase.java (revision 1032556) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/util/ReusableAnalyzerBase.java (working copy) @@ -150,13 +150,20 @@ * if the component's reset method throws an {@link IOException} */ protected boolean reset(final Reader reader) throws IOException { - source.reset(reader); + source.reset(charStream(reader)); if(sink != source) sink.reset(); // only reset if the sink reference is different from source return true; } /** + * Override this if you want to add a CharFilter chain. + */ + public Reader charStream(Reader reader) { + return reader; + } + + /** * Returns the sink {@link TokenStream} * * @return the sink {@link TokenStream}