Index: contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java =================================================================== --- contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java (revision 804680) +++ contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java (working copy) @@ -109,6 +109,7 @@ streams.source = new WhitespaceTokenizer(reader); streams.result = new LowerCaseFilter(streams.source); streams.result = new SynonymTokenFilter(streams.result, synonyms, maxSynonyms); + setPreviousTokenStream(streams); } else { streams.source.reset(reader); streams.result.reset(); // reset the SynonymTokenFilter Index: contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java =================================================================== --- contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java (revision 804680) +++ contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java (working copy) @@ -57,7 +57,7 @@ * This test is the same as the above, except using an ideographic space as a separator. * This tests to ensure the stopwords are working correctly. */ - public void testChineseStopWordsDefaultTwoPhrasesIdeoSpache() throws Exception { + public void testChineseStopWordsDefaultTwoPhrasesIdeoSpace() throws Exception { Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */ String sentence = "我购买了道具和服装 我购买了道具和服装。"; String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" }; @@ -101,6 +101,52 @@ new String[] { "我", "购买", "test", "了", "道具", "和", "服装"}); } + /* + * Numerics are parsed as their own tokens + */ + public void testNumerics() throws Exception { + assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 Tests 了道具和服装1234", + new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"}); + } + + /* + * Full width alphas and numerics are folded to half-width + */ + public void testFullWidth() throws Exception { + assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 Tests 了道具和服装1234", + new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"}); + } + + /* + * Presentation form delimiters are removed + */ + public void testDelimiters() throws Exception { + assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买︱ Tests 了道具和服装", + new String[] { "我", "购买", "test", "了", "道具", "和", "服装"}); + } + + /* + * Text from writing systems other than Chinese and Latin are parsed as individual characters. + * (regardless of Unicode category) + */ + public void testNonChinese() throws Exception { + assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 روبرتTests 了道具和服装", + new String[] { "我", "购买", "ر", "و", "ب", "ر", "ت", "test", "了", "道具", "和", "服装"}); + } + + /* + * Test what the analyzer does with out-of-vocabulary words. + * In this case the name is Yousaf Raza Gillani. + * Currently it is being analyzed into single characters... 
+ */ + public void testOOV() throws Exception { + assertAnalyzesTo(new SmartChineseAnalyzer(true), "优素福·拉扎·吉拉尼", + new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" }); + + assertAnalyzesTo(new SmartChineseAnalyzer(true), "优素福拉扎吉拉尼", + new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" }); + } + public void testOffsets() throws Exception { assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买了道具和服装", new String[] { "我", "购买", "了", "道具", "和", "服装" }, Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (revision 804680) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (working copy) @@ -90,12 +90,12 @@ } private void check(final String input, final String expected) throws IOException { - StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input)); - GermanStemFilter filter = new GermanStemFilter(tokenStream); - TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); - assertTrue(filter.incrementToken()); + Analyzer a = new GermanAnalyzer(); + TokenStream tokenStream = a.tokenStream("dummy", new StringReader(input)); + TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class); + assertTrue(tokenStream.incrementToken()); assertEquals(expected, termAtt.term()); - filter.close(); + tokenStream.close(); } private void checkReuse(Analyzer a, String input, String expected) throws IOException { Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (revision 804680) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (working copy) @@ -336,7 +336,9 @@ * @throws IOException */ public void testMatrix() throws IOException { - + // some other tests set this to null. + // set it here in case tests are run out of the usual order. + ShingleMatrixFilter.defaultSettingsCodec = new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec(); Matrix matrix = new Matrix(); matrix.new Column(tokenFactory("no", 1)); Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (revision 804680) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (working copy) @@ -18,12 +18,15 @@ */ import java.io.IOException; +import java.io.Reader; import java.io.StringReader; import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; @@ -59,6 +62,76 @@ new int[] { 1, 2, 3 }); } + /* + * Analyzer that just uses ChineseTokenizer, not ChineseFilter. 
+ * convenience to show the behavior of the tokenizer + */ + private class JustChineseTokenizerAnalyzer extends Analyzer { + public TokenStream tokenStream(String fieldName, Reader reader) { + return new ChineseTokenizer(reader); + } + } + + /* + * Analyzer that just uses ChineseFilter, not ChineseTokenizer. + * convenience to show the behavior of the filter. + */ + private class JustChineseFilterAnalyzer extends Analyzer { + public TokenStream tokenStream(String fieldName, Reader reader) { + return new ChineseFilter(new WhitespaceTokenizer(reader)); + } + } + + /* + * ChineseTokenizer tokenizes numbers as one token, but they are filtered by ChineseFilter + */ + public void testNumerics() throws Exception + { + Analyzer justTokenizer = new JustChineseTokenizerAnalyzer(); + assertAnalyzesTo(justTokenizer, "中1234", new String[] { "中", "1234" }); + + // in this case the ChineseAnalyzer (which applies ChineseFilter) will remove the numeric token. + Analyzer a = new ChineseAnalyzer(); + assertAnalyzesTo(a, "中1234", new String[] { "中" }); + } + + /* + * ChineseTokenizer tokenizes english similar to SimpleAnalyzer. + * it will lowercase terms automatically. + * + * ChineseFilter has an english stopword list, it also removes any single character tokens. + * the stopword list is case-sensitive. + */ + public void testEnglish() throws Exception + { + Analyzer chinese = new ChineseAnalyzer(); + assertAnalyzesTo(chinese, "This is a Test. b c d", + new String[] { "test" }); + + Analyzer justTokenizer = new JustChineseTokenizerAnalyzer(); + assertAnalyzesTo(justTokenizer, "This is a Test. b c d", + new String[] { "this", "is", "a", "test", "b", "c", "d" }); + + Analyzer justFilter = new JustChineseFilterAnalyzer(); + assertAnalyzesTo(justFilter, "This is a Test. b c d", + new String[] { "This", "Test." 
}); + } + + private void assertAnalyzesTo(Analyzer a, String input, String[] output) + throws Exception { + TokenStream ts = a.tokenStream("dummy", new StringReader(input)); + TermAttribute termAtt = (TermAttribute) ts + .getAttribute(TermAttribute.class); + + for (int i = 0; i < output.length; i++) { + assertTrue(ts.incrementToken()); + assertEquals(output[i], termAtt.term()); + } + + assertFalse(ts.incrementToken()); + ts.close(); + } + private void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws Exception { Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (revision 804680) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (working copy) @@ -118,6 +118,14 @@ check("quiosque", "quiosqu"); } + public void testNormalization() throws Exception { + check("Brasil", "brasil"); // lowercase by default + check("Brasília", "brasil"); // remove diacritics + check("quimio5terápicos", "quimio5terapicos"); // contains non-letter, diacritic will still be removed + check("áá", "áá"); // token is too short: diacritics are not removed + check("ááá", "aaa"); // normally, diacritics are removed + } + public void testReusableTokenStream() throws Exception { Analyzer a = new BrazilianAnalyzer(); checkReuse(a, "boa", "boa"); @@ -126,6 +134,11 @@ checkReuse(a, "bôas", "boas"); // removes diacritic: different from snowball portugese } + public void testStemExclusionTable() throws Exception { + BrazilianAnalyzer a = new BrazilianAnalyzer(); + a.setStemExclusionTable(new String[] { "quintessência" }); + checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged. + } private void check(final String input, final String expected) throws IOException { Analyzer analyzer = new BrazilianAnalyzer(); Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java (revision 804680) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java (working copy) @@ -18,9 +18,11 @@ import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LetterTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; @@ -35,6 +37,7 @@ import java.io.IOException; import java.io.Reader; +import java.io.StringReader; public class QueryAutoStopWordAnalyzerTest extends TestCase { String variedFieldValues[] = {"the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog"}; @@ -162,4 +165,37 @@ Hits h = search(a, "repetitiveField:boring"); assertFalse(h.length() == 0); } + + /* + * analyzer that does not support reuse + * it is LetterTokenizer on odd invocations, WhitespaceTokenizer on even. 
+ */ + private class NonreusableAnalyzer extends Analyzer { + int invocationCount = 0; + public TokenStream tokenStream(String fieldName, Reader reader) { + if (++invocationCount % 2 == 0) + return new WhitespaceTokenizer(reader); + else + return new LetterTokenizer(reader); + } + } + + public void testWrappingNonReusableAnalyzer() throws Exception { + QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(new NonreusableAnalyzer()); + a.addStopWords(reader, 10); + Hits h = search(a, "repetitiveField:boring"); + assertTrue(h.length() == 0); + h = search(a, "repetitiveField:vaguelyboring"); + assertTrue(h.length() == 0); + } + + public void testTokenStream() throws Exception { + QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(new WhitespaceAnalyzer()); + a.addStopWords(reader, 10); + TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring")); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); + assertTrue(ts.incrementToken()); + assertEquals("this", termAtt.term()); + assertFalse(ts.incrementToken()); + } } Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (revision 804680) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (working copy) @@ -169,6 +169,66 @@ checkCJKToken(str, out_tokens); } + /* + * Full-width text is normalized to half-width + */ + public void testFullWidth() throws Exception { + String str = "Test 1234"; + TestToken[] out_tokens = { + newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE), + newToken("1234", 5, 9, CJKTokenizer.SINGLE_TOKEN_TYPE) + }; + checkCJKToken(str, out_tokens); + } + + /* + * Non-english text (not just CJK) is treated the same as CJK: C1C2 C2C3 + */ + public void testNonIdeographic() throws Exception { + String str = "\u4e00 روبرت موير"; + TestToken[] out_tokens = { + newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("رو", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("وب", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("بر", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("رت", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("مو", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("وي", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("ير", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE) + }; + checkCJKToken(str, out_tokens); + } + + /* + * Non-english text with nonletters (non-spacing marks,etc) is treated as C1C2 C2C3, + * except for words are split around non-letters. 
+ */ + public void testNonIdeographicNonLetter() throws Exception { + String str = "\u4e00 رُوبرت موير"; + TestToken[] out_tokens = { + newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("ر", 2, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("وب", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("بر", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("رت", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("مو", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("وي", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("ير", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE) + }; + checkCJKToken(str, out_tokens); + } + + public void testTokenStream() throws Exception { + Analyzer analyzer = new CJKAnalyzer(); + TokenStream ts = analyzer.tokenStream("dummy", new StringReader("\u4e00\u4e01\u4e02")); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); + assertTrue(ts.incrementToken()); + assertEquals("\u4e00\u4e01", termAtt.term()); + assertTrue(ts.incrementToken()); + assertEquals("\u4e01\u4e02", termAtt.term()); + assertFalse(ts.incrementToken()); + } + public void testReusableTokenStream() throws Exception { Analyzer analyzer = new CJKAnalyzer(); String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053"; Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (working copy) @@ -118,10 +118,11 @@ * Create a set of words from an array * The resulting Set does case insensitive matching * TODO We should look for a faster dictionary lookup approach. - * @param dictionary - * @return + * @param dictionary + * @return {@link Set} of lowercased terms */ public static final Set makeDictionary(final String[] dictionary) { + // is the below really case insensitive? CharArraySet dict = new CharArraySet(dictionary.length, false); addAllLowerCase(dict, Arrays.asList(dictionary)); return dict; Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java (working copy) @@ -110,7 +110,7 @@ /** * Read hyphenation patterns from an XML file. 
* - * @param filename the filename + * @param f the filename * @throws HyphenationException In case the parsing fails */ public void loadPatterns(File f) throws HyphenationException { Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java (working copy) @@ -21,18 +21,21 @@ import java.util.Set; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; // for javadocs import org.apache.lucene.analysis.TokenStream; /** - * A TokenFilter that decomposes compound words found in many germanic languages + * A {@link TokenFilter} that decomposes compound words found in many Germanic languages. + *

* "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find * "Donaudampfschiff" even when you only enter "schiff". * It uses a brute-force algorithm to achieve this. + *

*/ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase { /** * - * @param input the token stream to process + * @param input the {@link TokenStream} to process * @param dictionary the word dictionary to match against * @param minWordSize only words longer than this get processed * @param minSubwordSize only subwords longer than this get to the output stream @@ -46,7 +49,7 @@ /** * - * @param input the token stream to process + * @param input the {@link TokenStream} to process * @param dictionary the word dictionary to match against */ public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary) { @@ -55,7 +58,7 @@ /** * - * @param input the token stream to process + * @param input the {@link TokenStream} to process * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain * lower case strings. */ @@ -65,7 +68,7 @@ /** * - * @param input the token stream to process + * @param input the {@link TokenStream} to process * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain * lower case strings. * @param minWordSize only words longer than this get processed Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (working copy) @@ -24,16 +24,19 @@ import java.util.Set; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; // for javadocs import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.compound.hyphenation.Hyphenation; import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; import org.xml.sax.InputSource; /** - * A TokenFilter that decomposes compound words found in many germanic languages + * A {@link TokenFilter} that decomposes compound words found in many Germanic languages. + *

* "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find - * "Donaudampfschiff" even when you only enter "schiff" It uses a hyphenation + * "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation * grammar and a word dictionary to achieve this. + *

*/ public class HyphenationCompoundWordTokenFilter extends CompoundWordTokenFilterBase { @@ -41,7 +44,7 @@ /** * - * @param input the token stream to process + * @param input the {@link TokenStream} to process * @param hyphenator the hyphenation pattern tree to use for hyphenation * @param dictionary the word dictionary to match against * @param minWordSize only words longer than this get processed @@ -60,7 +63,7 @@ /** * - * @param input the token stream to process + * @param input the {@link TokenStream} to process * @param hyphenator the hyphenation pattern tree to use for hyphenation * @param dictionary the word dictionary to match against */ @@ -72,7 +75,7 @@ /** * - * @param input the token stream to process + * @param input the {@link TokenStream} to process * @param hyphenator the hyphenation pattern tree to use for hyphenation * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain * lower case strings. @@ -85,7 +88,7 @@ /** * - * @param input the token stream to process + * @param input the {@link TokenStream} to process * @param hyphenator the hyphenation pattern tree to use for hyphenation * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain * lower case strings. Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/package.html =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/package.html (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/package.html (working copy) @@ -5,7 +5,7 @@ A filter that decomposes compound words you find in many Germanic -languages to the word parts. This example shows what it does: +languages into the word parts. This example shows what it does: Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (working copy) @@ -36,12 +36,12 @@ import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; /** - * Analyzer for Persian. - * - * Analyzer uses {@link ArabicLetterTokenizer} which implies tokenizing around - * ZWNJ in addition to space. Some persian-specific variant forms (such as farsi + * {@link Analyzer} for Persian. + *

+ * This Analyzer uses {@link ArabicLetterTokenizer} which implies tokenizing around + * zero-width non-joiner in addition to whitespace. Some Persian-specific variant forms (such as Farsi * yeh and keheh) are standardized. "Stemming" is accomplished via stopwords. - * + *
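A minimal usage sketch (illustration only, not part of this patch) of the zero-width non-joiner behavior described above; the field name and the sample word (a Persian noun plus plural suffix joined by ZWNJ, written with Unicode escapes) are placeholders, and the stop filter may drop further tokens.

    Analyzer analyzer = new PersianAnalyzer(); // default Persian stopwords
    TokenStream ts = analyzer.tokenStream("body",
        new StringReader("\u06A9\u062A\u0627\u0628\u200C\u0647\u0627")); // \u200C is the ZWNJ
    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(termAtt.term()); // the ZWNJ acts as a token boundary
    }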

*/ public final class PersianAnalyzer extends Analyzer { @@ -107,11 +107,13 @@ } /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. + * Creates a {@link TokenStream} which tokenizes all the text in the provided + * {@link Reader}. * - * @return A TokenStream build from a ArabicLetterTokenizer filtered with - * LowerCaseFilter, ArabicNormalizationFilter, - * PersianNormalizationFilter and Persian Stop words + * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer} + * filtered with {@link LowerCaseFilter}, + * {@link ArabicNormalizationFilter}, + * {@link PersianNormalizationFilter} and Persian Stop words */ public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new ArabicLetterTokenizer(reader); @@ -134,12 +136,13 @@ } /** - * Returns a (possibly reused) TokenStream which tokenizes all the text - * in the provided Reader. + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text + * in the provided {@link Reader}. * - * @return A TokenStream build from a ArabicLetterTokenizer filtered with - * LowerCaseFilter, ArabicNormalizationFilter, - * PersianNormalizationFilter and Persian Stop words + * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer} + * filtered with {@link LowerCaseFilter}, + * {@link ArabicNormalizationFilter}, + * {@link PersianNormalizationFilter} and Persian Stop words */ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java (working copy) @@ -24,7 +24,7 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * A TokenFilter that applies {@link PersianNormalizer} to normalize the + * A {@link TokenFilter} that applies {@link PersianNormalizer} to normalize the * orthography. * */ Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (working copy) @@ -19,14 +19,19 @@ import java.io.Reader; import org.apache.lucene.analysis.CharTokenizer; +import org.apache.lucene.analysis.Tokenizer; // for javadocs +import org.apache.lucene.analysis.LetterTokenizer; // for javadocs /** - * A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters - * in a given "russian charset". The problem with LeterTokenizer is that it uses Character.isLetter() method, + * A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer} + * by additionally looking up letters in a given "russian charset". + *

+ * The problem with + * {@link LetterTokenizer} is that it uses the {@link Character#isLetter(char)} method, * which doesn't know how to detect letters in encodings like CP1252 and KOI8 * (well-known problems with 0xD7 and 0xF7 chars) + *

* - * * @version $Id$ */ Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (working copy) @@ -20,7 +20,6 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java (working copy) @@ -19,11 +19,12 @@ /** * RussianCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation * for russian characters in Unicode, KOI8 and CP1252. + *

* Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters. * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset * and adding logic to toLowerCase() method for that charset. + *

* - * * @version $Id$ */ public class RussianCharsets Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (working copy) @@ -17,7 +17,6 @@ * limitations under the License. */ -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; @@ -25,11 +24,13 @@ import java.io.IOException; /** - * A filter that stems Russian words. The implementation was inspired by GermanStemFilter. - * The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter , - * because RussianStemFilter only works with lowercase part of any "russian" charset. + * A {@link TokenFilter} that stems Russian words. + *

+ * The implementation was inspired by GermanStemFilter. + * The input should be filtered by {@link RussianLowerCaseFilter} before passing it to RussianStemFilter, + * because RussianStemFilter only works with the lowercase part of any "russian" charset. + *
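A minimal sketch (illustration only, not part of this patch) of the ordering described above, assuming the charset-based constructors RussianLetterTokenizer(Reader, char[]), RussianLowerCaseFilter(TokenStream, char[]) and RussianStemFilter(TokenStream, char[]) with RussianCharsets.UnicodeRussian, and an arbitrary java.io.Reader named reader; RussianAnalyzer builds this same chain (plus a StopFilter) internally.

    TokenStream ts = new RussianLetterTokenizer(reader, RussianCharsets.UnicodeRussian);
    ts = new RussianLowerCaseFilter(ts, RussianCharsets.UnicodeRussian); // lowercase first
    ts = new RussianStemFilter(ts, RussianCharsets.UnicodeRussian);      // then stem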

* - * * @version $Id$ */ public final class RussianStemFilter extends TokenFilter @@ -66,7 +67,7 @@ /** - * Set a alternative/custom RussianStemmer for this filter. + * Set a alternative/custom {@link RussianStemmer} for this filter. */ public void setStemmer(RussianStemmer stemmer) { Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (working copy) @@ -29,11 +29,13 @@ import org.apache.lucene.analysis.Tokenizer; /** - * Analyzer for Russian language. Supports an external list of stopwords (words that + * {@link Analyzer} for Russian language. + *

+ * Supports an external list of stopwords (words that * will not be indexed at all). * A default set of stopwords is used unless an alternative list is specified. + *

* - * * @version $Id$ */ public final class RussianAnalyzer extends Analyzer @@ -246,10 +248,13 @@ } /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. + * Creates a {@link TokenStream} which tokenizes all the text in the + * provided {@link Reader}. * - * @return A TokenStream built from a RussianLetterTokenizer filtered with - * RussianLowerCaseFilter, StopFilter, and RussianStemFilter + * @return A {@link TokenStream} built from a + * {@link RussianLetterTokenizer} filtered with + * {@link RussianLowerCaseFilter}, {@link StopFilter}, + * and {@link RussianStemFilter} */ public TokenStream tokenStream(String fieldName, Reader reader) { @@ -266,11 +271,13 @@ }; /** - * Returns a (possibly reused) TokenStream which tokenizes all the text - * in the provided Reader. + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text + * in the provided {@link Reader}. * - * @return A TokenStream built from a RussianLetterTokenizer filtered with - * RussianLowerCaseFilter, StopFilter, and RussianStemFilter + * @return A {@link TokenStream} built from a + * {@link RussianLetterTokenizer} filtered with + * {@link RussianLowerCaseFilter}, {@link StopFilter}, + * and {@link RussianStemFilter} */ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (working copy) @@ -35,12 +35,14 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; /** - * Analyzer for German language. Supports an external list of stopwords (words that + * {@link Analyzer} for German language. + *

+ * Supports an external list of stopwords (words that * will not be indexed at all) and an external list of exclusions (word that will * not be stemmed, but indexed). - * A default set of stopwords is used unless an alternative list is specified, the + * A default set of stopwords is used unless an alternative list is specified, but the * exclusion list is empty by default. - * + *
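A minimal sketch (illustration only, not part of this patch) of the exclusion-list behavior described above; setStemExclusionTable(String[]) is the same overload exercised by the BrazilianAnalyzer test added elsewhere in this patch, and the sample word is arbitrary.

    GermanAnalyzer a = new GermanAnalyzer(); // default GERMAN_STOP_WORDS, empty exclusion list
    a.setStemExclusionTable(new String[] { "schweinerei" });
    // "schweinerei" now bypasses GermanStemFilter, while other terms are still stemmed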

* * @version $Id$ */ @@ -65,7 +67,7 @@ }; /** - * Contains the stopwords used with the StopFilter. + * Contains the stopwords used with the {@link StopFilter}. */ private Set stopSet = new HashSet(); @@ -75,8 +77,8 @@ private Set exclusionSet = new HashSet(); /** - * Builds an analyzer with the default stop words - * (GERMAN_STOP_WORDS). + * Builds an analyzer with the default stop words: + * {@link #GERMAN_STOP_WORDS}. */ public GermanAnalyzer() { stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS); @@ -115,7 +117,7 @@ } /** - * Builds an exclusionlist from a Hashtable. + * Builds an exclusionlist from a {@link Map} */ public void setStemExclusionTable(Map exclusionlist) { exclusionSet = new HashSet(exclusionlist.keySet()); @@ -129,10 +131,11 @@ } /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. + * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * - * @return A TokenStream built from a StandardTokenizer filtered with - * StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter + * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with + * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and + * {@link GermanStemFilter} */ public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer(reader); @@ -149,11 +152,12 @@ }; /** - * Returns a (possibly reused) TokenStream which tokenizes all the text - * in the provided Reader. + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text + * in the provided {@link Reader}. * - * @return A TokenStream built from a StandardTokenizer filtered with - * StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter + * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with + * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and + * {@link GermanStemFilter} */ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { if (overridesTokenStreamMethod) { Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemmer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemmer.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemmer.java (working copy) @@ -19,11 +19,13 @@ */ /** - * A stemmer for German words. The algorithm is based on the report + * A stemmer for German words. + *

+ * The algorithm is based on the report * "A Fast and Simple Stemming Algorithm for German Words" by Jörg * Caumanns (joerg.caumanns at isst.fhg.de). + *

* - * * @version $Id$ */ public class GermanStemmer Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (working copy) @@ -25,11 +25,13 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * A filter that stems German words. It supports a table of words that should + * A {@link TokenFilter} that stems German words. + *

+ * It supports a table of words that should * not be stemmed at all. The stemmer used can be changed at runtime after the - * filter object is created (as long as it is a GermanStemmer). + * filter object is created (as long as it is a {@link GermanStemmer}). + *
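A minimal sketch (illustration only, not part of this patch) of the runtime stemmer swap described above, using the setStemmer(GermanStemmer) method shown later in this hunk; the anonymous subclass body and the reader variable are placeholders.

    GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(reader));
    filter.setStemmer(new GermanStemmer() {
      // override stem(String) here to customize the algorithm
    });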

* - * * @version $Id$ */ public final class GermanStemFilter extends TokenFilter @@ -78,7 +80,7 @@ } /** - * Set a alternative/custom GermanStemmer for this filter. + * Set a alternative/custom {@link GermanStemmer} for this filter. */ public void setStemmer( GermanStemmer stemmer ) { Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (working copy) @@ -76,7 +76,7 @@ /** * Constructs a ShingleFilter with the specified single size from the - * TokenStream input + * {@link TokenStream} input * * @param input input stream * @param maxShingleSize maximum shingle size produced by the filter. Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (working copy) @@ -25,8 +25,10 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; /** - * A ShingleAnalyzerWrapper wraps a ShingleFilter around another analyzer. A - * shingle is another namefor a token based n-gram. + * A ShingleAnalyzerWrapper wraps a {@link ShingleFilter} around another {@link Analyzer}. + *

+ * A shingle is another name for a token-based n-gram. + *
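A minimal sketch (illustration only, not part of this patch) of what a token-based n-gram looks like, assuming the (Analyzer, int maxShingleSize) constructor and the filter's default of also emitting unigrams.

    Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    // "please divide this sentence" comes out as unigrams plus bigram shingles:
    // please, "please divide", divide, "divide this", this, "this sentence", sentence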

*/ public class ShingleAnalyzerWrapper extends Analyzer { Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java (working copy) @@ -129,7 +129,7 @@ /** * Retrieves information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}. * @param token - * @return + * @return {@link ShingleMatrixFilter.TokenPositioner} * @throws IOException */ public abstract TokenPositioner getTokenPositioner(Token token) throws IOException; @@ -1014,7 +1014,7 @@ * Returns a 32 bit float from the payload, or 1f it null. * * @param token - * @return + * @return 32 bit float */ public float getWeight(Token token) { if (token.getPayload() == null || token.getPayload().getData() == null) { Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/package.html =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/package.html (revision 0) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/package.html (revision 0) @@ -0,0 +1,5 @@ + + +Word n-gram filters + + Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\shingle\package.html ___________________________________________________________________ Added: svn:eol-style + native Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java (working copy) @@ -30,10 +30,12 @@ import java.util.Set; /** - * Analyzer for the Greek language. Supports an external list of stopwords (words + * {@link Analyzer} for the Greek language. + *

+ * Supports an external list of stopwords (words * that will not be indexed at all). * A default set of stopwords is used unless an alternative list is specified. - * + *

*/ public final class GreekAnalyzer extends Analyzer { @@ -145,14 +147,14 @@ }; /** - * Contains the stopwords used with the StopFilter. + * Contains the stopwords used with the {@link StopFilter}. */ private Set stopSet = new HashSet(); /** * Charset for Greek letters. * Represents encoding for 24 lowercase Greek letters. - * Predefined charsets can be taken from GreekCharSets class + * Predefined charsets can be taken from {@link GreekCharsets} class */ private char[] charset; @@ -209,10 +211,10 @@ } /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. + * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * - * @return A TokenStream built from a StandardTokenizer filtered with - * GreekLowerCaseFilter and StopFilter + * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with + * {@link GreekLowerCaseFilter} and {@link StopFilter} */ public TokenStream tokenStream(String fieldName, Reader reader) { @@ -228,11 +230,11 @@ }; /** - * Returns a (possibly reused) TokenStream which tokenizes all the text - * in the provided Reader. + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text + * in the provided {@link Reader}. * - * @return A TokenStream built from a StandardTokenizer filtered with - * GreekLowerCaseFilter and StopFilter + * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with + * {@link GreekLowerCaseFilter} and {@link StopFilter} */ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java (working copy) @@ -19,10 +19,11 @@ /** * GreekCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation * for greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253. + *

* Each encoding scheme contains lowercase (positions 0-35) and uppercase (position 36-68) characters, * including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding * the definition of a new charset as well as the required logic in the toLowerCase() method. - * + *

*/ public class GreekCharsets { Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java (working copy) @@ -24,13 +24,8 @@ import org.apache.lucene.analysis.Tokenizer; /** - * Title: ChineseAnalyzer - * Description: - * Subclass of org.apache.lucene.analysis.Analyzer - * build from a ChineseTokenizer, filtered with ChineseFilter. - * Copyright: Copyright (c) 2001 - * Company: - * @version 1.0 + * An {@link Analyzer} that tokenizes text with {@link ChineseTokenizer} and + * filters with {@link ChineseFilter} * */ @@ -40,9 +35,10 @@ } /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. + * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * - * @return A TokenStream build from a ChineseTokenizer filtered with ChineseFilter. + * @return A {@link TokenStream} built from a {@link ChineseTokenizer} + * filtered with {@link ChineseFilter}. */ public final TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new ChineseTokenizer(reader); @@ -56,11 +52,11 @@ }; /** - * Returns a (possibly reused) TokenStream which tokenizes all the text in the - * provided Reader. + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in the + * provided {@link Reader}. * - * @return A TokenStream build from a ChineseTokenizer filtered with - * ChineseFilter. + * @return A {@link TokenStream} built from a {@link ChineseTokenizer} + * filtered with {@link ChineseFilter}. */ public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java (working copy) @@ -26,18 +26,19 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * Title: ChineseFilter - * Description: Filter with a stop word table - * Rule: No digital is allowed. - * English word/token should larger than 1 character. - * One Chinese character as one Chinese word. + * A {@link TokenFilter} with a stop word table. + * * TO DO: - * 1. Add Chinese stop words, such as \ue400 - * 2. Dictionary based Chinese word extraction - * 3. Intelligent Chinese word extraction - * - * Copyright: Copyright (c) 2001 - * Company: + *
    + *
  1. Add Chinese stop words, such as \ue400 + *
  2. Dictionary based Chinese word extraction + *
  3. Intelligent Chinese word extraction + *
+ * * @version 1.0 * */ Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (working copy) @@ -27,28 +27,29 @@ /** - * Title: ChineseTokenizer - * Description: Extract tokens from the Stream using Character.getType() - * Rule: A Chinese character as a single token - * Copyright: Copyright (c) 2001 - * Company: - * - * The difference between thr ChineseTokenizer and the - * CJKTokenizer (id=23545) is that they have different + * Tokenize Chinese text as individual chinese characters. + * + *

+ * The difference between ChineseTokenizer and + * CJKTokenizer is that they have different * token parsing logic. - * - * Let me use an example. If having a Chinese text - * "C1C2C3C4" to be indexed, the tokens returned from the - * ChineseTokenizer are C1, C2, C3, C4. And the tokens - * returned from the CJKTokenizer are C1C2, C2C3, C3C4. - * - * Therefore the index the CJKTokenizer created is much - * larger. - * + *

+ *

+ * For example, if the Chinese text
+ * "C1C2C3C4" is to be indexed:
+ * The tokens returned from ChineseTokenizer are C1, C2, C3, C4.
+ * The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.
+ *
+ * Therefore the index created by CJKTokenizer is much larger.
+ *

+ *

* The problem is that when searching for C1, C1C2, C1C3, * C4C2, C1C2C3 ... the ChineseTokenizer works, but the * CJKTokenizer will not work. - * + *
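A minimal sketch (illustration only, not part of this patch) restating the contrast above with the analyzers themselves; it reuses the assertAnalyzesTo(Analyzer, String, String[]) helper added to TestChineseTokenizer earlier in this patch, with real ideographs standing in for C1..C4.

    // unigrams: ChineseAnalyzer (ChineseTokenizer + ChineseFilter)
    assertAnalyzesTo(new ChineseAnalyzer(), "\u4e00\u4e01\u4e02",
        new String[] { "\u4e00", "\u4e01", "\u4e02" });
    // overlapping bigrams: CJKAnalyzer (CJKTokenizer)
    assertAnalyzesTo(new CJKAnalyzer(), "\u4e00\u4e01\u4e02",
        new String[] { "\u4e00\u4e01", "\u4e01\u4e02" });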

* @version 1.0 * */ Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html (revision 804680) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html (working copy) @@ -3,7 +3,7 @@ -Analyzer for Chinese, which indexes unigrams (individuals chinese characters). +Analyzer for Chinese, which indexes unigrams (individual chinese characters).

Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.

Input token stream