Index: solr/core/src/test/org/apache/solr/analysis/TestCJKBigramFilterFactory.java
===================================================================
--- solr/core/src/test/org/apache/solr/analysis/TestCJKBigramFilterFactory.java	(revision 0)
+++ solr/core/src/test/org/apache/solr/analysis/TestCJKBigramFilterFactory.java	(revision 0)
@@ -0,0 +1,52 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * Simple tests to ensure the CJK bigram factory is working.
+ */
+public class TestCJKBigramFilterFactory extends BaseTokenTestCase {
+  public void testDefaults() throws Exception {
+    Reader reader = new StringReader("多くの学生が試験に落ちた。");
+    CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
+    factory.init(DEFAULT_VERSION_PARAM);
+    TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
+    assertTokenStreamContents(stream,
+      new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
+  }
+
+  public void testHanOnly() throws Exception {
+    Reader reader = new StringReader("多くの学生が試験に落ちた。");
+    CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("hiragana", "false");
+    factory.init(args);
+    TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
+    assertTokenStreamContents(stream,
+      new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
+  }
+}
Index: solr/core/src/test/org/apache/solr/analysis/TestCJKWidthFilterFactory.java
===================================================================
--- solr/core/src/test/org/apache/solr/analysis/TestCJKWidthFilterFactory.java	(revision 0)
+++ solr/core/src/test/org/apache/solr/analysis/TestCJKWidthFilterFactory.java	(revision 0)
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Simple tests to ensure the CJKWidthFilterFactory is working
+ */
+public class TestCJKWidthFilterFactory extends BaseTokenTestCase {
+  public void test() throws Exception {
+    Reader reader = new StringReader("Ｔｅｓｔ １２３４");
+    CJKWidthFilterFactory factory = new CJKWidthFilterFactory();
+    TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
+    assertTokenStreamContents(stream, new String[] { "Test", "1234" });
+  }
+}
Index: solr/core/src/test/org/apache/solr/analysis/TestCJKTokenizerFactory.java
===================================================================
--- solr/core/src/test/org/apache/solr/analysis/TestCJKTokenizerFactory.java	(revision 1224826)
+++ solr/core/src/test/org/apache/solr/analysis/TestCJKTokenizerFactory.java	(working copy)
@@ -24,7 +24,9 @@
 
 /**
  * Simple tests to ensure the CJK tokenizer factory is working.
+ * @deprecated
  */
+@Deprecated
 public class TestCJKTokenizerFactory extends BaseTokenTestCase {
   /**
    * Ensure the tokenizer actually tokenizes CJK text correctly
Index: solr/core/src/java/org/apache/solr/analysis/CJKWidthFilterFactory.java
===================================================================
--- solr/core/src/java/org/apache/solr/analysis/CJKWidthFilterFactory.java	(revision 0)
+++ solr/core/src/java/org/apache/solr/analysis/CJKWidthFilterFactory.java	(revision 0)
@@ -0,0 +1,42 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cjk.CJKWidthFilter;
+
+/**
+ * Factory for {@link CJKWidthFilter}.
+ *
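+ * Folds fullwidth ASCII variants into basic latin and halfwidth katakana
+ * variants into their standard equivalents. Sample configuration, applied
+ * before lowercasing and bigramming: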
+ * <fieldType name="text_cjk" class="solr.TextField">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.CJKWidthFilterFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.CJKBigramFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ */ + +public class CJKWidthFilterFactory extends BaseTokenFilterFactory { + + @Override + public TokenStream create(TokenStream input) { + return new CJKWidthFilter(input); + } +} Index: solr/core/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java (revision 1224826) +++ solr/core/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java (working copy) @@ -30,8 +30,9 @@ * <tokenizer class="solr.CJKTokenizerFactory"/> * </analyzer> * </fieldType> - * + * @deprecated */ +@Deprecated public class CJKTokenizerFactory extends BaseTokenizerFactory { public CJKTokenizer create(Reader in) { return new CJKTokenizer(in); Index: solr/core/src/java/org/apache/solr/analysis/CJKBigramFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/CJKBigramFilterFactory.java (revision 0) +++ solr/core/src/java/org/apache/solr/analysis/CJKBigramFilterFactory.java (revision 0) @@ -0,0 +1,64 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cjk.CJKBigramFilter; + +/** + * Factory for {@link CJKBigramFilter}. + *
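+ * The optional boolean arguments han, hiragana, katakana, and hangul
+ * (each defaulting to true) control which writing systems are bigrammed: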
+ * <fieldType name="text_cjk" class="solr.TextField">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.CJKWidthFilterFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.CJKBigramFilterFactory" 
+ *       han="true" hiragana="true" 
+ *       katakana="true" hangul="true" />
+ *   </analyzer>
+ * </fieldType>
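+ * Setting one of these arguments to "false" leaves that script as unigrams,
+ * as exercised by TestCJKBigramFilterFactory#testHanOnly.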
+ */ +public class CJKBigramFilterFactory extends BaseTokenFilterFactory { + int flags; + + @Override + public void init(Map args) { + super.init(args); + flags = 0; + if (getBoolean("han", true)) { + flags |= CJKBigramFilter.HAN; + } + if (getBoolean("hiragana", true)) { + flags |= CJKBigramFilter.HIRAGANA; + } + if (getBoolean("katakana", true)) { + flags |= CJKBigramFilter.KATAKANA; + } + if (getBoolean("hangul", true)) { + flags |= CJKBigramFilter.HANGUL; + } + } + + @Override + public TokenStream create(TokenStream input) { + return new CJKBigramFilter(input, flags); + } +} Index: modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java =================================================================== --- modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java (revision 0) +++ modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java (revision 0) @@ -0,0 +1,226 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.cjk.CJKBigramFilter; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.icu.ICUNormalizer2Filter; +import org.apache.lucene.analysis.util.CharArraySet; + +/** + * Tests ICUTokenizer's ability to work with CJKBigramFilter. + * Most tests adopted from TestCJKTokenizer + */ +public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase { + + /** + * ICUTokenizer+CJKBigramFilter + */ + private Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer source = new ICUTokenizer(reader); + TokenStream result = new CJKBigramFilter(source); + return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET)); + } + }; + + /** + * ICUTokenizer+ICUNormalizer2Filter+CJKBigramFilter. + * + * ICUNormalizer2Filter uses nfkc_casefold by default, so this is a language-independent + * superset of CJKWidthFilter's foldings. + */ + private Analyzer analyzer2 = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer source = new ICUTokenizer(reader); + // we put this before the CJKBigramFilter, because the normalization might combine + // some halfwidth katakana forms, which will affect the bigramming. 
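+      // (e.g. halfwidth ｶ followed by voice mark ﾞ folds into a single ガ
+      // under nfkc, which changes the bigram boundaries)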
+      TokenStream result = new ICUNormalizer2Filter(source);
+      result = new CJKBigramFilter(result);
+      return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
+    }
+  };
+
+  public void testJa1() throws IOException {
+    assertAnalyzesTo(analyzer, "一二三四五六七八九十",
+      new String[] { "一二", "二三", "三四", "四五", "五六", "六七", "七八", "八九", "九十" },
+      new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8 },
+      new int[] { 2, 3, 4, 5, 6, 7, 8, 9, 10 },
+      new String[] { "", "", "", "", "", "", "", "", "" },
+      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
+  }
+
+  public void testJa2() throws IOException {
+    assertAnalyzesTo(analyzer, "一 二三四 五六七八九 十",
+      new String[] { "一", "二三", "三四", "五六", "六七", "七八", "八九", "十" },
+      new int[] { 0, 2, 3, 6, 7, 8, 9, 12 },
+      new int[] { 1, 4, 5, 8, 9, 10, 11, 13 },
+      new String[] { "", "", "", "", "", "", "", "" },
+      new int[] { 1, 1, 1, 1, 1, 1, 1, 1 });
+  }
+
+  public void testC() throws IOException {
+    assertAnalyzesTo(analyzer, "abc defgh ijklmn opqrstu vwxy z",
+      new String[] { "abc", "defgh", "ijklmn", "opqrstu", "vwxy", "z" },
+      new int[] { 0, 4, 10, 17, 25, 30 },
+      new int[] { 3, 9, 16, 24, 29, 31 },
+      new String[] { "", "", "", "", "", "" },
+      new int[] { 1, 1, 1, 1, 1, 1 });
+  }
+
+  /**
+   * LUCENE-2207: wrong offset calculated by end()
+   */
+  public void testFinalOffset() throws IOException {
+    assertAnalyzesTo(analyzer, "あい",
+      new String[] { "あい" },
+      new int[] { 0 },
+      new int[] { 2 },
+      new String[] { "" },
+      new int[] { 1 });
+
+    assertAnalyzesTo(analyzer, "あい ",
+      new String[] { "あい" },
+      new int[] { 0 },
+      new int[] { 2 },
+      new String[] { "" },
+      new int[] { 1 });
+
+    assertAnalyzesTo(analyzer, "test",
+      new String[] { "test" },
+      new int[] { 0 },
+      new int[] { 4 },
+      new String[] { "" },
+      new int[] { 1 });
+
+    assertAnalyzesTo(analyzer, "test ",
+      new String[] { "test" },
+      new int[] { 0 },
+      new int[] { 4 },
+      new String[] { "" },
+      new int[] { 1 });
+
+    assertAnalyzesTo(analyzer, "あいtest",
+      new String[] { "あい", "test" },
+      new int[] { 0, 2 },
+      new int[] { 2, 6 },
+      new String[] { "", "" },
+      new int[] { 1, 1 });
+
+    assertAnalyzesTo(analyzer, "testあい ",
+      new String[] { "test", "あい" },
+      new int[] { 0, 4 },
+      new int[] { 4, 6 },
+      new String[] { "", "" },
+      new int[] { 1, 1 });
+  }
+
+  public void testMix() throws IOException {
+    assertAnalyzesTo(analyzer, "あいうえおabcかきくけこ",
+      new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
+      new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
+      new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
+      new String[] { "", "", "", "", "", "", "", "", "" },
+      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1});
+  }
+
+  public void testMix2() throws IOException {
+    assertAnalyzesTo(analyzer, "あいうえおabんcかきくけ こ",
+      new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
+      new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
+      new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
+      new String[] { "", "", "", "", "", "", "", "", "", "", "" },
+      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
+  }
+
+  /**
+   * Non-english text (outside of CJK) is treated normally, according to unicode rules
+   */
+  public void testNonIdeographic() throws IOException {
+    assertAnalyzesTo(analyzer, "一 روبرت موير",
+      new String[] { "一", "روبرت", "موير" },
+      new int[] { 0, 2, 8 },
+      new int[] { 1, 7, 12 },
+      new String[] { "", "", "" },
+      new int[] { 1, 1, 1 });
+  }
+
+  /**
+   * Same as the above, except with a nonspacing mark to show correctness.
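+   * (the combining mark stays inside its token; only the offsets shift by one)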
+ */ + public void testNonIdeographicNonLetter() throws IOException { + assertAnalyzesTo(analyzer, "一 رُوبرت موير", + new String[] { "一", "رُوبرت", "موير" }, + new int[] { 0, 2, 9 }, + new int[] { 1, 8, 13 }, + new String[] { "", "", "" }, + new int[] { 1, 1, 1 }); + } + + public void testSurrogates() throws IOException { + assertAnalyzesTo(analyzer, "𩬅艱鍟䇹愯瀛", + new String[] { "𩬅艱", "艱鍟", "鍟䇹", "䇹愯", "愯瀛" }, + new int[] { 0, 2, 3, 4, 5 }, + new int[] { 3, 4, 5, 6, 7 }, + new String[] { "", "", "", "", "" }, + new int[] { 1, 1, 1, 1, 1 }); + } + + public void testReusableTokenStream() throws IOException { + assertAnalyzesToReuse(analyzer, "あいうえおabcかきくけこ", + new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" }, + new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 }, + new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 }, + new String[] { "", "", "", "", "", "", "", "", "" }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + assertAnalyzesToReuse(analyzer, "あいうえおabんcかきくけ こ", + new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" }, + new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 }, + new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 }, + new String[] { "", "", "", "", "", "", "", "", "", "", "" }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }); + } + + public void testSingleChar() throws IOException { + assertAnalyzesTo(analyzer, "一", + new String[] { "一" }, + new int[] { 0 }, + new int[] { 1 }, + new String[] { "" }, + new int[] { 1 }); + } + + public void testTokenStream() throws IOException { + assertAnalyzesTo(analyzer, "一丁丂", + new String[] { "一丁", "丁丂"}, + new int[] { 0, 1 }, + new int[] { 2, 3 }, + new String[] { "", "" }, + new int[] { 1, 1 }); + } +} Index: modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (revision 1224826) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (working copy) @@ -21,7 +21,10 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.util.Version; +/** @deprecated Remove when CJKTokenizer is removed (5.0) */ +@Deprecated public class TestCJKTokenizer extends BaseTokenStreamTestCase { class TestToken { @@ -41,7 +44,7 @@ } public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException { - Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT); + Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30); String terms[] = new String[out_tokens.length]; int startOffsets[] = new int[out_tokens.length]; int endOffsets[] = new int[out_tokens.length]; @@ -56,7 +59,7 @@ } public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException { - Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT); + Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30); String terms[] = new String[out_tokens.length]; int startOffsets[] = new int[out_tokens.length]; int endOffsets[] = new int[out_tokens.length]; @@ -212,13 +215,13 @@ } public void testTokenStream() throws Exception { - Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT); + Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30); assertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02", new String[] { "\u4e00\u4e01", "\u4e01\u4e02"}); } public void testReusableTokenStream() throws Exception { - Analyzer analyzer = 
new CJKAnalyzer(TEST_VERSION_CURRENT); + Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30); String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053"; TestToken[] out_tokens = { @@ -273,6 +276,6 @@ /** blast some random strings through the analyzer */ public void testRandomStrings() throws Exception { - checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER); + checkRandomData(random, new CJKAnalyzer(Version.LUCENE_30), 10000*RANDOM_MULTIPLIER); } } Index: modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java (revision 0) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java (revision 0) @@ -0,0 +1,275 @@ +package org.apache.lucene.analysis.cjk; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.CharReader; +import org.apache.lucene.analysis.CharStream; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.charfilter.MappingCharFilter; +import org.apache.lucene.analysis.charfilter.NormalizeCharMap; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.util.CharArraySet; + +/** + * Most tests adopted from TestCJKTokenizer + */ +public class TestCJKAnalyzer extends BaseTokenStreamTestCase { + private Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT); + + public void testJa1() throws IOException { + assertAnalyzesTo(analyzer, "一二三四五六七八九十", + new String[] { "一二", "二三", "三四", "四五", "五六", "六七", "七八", "八九", "九十" }, + new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8 }, + new int[] { 2, 3, 4, 5, 6, 7, 8, 9, 10 }, + new String[] { "", "", "", "", "", "", "", "", "" }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 }); + } + + public void testJa2() throws IOException { + assertAnalyzesTo(analyzer, "一 二三四 五六七八九 十", + new String[] { "一", "二三", "三四", "五六", "六七", "七八", "八九", "十" }, + new int[] { 0, 2, 3, 6, 7, 8, 9, 12 }, + new int[] { 1, 4, 5, 8, 9, 10, 11, 13 }, + new String[] { "", "", "", "", "", "", "", "" }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1 }); + } + + public void testC() throws IOException { + assertAnalyzesTo(analyzer, "abc defgh ijklmn opqrstu vwxy z", + 
new String[] { "abc", "defgh", "ijklmn", "opqrstu", "vwxy", "z" }, + new int[] { 0, 4, 10, 17, 25, 30 }, + new int[] { 3, 9, 16, 24, 29, 31 }, + new String[] { "", "", "", "", "", "" }, + new int[] { 1, 1, 1, 1, 1, 1 }); + } + + /** + * LUCENE-2207: wrong offset calculated by end() + */ + public void testFinalOffset() throws IOException { + assertAnalyzesTo(analyzer, "あい", + new String[] { "あい" }, + new int[] { 0 }, + new int[] { 2 }, + new String[] { "" }, + new int[] { 1 }); + + assertAnalyzesTo(analyzer, "あい ", + new String[] { "あい" }, + new int[] { 0 }, + new int[] { 2 }, + new String[] { "" }, + new int[] { 1 }); + + assertAnalyzesTo(analyzer, "test", + new String[] { "test" }, + new int[] { 0 }, + new int[] { 4 }, + new String[] { "" }, + new int[] { 1 }); + + assertAnalyzesTo(analyzer, "test ", + new String[] { "test" }, + new int[] { 0 }, + new int[] { 4 }, + new String[] { "" }, + new int[] { 1 }); + + assertAnalyzesTo(analyzer, "あいtest", + new String[] { "あい", "test" }, + new int[] { 0, 2 }, + new int[] { 2, 6 }, + new String[] { "", "" }, + new int[] { 1, 1 }); + + assertAnalyzesTo(analyzer, "testあい ", + new String[] { "test", "あい" }, + new int[] { 0, 4 }, + new int[] { 4, 6 }, + new String[] { "", "" }, + new int[] { 1, 1 }); + } + + public void testMix() throws IOException { + assertAnalyzesTo(analyzer, "あいうえおabcかきくけこ", + new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" }, + new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 }, + new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 }, + new String[] { "", "", "", "", "", "", "", "", "" }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1}); + } + + public void testMix2() throws IOException { + assertAnalyzesTo(analyzer, "あいうえおabんcかきくけ こ", + new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" }, + new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 }, + new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 }, + new String[] { "", "", "", "", "", "", "", "", "", "", "" }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }); + } + + /** + * Non-english text (outside of CJK) is treated normally, according to unicode rules + */ + public void testNonIdeographic() throws IOException { + assertAnalyzesTo(analyzer, "一 روبرت موير", + new String[] { "一", "روبرت", "موير" }, + new int[] { 0, 2, 8 }, + new int[] { 1, 7, 12 }, + new String[] { "", "", "" }, + new int[] { 1, 1, 1 }); + } + + /** + * Same as the above, except with a nonspacing mark to show correctness. 
+ */ + public void testNonIdeographicNonLetter() throws IOException { + assertAnalyzesTo(analyzer, "一 رُوبرت موير", + new String[] { "一", "رُوبرت", "موير" }, + new int[] { 0, 2, 9 }, + new int[] { 1, 8, 13 }, + new String[] { "", "", "" }, + new int[] { 1, 1, 1 }); + } + + public void testSurrogates() throws IOException { + assertAnalyzesTo(analyzer, "𩬅艱鍟䇹愯瀛", + new String[] { "𩬅艱", "艱鍟", "鍟䇹", "䇹愯", "愯瀛" }, + new int[] { 0, 2, 3, 4, 5 }, + new int[] { 3, 4, 5, 6, 7 }, + new String[] { "", "", "", "", "" }, + new int[] { 1, 1, 1, 1, 1 }); + } + + public void testReusableTokenStream() throws IOException { + assertAnalyzesToReuse(analyzer, "あいうえおabcかきくけこ", + new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" }, + new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 }, + new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 }, + new String[] { "", "", "", "", "", "", "", "", "" }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + assertAnalyzesToReuse(analyzer, "あいうえおabんcかきくけ こ", + new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" }, + new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 }, + new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 }, + new String[] { "", "", "", "", "", "", "", "", "", "", "" }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }); + } + + public void testSingleChar() throws IOException { + assertAnalyzesTo(analyzer, "一", + new String[] { "一" }, + new int[] { 0 }, + new int[] { 1 }, + new String[] { "" }, + new int[] { 1 }); + } + + public void testTokenStream() throws IOException { + assertAnalyzesTo(analyzer, "一丁丂", + new String[] { "一丁", "丁丂"}, + new int[] { 0, 1 }, + new int[] { 2, 3 }, + new String[] { "", "" }, + new int[] { 1, 1 }); + } + + /** test that offsets are correct when mappingcharfilter is previously applied */ + public void testChangedOffsets() throws IOException { + final NormalizeCharMap norm = new NormalizeCharMap(); + norm.add("a", "一二"); + norm.add("b", "二三"); + Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(tokenizer, new CJKBigramFilter(tokenizer)); + } + + @Override + protected Reader initReader(Reader reader) { + return new MappingCharFilter(norm, CharReader.get(reader)); + } + }; + + assertAnalyzesTo(analyzer, "ab", + new String[] { "一二", "二二", "二三" }, + new int[] { 0, 0, 1 }, + new int[] { 1, 1, 2 }); + + // note: offsets are strange since this is how the charfilter maps them... 
+ // before bigramming, the 4 tokens look like: + // { 0, 0, 1, 1 }, + // { 0, 1, 1, 2 } + } + + private static class FakeStandardTokenizer extends TokenFilter { + final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + + public FakeStandardTokenizer(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]); + return true; + } else { + return false; + } + } + } + + public void testSingleChar2() throws Exception { + Analyzer analyzer = new Analyzer() { + + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + TokenFilter filter = new FakeStandardTokenizer(tokenizer); + filter = new StopFilter(TEST_VERSION_CURRENT, filter, CharArraySet.EMPTY_SET); + filter = new CJKBigramFilter(filter); + return new TokenStreamComponents(tokenizer, filter); + } + }; + + assertAnalyzesTo(analyzer, "一", + new String[] { "一" }, + new int[] { 0 }, + new int[] { 1 }, + new String[] { "" }, + new int[] { 1 }); + } + + /** blast some random strings through the analyzer */ + public void testRandomStrings() throws Exception { + checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER); + } +} Index: modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilter.java (revision 0) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilter.java (revision 0) @@ -0,0 +1,67 @@ +package org.apache.lucene.analysis.cjk; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Tests for {@link CJKWidthFilter}
+ */
+public class TestCJKWidthFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+      return new TokenStreamComponents(source, new CJKWidthFilter(source));
+    }
+  };
+
+  /**
+   * Full-width ASCII forms normalized to half-width (basic latin)
+   */
+  public void testFullWidthASCII() throws IOException {
+    assertAnalyzesTo(analyzer, "Ｔｅｓｔ １２３４",
+      new String[] { "Test", "1234" },
+      new int[] { 0, 5 },
+      new int[] { 4, 9 });
+  }
+
+  /**
+   * Half-width katakana forms normalized to standard katakana.
+   * A bit trickier in some cases, since half-width forms are decomposed
+   * and voice marks need to be recombined with a preceding base form.
+   */
+  public void testHalfWidthKana() throws IOException {
+    assertAnalyzesTo(analyzer, "ｶﾀｶﾅ",
+      new String[] { "カタカナ" });
+    assertAnalyzesTo(analyzer, "ｳﾞｨｯﾂ",
+      new String[] { "ヴィッツ" });
+    assertAnalyzesTo(analyzer, "ﾊﾟﾅｿﾆｯｸ",
+      new String[] { "パナソニック" });
+  }
+
+  public void testRandomData() throws IOException {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
+}
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java	(revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java	(revision 0)
@@ -0,0 +1,300 @@
+package org.apache.lucene.analysis.cjk;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.ArrayUtil;
+
+/**
+ * Forms bigrams of CJK terms that are generated from StandardTokenizer
+ * or ICUTokenizer.
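+ * <p>
+ * Because the filter operates on codepoints, unicode supplementary characters
+ * are bigrammed correctly. The constructor flags select which writing systems
+ * are bigrammed, e.g. to bigram only Han ideographs:
+ * <pre class="prettyprint">
+ *   TokenStream bigrams = new CJKBigramFilter(tokens, CJKBigramFilter.HAN);
+ * </pre>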
+ */ +public final class CJKBigramFilter extends TokenFilter { + // configuration + /** bigram flag for Han Ideographs */ + public static final int HAN = 1; + /** bigram flag for Hiragana */ + public static final int HIRAGANA = 2; + /** bigram flag for Katakana */ + public static final int KATAKANA = 4; + /** bigram flag for Hangul */ + public static final int HANGUL = 8; + + /** when we emit a bigram, its then marked as this type */ + public static final String DOUBLE_TYPE = ""; + /** when we emit a unigram, its then marked as this type */ + public static final String SINGLE_TYPE = ""; + + // the types from standardtokenizer + private static final String HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]; + private static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA]; + private static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA]; + private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL]; + + // sentinel value for ignoring a script + private static final Object NO = new Object(); + + // these are set to either their type or NO if we want to pass them thru + private final Object doHan; + private final Object doHiragana; + private final Object doKatakana; + private final Object doHangul; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + // buffers containing codepoint and offsets in parallel + int buffer[] = new int[8]; + int startOffset[] = new int[8]; + int endOffset[] = new int[8]; + // length of valid buffer + int bufferLen; + // current buffer index + int index; + + // the last end offset, to determine if we should bigram across tokens + int lastEndOffset; + + private boolean exhausted; + + /** + * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int) + * CJKBigramFilter(HAN | HIRAGANA | KATAKANA | HANGUL)} + */ + public CJKBigramFilter(TokenStream in) { + this(in, HAN | HIRAGANA | KATAKANA | HANGUL); + } + + /** + * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed. + * @param flags OR'ed set from {@link HAN}, {@link HIRAGANA}, {@link KATAKANA}, {@link HANGUL} + */ + public CJKBigramFilter(TokenStream in, int flags) { + super(in); + doHan = (flags & HAN) == 0 ? NO : HAN_TYPE; + doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE; + doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE; + doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE; + } + + /* + * much of this complexity revolves around handling the special case of a + * "lone cjk character" where cjktokenizer would output a unigram. this + * is also the only time we ever have to captureState. + */ + @Override + public boolean incrementToken() throws IOException { + while (true) { + if (hasBufferedBigram()) { + + // case 1: we have multiple remaining codepoints buffered, + // so we can emit a bigram here. + + flushBigram(); + return true; + } else if (doNext()) { + + // case 2: look at the token type. should we form any n-grams? + + String type = typeAtt.type(); + if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul) { + + // acceptable CJK type: we form n-grams from these. + // as long as the offsets are aligned, we just add these to our current buffer. + // otherwise, we clear the buffer and start over. 
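+        // (an offset gap or overlap means the tokens were not adjacent in the
+        // original text, so they must not form a bigram together)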
+ + if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue + if (hasBufferedUnigram()) { + + // we have a buffered unigram, and we peeked ahead to see if we could form + // a bigram, but we can't, because the offsets are unaligned. capture the state + // of this peeked data to be revisited next time thru the loop, and dump our unigram. + + loneState = captureState(); + flushUnigram(); + return true; + } + index = 0; + bufferLen = 0; + } + refill(); + } else { + + // not a CJK type: we just return these as-is. + + if (hasBufferedUnigram()) { + + // we have a buffered unigram, and we peeked ahead to see if we could form + // a bigram, but we can't, because its not a CJK type. capture the state + // of this peeked data to be revisited next time thru the loop, and dump our unigram. + + loneState = captureState(); + flushUnigram(); + return true; + } + return true; + } + } else { + + // case 3: we have only zero or 1 codepoints buffered, + // so not enough to form a bigram. But, we also have no + // more input. So if we have a buffered codepoint, emit + // a unigram, otherwise, its end of stream. + + if (hasBufferedUnigram()) { + flushUnigram(); // flush our remaining unigram + return true; + } + return false; + } + } + } + + private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams + + /** + * looks at next input token, returning false is none is available + */ + private boolean doNext() throws IOException { + if (loneState != null) { + restoreState(loneState); + loneState = null; + return true; + } else { + if (exhausted) { + return false; + } else if (input.incrementToken()) { + return true; + } else { + exhausted = true; + return false; + } + } + } + + /** + * refills buffers with new data from the current token. + */ + private void refill() throws IOException { + // compact buffers to keep them smallish if they become large + // just a safety check, but technically we only need the last codepoint + if (bufferLen > 64) { + int last = bufferLen - 1; + buffer[0] = buffer[last]; + startOffset[0] = startOffset[last]; + endOffset[0] = endOffset[last]; + bufferLen = 1; + index -= last; + } + + char termBuffer[] = termAtt.buffer(); + int len = termAtt.length(); + int start = offsetAtt.startOffset(); + int end = offsetAtt.endOffset(); + + int newSize = bufferLen + len; + buffer = ArrayUtil.grow(buffer, newSize); + startOffset = ArrayUtil.grow(startOffset, newSize); + endOffset = ArrayUtil.grow(endOffset, newSize); + lastEndOffset = end; + + if (end - start != len) { + // crazy offsets (modified by synonym or charfilter): just preserve + for (int i = 0, cp = 0; i < len; i += Character.charCount(cp)) { + cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len); + startOffset[bufferLen] = start; + endOffset[bufferLen] = end; + bufferLen++; + } + } else { + // normal offsets + for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen) { + cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len); + cpLen = Character.charCount(cp); + startOffset[bufferLen] = start; + start = endOffset[bufferLen] = start + cpLen; + bufferLen++; + } + } + } + + /** + * Flushes a bigram token to output from our buffer + * This is the normal case, e.g. 
ABC -> AB BC + */ + private void flushBigram() { + clearAttributes(); + char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries) + int len1 = Character.toChars(buffer[index], termBuffer, 0); + int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1); + termAtt.setLength(len2); + offsetAtt.setOffset(startOffset[index], endOffset[index+1]); + typeAtt.setType(DOUBLE_TYPE); + index++; + } + + /** + * Flushes a unigram token to output from our buffer. + * This happens when we encounter isolated CJK characters, either the whole + * CJK string is a single character, or we encounter a CJK character surrounded + * by space, punctuation, english, etc, but not beside any other CJK. + */ + private void flushUnigram() { + clearAttributes(); + char termBuffer[] = termAtt.resizeBuffer(2); // maximum unigram length (2 surrogates) + int len = Character.toChars(buffer[index], termBuffer, 0); + termAtt.setLength(len); + offsetAtt.setOffset(startOffset[index], endOffset[index]); + typeAtt.setType(SINGLE_TYPE); + index++; + } + + /** + * True if we have multiple codepoints sitting in our buffer + */ + private boolean hasBufferedBigram() { + return bufferLen - index > 1; + } + + /** + * True if we have a single codepoint sitting in our buffer, where its future + * (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen + * inputs. + */ + private boolean hasBufferedUnigram() { + return bufferLen == 1 && index == 0; + } + + @Override + public void reset() throws IOException { + super.reset(); + bufferLen = 0; + index = 0; + lastEndOffset = 0; + loneState = null; + exhausted = false; + } +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (revision 1224826) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (working copy) @@ -44,7 +44,9 @@ * please search google * + * @deprecated Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead. 
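+ *             {@link CJKAnalyzer} assembles this replacement chain when its
+ *             matchVersion is 3.6 or later.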
*/ +@Deprecated public final class CJKTokenizer extends Tokenizer { //~ Static fields/initializers --------------------------------------------- /** Word token type */ Index: modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java (revision 1224826) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java (working copy) @@ -22,16 +22,19 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.util.Version; - /** - * An {@link Analyzer} that tokenizes text with {@link CJKTokenizer} and - * filters with {@link StopFilter} - * + * An {@link Analyzer} that tokenizes text with {@link StandardTokenizer}, + * normalizes content with {@link CJKWidthFilter}, folds case with + * {@link LowerCaseFilter}, forms bigrams of CJK with {@link CJKBigramFilter}, + * and filters stopwords with {@link StopFilter} */ public final class CJKAnalyzer extends StopwordAnalyzerBase { /** @@ -86,7 +89,16 @@ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - final Tokenizer source = new CJKTokenizer(reader); - return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords)); + if (matchVersion.onOrAfter(Version.LUCENE_36)) { + final Tokenizer source = new StandardTokenizer(matchVersion, reader); + // run the widthfilter first before bigramming, it sometimes combines characters. + TokenStream result = new CJKWidthFilter(source); + result = new LowerCaseFilter(matchVersion, result); + result = new CJKBigramFilter(result); + return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords)); + } else { + final Tokenizer source = new CJKTokenizer(reader); + return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords)); + } } } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilter.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilter.java (revision 0) @@ -0,0 +1,95 @@ +package org.apache.lucene.analysis.cjk; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.StemmerUtil; + +/** + * A {@link TokenFilter} that normalizes CJK width differences: + *
+ * <ul>
+ *   <li>Folds fullwidth ASCII variants into the equivalent basic latin
+ *   <li>Folds halfwidth Katakana variants into the equivalent kana
+ * </ul>
+ * <p>
+ * NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD + * Unicode normalization. See the normalization support in the ICU package + * for full normalization. + */ +public final class CJKWidthFilter extends TokenFilter { + private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + /* halfwidth kana mappings: 0xFF65-0xFF9D + * + * note: 0xFF9C and 0xFF9D are only mapped to 0x3099 and 0x309A + * as a fallback when they cannot properly combine with a preceding + * character into a composed form. + */ + private static final char KANA_NORM[] = new char[] { + 0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5, + 0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab, + 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd, + 0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd, + 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0, + 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec, + 0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A + }; + + public CJKWidthFilter(TokenStream input) { + super(input); + } + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char text[] = termAtt.buffer(); + int length = termAtt.length(); + for (int i = 0; i < length; i++) { + final char ch = text[i]; + if (ch >= 0xFF01 && ch <= 0xFF5E) { + // Fullwidth ASCII variants + text[i] -= 0xFEE0; + } else if (ch >= 0xFF65 && ch <= 0xFF9F) { + // Halfwidth Katakana variants + if ((ch == 0xFF9E || ch == 0xFF9F) && i > 0 && combine(text, i, length, ch)) { + length = StemmerUtil.delete(text, i--, length); + } else { + text[i] = KANA_NORM[ch - 0xFF65]; + } + } + } + termAtt.setLength(length); + return true; + } else { + return false; + } + } + + /* kana combining diffs: 0x30A6-0x30FD */ + private static final byte KANA_COMBINE_VOICED[] = new byte[] { + 78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, + 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 + }; + + private static final byte KANA_COMBINE_HALF_VOICED[] = new byte[] { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, + 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + + /** returns true if we successfully combined the voice mark */ + private static boolean combine(char text[], int pos, int length, char ch) { + final char prev = text[pos-1]; + if (prev >= 0x30A6 && prev <= 0x30FD) { + text[pos-1] += (ch == 0xFF9F) + ? KANA_COMBINE_HALF_VOICED[prev - 0x30A6] + : KANA_COMBINE_VOICED[prev - 0x30A6]; + return text[pos-1] != prev; + } + return false; + } +} Index: lucene/contrib/CHANGES.txt =================================================================== --- lucene/contrib/CHANGES.txt (revision 1224826) +++ lucene/contrib/CHANGES.txt (working copy) @@ -98,6 +98,12 @@ * LUCENE-3596: DirectoryTaxonomyWriter extensions can override createIndexWriterConfig() and modify how its internal index writer is opened. (Doron Cohen) + +* LUCENE-2906: Added CJKBigramFilter that forms bigrams from StandardTokenizer or + ICUTokenizer CJK tokens, and CJKWidthFilter that normalizes halfwidth/fullwidth. 
+  This filter supports Unicode supplementary characters, and you can toggle whether
+  bigrams are formed for each of Han/Hiragana/Katakana/Hangul independently. Deprecates
+  CJKTokenizer. (Tom Burton-West, Robert Muir)
 
API Changes