Index: modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateJFlexSupplementaryMacros.java
===================================================================
--- modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateJFlexSupplementaryMacros.java	(revision 1067174)
+++ modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateJFlexSupplementaryMacros.java	(working copy)
@@ -70,6 +70,7 @@
     outputMacro("ComplexContextSupp", "[:LineBreak=Complex_Context:]");
     outputMacro("HanSupp", "[:Script=Han:]");
     outputMacro("HiraganaSupp", "[:Script=Hiragana:]");
+    outputMacro("HangulSupp", "[:Script=Hangul:]");
   }
 
   static void outputHeader() {
Index: modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
===================================================================
--- modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java	(revision 1067174)
+++ modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java	(working copy)
@@ -20,6 +20,8 @@
 import java.io.IOException;
 import java.io.InputStream;
 
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
 import com.ibm.icu.lang.UScript;
 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.RuleBasedBreakIterator;
@@ -44,13 +46,17 @@
  */
 public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   /** Token type for words containing ideographic characters */
-  public static final String WORD_IDEO = "<IDEOGRAPHIC>";
-  /** Token type for words containing Japanese kana */
-  public static final String WORD_KANA = "<KANA>";
+  public static final String WORD_IDEO = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
+  /** Token type for words containing Japanese hiragana */
+  public static final String WORD_HIRAGANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
+  /** Token type for words containing Japanese katakana */
+  public static final String WORD_KATAKANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+  /** Token type for words containing Korean hangul */
+  public static final String WORD_HANGUL = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
   /** Token type for words that contain letters */
-  public static final String WORD_LETTER = "<ALPHANUM>";
+  public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
   /** Token type for words that appear to be numbers */
-  public static final String WORD_NUMBER = "<NUM>";
+  public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
 
   /*
    * the default breakiterators in use. these can be expensive to
@@ -87,9 +93,9 @@
       case RuleBasedBreakIterator.WORD_IDEO:
         return WORD_IDEO;
       case RuleBasedBreakIterator.WORD_KANA:
-        return WORD_KANA;
+        return script == UScript.HIRAGANA ? WORD_HIRAGANA : WORD_KATAKANA;
       case RuleBasedBreakIterator.WORD_LETTER:
-        return WORD_LETTER;
+        return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
       case RuleBasedBreakIterator.WORD_NUMBER:
         return WORD_NUMBER;
       default: /* some other custom code */
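For reviewers, a minimal standalone sketch of what the script-sensitive switch above does. The class and the main() harness are hypothetical (not part of the patch); the ICU4J constants are real, and the type strings match StandardTokenizer.TOKEN_TYPES after this patch:

import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.RuleBasedBreakIterator;

public class TypeMappingSketch {
  // Mirrors the switch above: the same UAX#29 rule status resolves to
  // different token types depending on the script ICUTokenizer detected.
  static String resolve(int ruleStatus, int script) {
    if (ruleStatus == RuleBasedBreakIterator.WORD_KANA) {
      return script == UScript.HIRAGANA ? "<HIRAGANA>" : "<KATAKANA>";
    } else if (ruleStatus == RuleBasedBreakIterator.WORD_LETTER) {
      return script == UScript.HANGUL ? "<HANGUL>" : "<ALPHANUM>";
    }
    return "<IDEOGRAPHIC>"; // simplification: collapses the remaining WORD_IDEO case
  }

  public static void main(String[] args) {
    System.out.println(resolve(RuleBasedBreakIterator.WORD_LETTER, UScript.HANGUL)); // <HANGUL>
    System.out.println(resolve(RuleBasedBreakIterator.WORD_KANA, UScript.HIRAGANA)); // <HIRAGANA>
    System.out.println(resolve(RuleBasedBreakIterator.WORD_KANA, UScript.KATAKANA)); // <KATAKANA>
  }
}

Splitting WORD_KANA by script is what lets hiragana and katakana carry distinct types downstream, so a later filter can treat the two syllabaries differently.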
Index: modules/analysis/icu/src/java/overview.html
===================================================================
--- modules/analysis/icu/src/java/overview.html	(revision 1067174)
+++ modules/analysis/icu/src/java/overview.html	(working copy)
@@ -71,6 +71,7 @@
    * This tokenizer will work well in general for most languages.
    */
   Tokenizer tokenizer = new ICUTokenizer(reader);
+  TokenFilter filter = new StandardFilter(MyAppVersion, tokenizer);
 </code></pre>
 <h2>Collation</h2>
 <p>
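For context, this is roughly how the chain sketched in overview.html would be consumed. A hedged, illustrative harness only: the class name and input string are mine, and Version.LUCENE_31 stands in for the MyAppVersion placeholder used in the docs above:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class ICUChainSketch {
  public static void main(String[] args) throws Exception {
    Tokenizer tokenizer = new ICUTokenizer(new StringReader("仮名遣い カタカナ 훈민정음"));
    TokenFilter filter = new StandardFilter(Version.LUCENE_31, tokenizer);
    CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
    TypeAttribute type = filter.addAttribute(TypeAttribute.class);
    filter.reset();
    while (filter.incrementToken()) {
      // with this patch, kana and hangul tokens carry distinct types
      System.out.println(term.toString() + "\t" + type.type());
    }
    filter.end();
    filter.close();
  }
}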
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java	(revision 1067174)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java	(working copy)
@@ -207,4 +207,16 @@
         new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
         new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
   }
+
+  public void testKorean() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+        new String[] { "훈민정음" },
+        new String[] { "<HANGUL>" });
+  }
+
+  public void testJapanese() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+        new String[] { "仮", "名", "遣", "い", "カタカナ" },
+        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+  }
 }
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java	(revision 1067174)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java	(working copy)
@@ -21,7 +21,10 @@
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.Version;
 
+/** @deprecated Remove when CJKTokenizer is removed (5.0) */
+@Deprecated
 public class TestCJKTokenizer extends BaseTokenStreamTestCase {
 
   class TestToken {
@@ -41,7 +44,7 @@
   }
 
   public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException {
-    Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
     String terms[] = new String[out_tokens.length];
     int startOffsets[] = new int[out_tokens.length];
     int endOffsets[] = new int[out_tokens.length];
@@ -56,7 +59,7 @@
   }
 
   public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
-    Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
     String terms[] = new String[out_tokens.length];
     int startOffsets[] = new int[out_tokens.length];
     int endOffsets[] = new int[out_tokens.length];
@@ -212,13 +215,13 @@
   }
 
   public void testTokenStream() throws Exception {
-    Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
     assertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02",
         new String[] { "\u4e00\u4e01", "\u4e01\u4e02"});
   }
 
   public void testReusableTokenStream() throws Exception {
-    Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
     String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
 
     TestToken[] out_tokens = {
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java	(revision 0)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java	(revision 0)
@@ -0,0 +1,187 @@
+package org.apache.lucene.analysis.cjk;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+/**
+ * Most tests adopted from TestCJKTokenizer
+ * (modified to use assertTokenStreamContents, etc)
+ */
+public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
+
+  public void testJa1() throws IOException {
+    assertAnalyzesTo(analyzer, "一二三四五六七八九十",
+      new String[] { "一二", "二三", "三四", "四五", "五六", "六七", "七八", "八九", "九十" },
+      new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8 },
+      new int[] { 2, 3, 4, 5, 6, 7, 8, 9, 10 },
+      new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
+      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
+  }
+
+  public void testJa2() throws IOException {
+    assertAnalyzesTo(analyzer, "一 二三四 五六七八九 十",
+      new String[] { "一", "二三", "三四", "五六", "六七", "七八", "八九", "十" },
+      new int[] { 0, 2, 3, 6, 7, 8, 9, 12 },
+      new int[] { 1, 4, 5, 8, 9, 10, 11, 13 },
+      new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
+      new int[] { 1, 1, 1, 1, 1, 1, 1, 1 });
+  }
+
+  public void testC() throws IOException {
+    assertAnalyzesTo(analyzer, "abc defgh ijklmn opqrstu vwxy z",
+      new String[] { "abc", "defgh", "ijklmn", "opqrstu", "vwxy", "z" },
+      new int[] { 0, 4, 10, 17, 25, 30 },
+      new int[] { 3, 9, 16, 24, 29, 31 },
+      new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" },
+      new int[] { 1, 1, 1, 1, 1, 1 });
+  }
+
+  /**
+   * LUCENE-2207: wrong offset calculated by end()
+   */
+  public void testFinalOffset() throws IOException {
+    assertAnalyzesTo(analyzer, "あい",
+      new String[] { "あい" },
+      new int[] { 0 },
+      new int[] { 2 },
+      new String[] { "<DOUBLE>" },
+      new int[] { 1 });
+
+    assertAnalyzesTo(analyzer, "あい ",
+      new String[] { "あい" },
+      new int[] { 0 },
+      new int[] { 2 },
+      new String[] { "<DOUBLE>" },
+      new int[] { 1 });
+
+    assertAnalyzesTo(analyzer, "test",
+      new String[] { "test" },
+      new int[] { 0 },
+      new int[] { 4 },
+      new String[] { "<ALPHANUM>" },
+      new int[] { 1 });
+
+    assertAnalyzesTo(analyzer, "test ",
+      new String[] { "test" },
+      new int[] { 0 },
+      new int[] { 4 },
+      new String[] { "<ALPHANUM>" },
+      new int[] { 1 });
+
+    assertAnalyzesTo(analyzer, "あいtest",
+      new String[] { "あい", "test" },
+      new int[] { 0, 2 },
+      new int[] { 2, 6 },
+      new String[] { "<DOUBLE>", "<ALPHANUM>" },
+      new int[] { 1, 1 });
+
+    assertAnalyzesTo(analyzer, "testあい ",
+      new String[] { "test", "あい" },
+      new int[] { 0, 4 },
+      new int[] { 4, 6 },
+      new String[] { "<ALPHANUM>", "<DOUBLE>" },
+      new int[] { 1, 1 });
+  }
+
+  /**
+   * Full-width text is normalized to half-width
+   */
+  public void testFullWidth() throws IOException {
+    assertAnalyzesTo(analyzer, "Ｔｅｓｔ １２３４",
+      new String[] { "test", "1234" },
+      new int[] { 0, 5 },
+      new int[] { 4, 9 },
+      new String[] { "<ALPHANUM>", "<NUM>" },
+      new int[] { 1, 1 });
+  }
+
+  public void testMix() throws IOException {
+    assertAnalyzesTo(analyzer, "あいうえおabcかきくけこ",
+      new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
+      new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
+      new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
+      new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
+      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
+  }
+
+  public void testMix2() throws IOException {
+    assertAnalyzesTo(analyzer, "あいうえおabんcかきくけ こ",
+      new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
+      new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
+      new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
+      new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
+      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
+  }
+
+  /**
+   * Non-english text (outside of CJK) is treated normally, according to unicode rules
+   */
+  public void testNonIdeographic() throws IOException {
+    assertAnalyzesTo(analyzer, "一 روبرت موير",
+      new String[] { "一", "روبرت", "موير" });
+    // nocommit: check offsets/positions/types
+  }
+
+  /**
+   * Same as the above, except with a nonspacing mark to show correctness.
+   */
+  public void testNonIdeographicNonLetter() throws IOException {
+    assertAnalyzesTo(analyzer, "一 رُوبرت موير",
+      new String[] { "一", "رُوبرت", "موير" });
+    // nocommit: check offsets/positions/types
+  }
+
+  public void testReusableTokenStream() throws IOException {
+    assertAnalyzesToReuse(analyzer, "あいうえおabcかきくけこ",
+      new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
+      new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
+      new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
+      new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
+      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
+
+    assertAnalyzesToReuse(analyzer, "あいうえおabんcかきくけ こ",
+      new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
+      new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
+      new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
+      new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
+      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
+  }
+
+  public void testSingleChar() throws IOException {
+    assertAnalyzesTo(analyzer, "一",
+      new String[] { "一" },
+      new int[] { 0 },
+      new int[] { 1 },
+      new String[] { "<DOUBLE>" },
+      new int[] { 1 });
+  }
+
+  public void testTokenStream() throws IOException {
+    assertAnalyzesTo(analyzer, "一丁丂",
+      new String[] { "一丁", "丁丂" },
+      new int[] { 0, 1 },
+      new int[] { 2, 3 },
+      new String[] { "<DOUBLE>", "<DOUBLE>" },
+      new int[] { 1, 1 });
+  }
+}
Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\cjk\TestCJKAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
   + native
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex	(revision 1067174)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex	(working copy)
@@ -58,7 +58,10 @@
 ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
 Han = ([\p{Script:Han}] | {HanSupp})
 Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
+Hangul = ([\p{Script:Hangul}] | {HangulSupp})
+// TODO: Convert this hard-coded Hangul range to a property intersection (something like [\p{Hangul}&&\p{ALetter}]) once JFlex supports it
+HangulEx = [\u1100-\u11FF\u3131-\u318E\uA960-\uA97C\uAC00-\uD7A3\uD7B0-\uD7C6\uD7CB-\uD7FB\uFFA0-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC] ({Format} | {Extend})*
 
 // UAX#29 WB4.  X (Extend | Format)* --> X
 //
 ALetterEx = {ALetter} ({Format} | {Extend})*
@@ -90,6 +93,10 @@
   public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
 
   public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
+
+  public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
+
+  public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
 
   public final int yychar()
   {
@@ -123,6 +130,12 @@
 {ExtendNumLetEx}*
                                                { return NUMERIC_TYPE; }
 
+// subset of the below for typing purposes only!
+{HangulEx}+
+                                               { return HANGUL_TYPE; }
+
+{KatakanaEx}+
+                                               { return KATAKANA_TYPE; }
 
 // UAX#29 WB5.   ALetter × ALetter
 //        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
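The HangulEx ranges above are hard-coded because JFlex (as of this patch) cannot intersect properties; see the TODO. As a sanity check of what those explicit ranges cover, here is a plain-JDK sketch (the class name is mine, Character.UnicodeBlock is standard Java):

public class HangulRangeCheck {
  public static void main(String[] args) {
    // U+AC00 '가' is the first precomposed Hangul syllable; U+1100 is a leading jamo.
    char syllable = '\uAC00';
    char jamo = '\u1100';
    System.out.println(Character.UnicodeBlock.of(syllable)); // HANGUL_SYLLABLES
    System.out.println(Character.UnicodeBlock.of(jamo));     // HANGUL_JAMO
    // Both fall inside the explicit [\u1100-\u11FF ... \uAC00-\uD7A3 ...] ranges
    // of the HangulEx macro, which is what the grammar tags as HANGUL_TYPE.
  }
}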
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilter.java	(revision 1067174)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilter.java	(working copy)
@@ -22,7 +22,9 @@
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Version;
 
 /**
@@ -46,7 +48,7 @@
   @Override
   public final boolean incrementToken() throws IOException {
     if (matchVersion.onOrAfter(Version.LUCENE_31))
-      return input.incrementToken(); // TODO: add some niceties for the new grammar
+      return incrementTokenStandard();
     else
       return incrementTokenClassic();
   }
@@ -78,4 +80,97 @@
 
     return true;
   }
+
+  int buffer[] = new int[8];
+  int bufferLen;
+  int startOffset;
+  int endOffset;
+  int index;
+
+  private static final String HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
+  private static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
+  private static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+  private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
+  public static final String NGRAM_TYPE = "<DOUBLE>";
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+  /**
+   * this looks totally scary, but a lot of the logic revolves around handling
+   * the special case of a "lone cjk character" where cjktokenizer would output a unigram.
+   * this is also the only time we ever have to captureState.
+   */
+  public final boolean incrementTokenStandard() throws IOException {
+    while (true) {
+      if (bufferLen - index > 1) {
+        clearAttributes();
+        final char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
+        int len1 = Character.toChars(buffer[index], termBuffer, 0);
+        int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1);
+        termAtt.setLength(len2);
+        offsetAtt.setOffset(startOffset, startOffset + len2);
+        typeAtt.setType(NGRAM_TYPE);
+        startOffset += len1;
+        index++;
+        return true;
+      } else if (loneState != null || input.incrementToken()) {
+        if (loneState != null) {
+          restoreState(loneState);
+          loneState = null;
+        }
+        final String type = typeAtt.type();
+        if (type == HAN_TYPE || type == HIRAGANA_TYPE || type == KATAKANA_TYPE || type == HANGUL_TYPE) {
+          final char termBuffer[] = termAtt.buffer();
+          final int len = termAtt.length();
+          final int start = offsetAtt.startOffset();
+          if (start != endOffset) { // unaligned, clear queue
+            if (bufferLen == 1 && index == 0) {
+              loneState = captureState();
+              flushUnigram(); // flush our remaining unigram
+              return true;
+            }
+            index = 0;
+            bufferLen = 0;
+            startOffset = start;
+            endOffset = start;
+          }
+          buffer = ArrayUtil.grow(buffer, bufferLen + len);
+          for (int i = 0, cp = 0; i < len; i += Character.charCount(cp)) {
+            buffer[bufferLen++] = cp = Character.codePointAt(termBuffer, i, len);
+          }
+          endOffset += len;
+        } else {
+          if (bufferLen == 1 && index == 0) {
+            loneState = captureState();
+            flushUnigram(); // flush our remaining unigram
+            return true;
+          }
+          return true;
+        }
+      } else {
+        if (bufferLen == 1 && index == 0) {
+          flushUnigram(); // flush our remaining unigram
+          return true;
+        }
+        return false;
+      }
+    }
+  }
+
+  private State loneState; // rarely used: only for "lone cjk characters"
+
+  public void flushUnigram() {
+    clearAttributes();
+    final char termBuffer[] = termAtt.resizeBuffer(2);
+    int len = Character.toChars(buffer[index], termBuffer, 0);
+    termAtt.setLength(len);
+    offsetAtt.setOffset(startOffset, startOffset + len);
+    typeAtt.setType(NGRAM_TYPE);
+    startOffset += len;
+    index++;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    bufferLen = 0;
+  }
 }
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java	(revision 1067174)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java	(working copy)
@@ -78,6 +78,8 @@
   public static final int SOUTHEAST_ASIAN = 9;
   public static final int IDEOGRAPHIC = 10;
   public static final int HIRAGANA = 11;
+  public static final int KATAKANA = 12;
+  public static final int HANGUL = 13;
 
   /** String token types that correspond to token type int constants */
   public static final String [] TOKEN_TYPES = new String [] {
@@ -92,7 +94,9 @@
     "<ACRONYM_DEP>",
     "<SOUTHEAST_ASIAN>",
     "<IDEOGRAPHIC>",
-    "<HIRAGANA>"
+    "<HIRAGANA>",
+    "<KATAKANA>",
+    "<HANGUL>"
   };
 
   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro	(revision 1067174)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro	(working copy)
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-// Generated using ICU4J 4.6.0.0 on Thursday, January 6, 2011 7:02:52 PM UTC
+// Generated using ICU4J 4.6.0.0 on Friday, February 4, 2011 10:49:54 PM UTC
 // by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
 
 
@@ -123,3 +123,6 @@
   ([\ud83c][\uDE00])
 | ([\ud82c][\uDC01])
 )
+HangulSupp = (
+  []
+)
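To make the StandardFilter logic above easier to review: stripped of attribute handling, state capture, and offset bookkeeping, its core is an overlapping bigram window over code points, with a lone character falling back to a unigram. A hypothetical plain-JDK rendering of just that core (not the patch's code; class and method names are mine):

import java.util.ArrayList;
import java.util.List;

public class BigramSketch {
  // Overlapping bigrams over code points; a lone character becomes a unigram.
  // Working in code points rather than chars keeps supplementary ideographs
  // intact, which is also why the patch's buffer is an int[] whose fill loop
  // advances by Character.charCount(cp).
  static List<String> bigrams(String cjkRun) {
    List<Integer> cps = new ArrayList<Integer>();
    for (int i = 0; i < cjkRun.length(); ) {
      int cp = cjkRun.codePointAt(i);
      cps.add(cp);
      i += Character.charCount(cp);
    }
    List<String> out = new ArrayList<String>();
    if (cps.size() == 1) {
      out.add(new String(Character.toChars(cps.get(0))));
      return out;
    }
    for (int i = 0; i + 1 < cps.size(); i++) {
      out.add(new String(Character.toChars(cps.get(i)))
            + new String(Character.toChars(cps.get(i + 1))));
    }
    return out;
  }

  public static void main(String[] args) {
    System.out.println(bigrams("一二三四五六七八九十")); // [一二, 二三, ..., 九十], as in testJa1
    System.out.println(bigrams("一"));                   // [一], as in testSingleChar
  }
}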
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java	(revision 1067174)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java	(working copy)
@@ -44,7 +44,9 @@
  * please search google
  *
+ * @deprecated Use StandardTokenizer, StandardFilter, CJKWidthFilter, and LowerCaseFilter instead.
  */
+@Deprecated
 public final class CJKTokenizer extends Tokenizer {
     //~ Static fields/initializers ---------------------------------------------
 
     /** Word token type */
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java	(revision 1067174)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java	(working copy)
@@ -22,8 +22,12 @@
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.util.Version;
 
@@ -86,7 +90,15 @@
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
       Reader reader) {
-    final Tokenizer source = new CJKTokenizer(reader);
-    return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
+    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+      TokenStream result = new StandardFilter(matchVersion, source);
+      result = new CJKWidthFilter(result);
+      result = new LowerCaseFilter(matchVersion, result);
+      return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
+    } else {
+      final Tokenizer source = new CJKTokenizer(reader);
+      return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
+    }
   }
 }
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilter.java	(revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilter.java	(revision 0)
@@ -0,0 +1,34 @@
+package org.apache.lucene.analysis.cjk;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * A {@link TokenFilter} that normalizes fullwidth forms into half-width forms.
+ */
+public final class CJKWidthFilter extends TokenFilter {
+  private CharTermAttribute termAtt;
+
+  public CJKWidthFilter(TokenStream input) {
+    super(input);
+    termAtt = addAttribute(CharTermAttribute.class);
+  }
+
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      char text[] = termAtt.buffer();
+      int length = termAtt.length();
+      for (int i = 0; i < length; i++) {
+        if (text[i] >= 0xFF01 && text[i] <= 0xFF5E)
+          text[i] -= 0xFEE0;
+        // todo: add narrow decomposition
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\cjk\CJKWidthFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native
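Finally, a note on CJKWidthFilter's arithmetic: the fullwidth ASCII variants U+FF01..U+FF5E sit at a fixed offset of 0xFEE0 above their halfwidth counterparts, so a single subtraction folds them. A quick check (class name is mine, plain JDK):

public class WidthFoldCheck {
  public static void main(String[] args) {
    char full = '\uFF34';                // FULLWIDTH LATIN CAPITAL LETTER T
    char half = (char) (full - 0xFEE0);  // 'T' (U+0054)
    System.out.println(half);            // T
    // The filter applies this to every char in [U+FF01, U+FF5E]; lowercasing
    // is left to the LowerCaseFilter that follows it in the CJKAnalyzer chain,
    // which is how testFullWidth ends up with "test" and "1234".
  }
}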