Index: modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java =================================================================== --- modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java (revision 1068357) +++ modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java (working copy) @@ -128,11 +128,10 @@ /* * For chinese, tokenize as char (these can later form bigrams or whatever) - * TODO: why do full-width numerics have no word-break prop? */ public void testChinese() throws Exception { assertAnalyzesTo(a, "我是中国人。 1234 Tests ", - new String[] { "我", "是", "中", "国", "人", "tests"}); + new String[] { "我", "是", "中", "国", "人", "1234", "tests"}); } public void testEmpty() throws Exception { @@ -221,4 +220,16 @@ new String[] {"david", "has", "5000", "bones"}, new String[] { "", "", "", "" }); } + + public void testKorean() throws Exception { + BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음", + new String[] { "훈민정음" }, + new String[] { "" }); + } + + public void testJapanese() throws Exception { + BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ", + new String[] { "仮", "名", "遣", "い", "カタカナ" }, + new String[] { "", "", "", "", "" }); + } } Index: modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java =================================================================== --- modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java (revision 1068357) +++ modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java (working copy) @@ -20,6 +20,8 @@ import java.io.IOException; import java.io.InputStream; +import org.apache.lucene.analysis.standard.StandardTokenizer; + import com.ibm.icu.lang.UScript; import com.ibm.icu.text.BreakIterator; import com.ibm.icu.text.RuleBasedBreakIterator; @@ -44,20 +46,24 @@ */ 
public class DefaultICUTokenizerConfig extends ICUTokenizerConfig { /** Token type for words containing ideographic characters */ - public static final String WORD_IDEO = ""; - /** Token type for words containing Japanese kana */ - public static final String WORD_KANA = ""; + public static final String WORD_IDEO = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]; + /** Token type for words containing Japanese hiragana */ + public static final String WORD_HIRAGANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA]; + /** Token type for words containing Japanese katakana */ + public static final String WORD_KATAKANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA]; + /** Token type for words containing Korean hangul */ + public static final String WORD_HANGUL = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL]; /** Token type for words that contain letters */ - public static final String WORD_LETTER = ""; + public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM]; /** Token type for words that appear to be numbers */ - public static final String WORD_NUMBER = ""; + public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM]; /* * the default breakiterators in use. these can be expensive to * instantiate, cheap to clone. */ private static final BreakIterator rootBreakIterator = - BreakIterator.getWordInstance(ULocale.ROOT); + readBreakIterator("Default.brk"); private static final BreakIterator thaiBreakIterator = BreakIterator.getWordInstance(new ULocale("th_TH")); private static final BreakIterator hebrewBreakIterator = @@ -87,9 +93,9 @@ case RuleBasedBreakIterator.WORD_IDEO: return WORD_IDEO; case RuleBasedBreakIterator.WORD_KANA: - return WORD_KANA; + return script == UScript.HIRAGANA ? WORD_HIRAGANA : WORD_KATAKANA; case RuleBasedBreakIterator.WORD_LETTER: - return WORD_LETTER; + return script == UScript.HANGUL ? 
WORD_HANGUL : WORD_LETTER; case RuleBasedBreakIterator.WORD_NUMBER: return WORD_NUMBER; default: /* some other custom code */ Index: modules/analysis/icu/src/data/uax29/Default.rbbi =================================================================== --- modules/analysis/icu/src/data/uax29/Default.rbbi (revision 0) +++ modules/analysis/icu/src/data/uax29/Default.rbbi (revision 0) @@ -0,0 +1,127 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Default RBBI rules, based on UAX#29. +# + +!!chain; + +# +# Character Class Definitions. +# + +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$ALetter = [\p{Word_Break = ALetter}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidLetter = [\p{Word_Break = MidLetter}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; + + +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. 
Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. + +$dictionary = [:LineBreak = Complex_Context:]; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not + # include the dictionary characters. + +# +# Rule 4 - Ignore Format and Extend characters, +# except when they appear at the beginning of a region of text. +# +$KatakanaEx = $Katakana ($Extend | $Format)*; +$ALetterEx = $ALetterPlus ($Extend | $Format)*; +$MidNumLetEx = $MidNumLet ($Extend | $Format)*; +$MidLetterEx = $MidLetter ($Extend | $Format)*; +$MidNumEx = $MidNum ($Extend | $Format)*; +$NumericEx = $Numeric ($Extend | $Format)*; +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; + +$Hiragana = [\p{script=Hiragana}]; +$Ideographic = [\p{Ideographic}]; +$HiraganaEx = $Hiragana ($Extend | $Format)*; +$IdeographicEx = $Ideographic ($Extend | $Format)*; + +## ------------------------------------------------- + +!!forward; + + +# Rule 3 - CR x LF +# +$CR $LF; + +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. The rule here comes into play when the start of text +# begins with a group of Format chars, or with a "word" consisting of a single +# char that is not in any of the listed word break categories followed by +# format char(s). +[^$CR $LF $Newline]? ($Extend | $Format)+; + +$NumericEx {100}; +$ALetterEx {200}; +$KatakanaEx {300}; # note: these status values override those from rule 5 +$HiraganaEx {300}; # by virtue of being numerically larger. +$IdeographicEx {400}; # + +# +# rule 5 +# Do not break between most letters. 
+# +$ALetterEx $ALetterEx {200}; + +# rule 6 and 7 +$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; + +# rule 8 + +$NumericEx $NumericEx {100}; + +# rule 9 + +$ALetterEx $NumericEx {200}; + +# rule 10 + +$NumericEx $ALetterEx {200}; + +# rule 11 and 12 + +$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100}; + +# rule 13 + +$KatakanaEx $KatakanaEx {300}; + +# rule 13a/b + +$ALetterEx $ExtendNumLetEx {200}; # (13a) +$NumericEx $ExtendNumLetEx {100}; # (13a) +$KatakanaEx $ExtendNumLetEx {300}; # (13a) +$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) + +$ExtendNumLetEx $ALetterEx {200}; # (13b) +$ExtendNumLetEx $NumericEx {100}; # (13b) +$ExtendNumLetEx $KatakanaEx {300}; # (13b) Index: modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java (revision 1068357) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java (working copy) @@ -207,4 +207,16 @@ new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"}, new String[] { "", "", "", "", "", "" }); } + + public void testKorean() throws Exception { + BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음 2007년 3월 22일", + new String[] { "훈민정음", "2007년", "3월", "22일" }, + new String[] { "", "", "", "" }); + } + + public void testJapanese() throws Exception { + BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ", + new String[] { "仮", "名", "遣", "い", "カタカナ" }, + new String[] { "", "", "", "", "" }); + } } Index: modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java (revision 1068357) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java 
(working copy) @@ -406,4 +406,16 @@ new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"}, new String[] { "", "", "", "", "", "" }); } + + public void testKorean() throws Exception { + BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음", + new String[] { "훈민정음" }, + new String[] { "" }); + } + + public void testJapanese() throws Exception { + BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ", + new String[] { "仮", "名", "遣", "い", "カタカナ" }, + new String[] { "", "", "", "", "" }); + } } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (revision 1068357) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (working copy) @@ -59,6 +59,8 @@ Han = ([\p{Script:Han}] | {HanSupp}) Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp}) +// Script=Hangul & Aletter +HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})* // UAX#29 WB4. X (Extend | Format)* --> X // ALetterEx = {ALetter} ({Format} | {Extend})* @@ -90,6 +92,10 @@ public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC; public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA; + + public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA; + + public static final int HANGUL_TYPE = StandardTokenizer.HANGUL; public final int yychar() { @@ -123,6 +129,12 @@ {ExtendNumLetEx}* { return NUMERIC_TYPE; } +// subset of the below for typing purposes only! +{HangulEx}+ + { return HANGUL_TYPE; } + +{KatakanaEx}+ + { return KATAKANA_TYPE; } // UAX#29 WB5. ALetter × ALetter // WB6. 
ALetter × (MidLetter | MidNumLet) ALetter Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (revision 1068357) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (working copy) @@ -77,6 +77,8 @@ Han = ([\p{Script:Han}] | {HanSupp}) Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp}) +// Script=Hangul & Aletter +HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})* // UAX#29 WB4. X (Extend | Format)* --> X // ALetterEx = {ALetter} ({Format} | {Extend})* @@ -168,16 +170,16 @@ %{ /** Alphanumeric sequences */ - public static final String WORD_TYPE = ""; + public static final String WORD_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM]; /** Numbers */ - public static final String NUMERIC_TYPE = ""; + public static final String NUMERIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM]; /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */ public static final String URL_TYPE = ""; /** E-mail addresses */ - public static final String EMAIL_TYPE = " * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA */ - public static final String SOUTH_EAST_ASIAN_TYPE = ""; + public static final String SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN]; - public static final String IDEOGRAPHIC_TYPE = ""; + public static final String IDEOGRAPHIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]; - public static final String HIRAGANA_TYPE = ""; + public static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA]; + public static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA]; + + public static 
final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL]; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncrAtt @@ -316,7 +322,13 @@ {ExtendNumLetEx}* { if (populateAttributes(NUMERIC_TYPE)) return true; } +// subset of the below for typing purposes only! +{HangulEx}+ + { if (populateAttributes(HANGUL_TYPE)) return true; } +{KatakanaEx}+ + { if (populateAttributes(KATAKANA_TYPE)) return true; } + // UAX#29 WB5. ALetter × ALetter // WB6. ALetter × (MidLetter | MidNumLet) ALetter // WB7. ALetter (MidLetter | MidNumLet) × ALetter Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 1068357) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy) @@ -78,6 +78,8 @@ public static final int SOUTHEAST_ASIAN = 9; public static final int IDEOGRAPHIC = 10; public static final int HIRAGANA = 11; + public static final int KATAKANA = 12; + public static final int HANGUL = 13; /** String token types that correspond to token type int constants */ public static final String [] TOKEN_TYPES = new String [] { @@ -92,7 +94,9 @@ "", "", "", - "" + "", + "", + "" }; private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;