Index: solr/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java
===================================================================
--- solr/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java (revision 0)
+++ solr/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java (revision 0)
@@ -0,0 +1,30 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ja.JapaneseTokenizer;
+/** Factory whose {@code create} wraps the given reader in a {@link JapaneseTokenizer}. */
+public class JapaneseTokenizerFactory extends BaseTokenizerFactory {
+  public Tokenizer create(Reader input) {
+    // No factory arguments are read here. TODO: add support for params if needed
+    return new JapaneseTokenizer(input);
+  }
+}

Property changes on: solr\src\java\org\apache\solr\analysis\JapaneseTokenizerFactory.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: modules/analysis/common/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java (revision 0)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java (revision 0)
@@ -0,0 +1,89 @@
+package org.apache.lucene.analysis.ja;
+
+/*
+ **
+ ** Mar. 24, 2009
+ **
+ ** The author disclaims copyright to this source code.
+ ** In place of a legal notice, here is a blessing:
+ **
+ ** May you do good and not evil.
+ ** May you find forgiveness for yourself and forgive others.
+ ** May you share freely, never taking more than you give.
+ **
+ ** Stolen from SQLite :-)
+ ** Any feedback is welcome.
+ ** Kohei TAKETA
+ **
+ */
+
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.ja.tinysegmenter.TinySegmenter;
+import org.junit.Test;
+
+/**
+ * Checks the terms and offsets produced by {@link JapaneseTokenizer}, including
+ * the small-buffer and small-token-size configurations.
+ */
+public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
+  /** A short sentence is split into words with matching start and end offsets. */
+  @Test
+  public void testBasics() throws Exception {
+    JapaneseTokenizer stream = new JapaneseTokenizer(new StringReader("本日は晴天なり。"));
+
+    String[] terms = {"本日", "は", "晴天", "なり", "。"};
+    int[] starts = {0, 2, 3, 5, 7};
+    int[] ends = {2, 3, 5, 7, 8};
+    assertTokenStreamContents(stream, terms, starts, ends);
+  }
+
+  /** A 7-slot buffer must yield exactly the same stream as a 1024-slot one. */
+  @Test
+  public void testMinimalBufferSize() throws Exception {
+    String text = "メロスは激怒した。" + "必ず、かの邪智暴虐の王を除かなければならぬと決意した。"
+        + "メロスには政治がわからぬ。" + "メロスは、村の牧人である。"
+        + "笛を吹き、羊と遊んで暮して来た。" + "けれども邪悪に対しては、人一倍に敏感であった。";
+    int maxTokenSize = TinySegmenter.DEFAULT_MAX_TOKEN_SIZE;
+
+    JapaneseTokenizer small = new JapaneseTokenizer(new StringReader(text), 7, maxTokenSize);
+    JapaneseTokenizer large = new JapaneseTokenizer(new StringReader(text), 1024, maxTokenSize);
+
+    // Advance both streams in lock step; they must agree token by token.
+    while (small.incrementToken()) {
+      assertTrue(large.incrementToken());
+      assertEquals(small, large);
+    }
+    // The large-buffer stream must run out at the same point.
+    assertFalse(large.incrementToken());
+  }
+
+  /** Empty input produces no tokens. */
+  @Test
+  public void testEmptyStream() throws Exception {
+    JapaneseTokenizer stream = new JapaneseTokenizer(new StringReader(""));
+    assertFalse(stream.incrementToken());
+  }
+
+  /** Latin words stay whole; each space and each digit is a token of its own. */
+  @Test
+  public void testEnglishText() throws Exception {
+    JapaneseTokenizer stream = new JapaneseTokenizer(new StringReader("This is a test 1234"));
+
+    String[] terms = {"This", " ", "is", " ", "a", " ", "test", " ", "1", "2", "3", "4"};
+    assertTokenStreamContents(stream, terms);
+  }
+
+  /** With a maximum token size of 2 no expected token is longer than two chars. */
+  @Test
+  public void testMaxTokenSize() throws Exception {
+    JapaneseTokenizer stream =
+        new JapaneseTokenizer(new StringReader("一日作さざれば、一日食わず。"), 1024, 2);
+
+    String[] terms = {"一日", "作", "さざ", "れ", "ば", "、", "一", "日", "食わ", "ず", "。"};
+    int[] starts = {0, 2, 3, 5, 6, 7, 8, 9, 10, 12, 13};
+    int[] ends = {2, 3, 5, 6, 7, 8, 9, 10, 12, 13, 14};
+    assertTokenStreamContents(stream, terms, starts, ends);
+  }
+}

Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\ja\TestJapaneseTokenizer.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java (revision 0)
@@ -0,0 +1,94 @@
+/*
+ **
+ ** Mar. 24, 2009
+ **
+ ** The author disclaims copyright to this source code.
+ ** In place of a legal notice, here is a blessing:
+ **
+ ** May you do good and not evil.
+ ** May you find forgiveness for yourself and forgive others.
+ ** May you share freely, never taking more than you give.
+ **
+ ** Stolen from SQLite :-)
+ ** Any feedback is welcome.
+ ** Kohei TAKETA
+ **
+ */
+package org.apache.lucene.analysis.ja;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ja.tinysegmenter.BasicCodePointReader;
+import org.apache.lucene.analysis.ja.tinysegmenter.TinySegmenter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+/**
+ * A {@link Tokenizer} that splits its input into words with {@link TinySegmenter}.
+ *
+ * @author taketa
+ */
+public final class JapaneseTokenizer extends Tokenizer {
+  /** Token type set on every token this tokenizer emits. */
+  public static final String TOKENTYPE_WORD = "word".intern();
+
+  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+
+  private final int bufferSize;
+  private final int maxTokenSize;
+
+  /** Segmenter over the current input; rebuilt by {@link #reset(Reader)}. */
+  private TinySegmenter segmenter;
+  /** Uncorrected end offset of the most recent token; reported by {@link #end()}. */
+  private long lastOffset;
+
+  /** Creates a tokenizer using TinySegmenter's default buffer and token sizes. */
+  public JapaneseTokenizer(Reader in) {
+    this(in, TinySegmenter.DEFAULT_BUFFER_SIZE, TinySegmenter.DEFAULT_MAX_TOKEN_SIZE);
+  }
+
+  /** Creates a tokenizer whose buffer and token sizes are passed on to {@link TinySegmenter}. */
+  public JapaneseTokenizer(Reader in, int bufferSize, int maxTokenSize) {
+    super(in);
+    this.bufferSize = bufferSize;
+    this.maxTokenSize = maxTokenSize;
+    this.segmenter = new TinySegmenter(new BasicCodePointReader(in), bufferSize, maxTokenSize);
+  }
+
+  /** Copies the segmenter's next token into the attributes; false once it has none left. */
+  @Override
+  public boolean incrementToken() throws IOException {
+    final TinySegmenter.Token next = segmenter.next();
+    if (next != null) {
+      clearAttributes();
+      termAttribute.setEmpty().append(next.str);
+      final int startOffset = correctOffset((int) next.start);
+      final int endOffset = correctOffset((int) next.end);
+      offsetAttribute.setOffset(startOffset, endOffset);
+      typeAttribute.setType(TOKENTYPE_WORD);
+      lastOffset = next.end;
+      return true;
+    }
+    return false;
+  }
+
+  /** Sets both offsets to the corrected end of the last token returned. */
+  @Override
+  public void end() {
+    final int last = correctOffset((int) lastOffset);
+    offsetAttribute.setOffset(last, last);
+  }
+
+  /** Switches to a new input: fresh segmenter, offset tracking restarted at zero. */
+  @Override
+  public void reset(Reader in) throws IOException {
+    super.reset(in);
+    this.segmenter = new TinySegmenter(new BasicCodePointReader(in), bufferSize, maxTokenSize);
+    this.lastOffset = 0;
+  }
+}

Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\ja\JapaneseTokenizer.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ja/tinysegmenter/CodePointReader.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/ja/tinysegmenter/CodePointReader.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/ja/tinysegmenter/CodePointReader.java (revision 0)
@@ -0,0 +1,59 @@
+package org.apache.lucene.analysis.ja.tinysegmenter;
+
+/*
+ **
+ ** Feb. 17, 2009
+ **
+ ** The author disclaims copyright to this source code.
+ ** In place of a legal notice, here is a blessing:
+ **
+ ** May you do good and not evil.
+ ** May you find forgiveness for yourself and forgive others.
+ ** May you share freely, never taking more than you give.
+ **
+ ** Stolen from SQLite :-)
+ ** Any feedback is welcome.
+ ** Kohei TAKETA
+ **
+ */
+
+import java.io.IOException;
+
+/**
+ * Reads a character stream one Unicode code point at a time, recognizing surrogate pairs.
+ */
+public interface CodePointReader {
+  /**
+   * Sets the code point that replaces malformed surrogate sequences.
+   * {@link BasicCodePointReader} starts out with
+   * {@link BasicCodePointReader#DEFAULT_ALTERNATION_CODEPOINT}.
+   *
+   * @param cp the replacement code point
+   */
+  void setAlternationCodePoint(int cp);
+
+  /**
+   * Returns the code point that replaces malformed surrogate sequences.
+   *
+   * @return the replacement code point
+   */
+  int getAlternationCodePoint();
+
+  /**
+   * Returns the current position in the character stream. Positions count chars,
+   * not code points, so a surrogate pair advances the position by 2.
+   *
+   * @return the position in the character stream
+   */
+  long getPosition();
+
+  /**
+   * Reads the next code point.
+   *
+   * @return the Unicode code point
+   */
+  int read() throws IOException;
+
+  /** Resets the reader's state. */
+  void reset();
+}

Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\ja\tinysegmenter\CodePointReader.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ja/tinysegmenter/BasicCodePointReader.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/ja/tinysegmenter/BasicCodePointReader.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/ja/tinysegmenter/BasicCodePointReader.java (revision 0)
@@ -0,0 +1,117 @@
+package org.apache.lucene.analysis.ja.tinysegmenter;
+
+/*
+ **
+ ** Feb. 1, 2009
+ **
+ ** The author disclaims copyright to this source code.
+ ** In place of a legal notice, here is a blessing:
+ **
+ ** May you do good and not evil.
+ ** May you find forgiveness for yourself and forgive others.
+ ** May you share freely, never taking more than you give.
+ **
+ ** Stolen from SQLite :-)
+ ** Any feedback is welcome.
+ ** Kohei TAKETA
+ **
+ */
+
+import java.io.IOException;
+import java.io.PushbackReader;
+import java.io.Reader;
+
+/**
+ * Decorator around a {@link Reader} that returns one Unicode code point per call,
+ * combining surrogate pairs. A malformed surrogate sequence is replaced by the
+ * code point returned by {@link #getAlternationCodePoint()}.
+ */
+public class BasicCodePointReader implements CodePointReader {
+  /** Default replacement for malformed surrogate sequences. */
+  public static final int DEFAULT_ALTERNATION_CODEPOINT = '〓';
+
+  private final PushbackReader reader;
+  private long position = 0;
+  private int alternationCodePoint = DEFAULT_ALTERNATION_CODEPOINT;
+  private boolean eos = false;
+
+  /**
+   * Wraps the given reader in a one-char {@link PushbackReader}.
+   *
+   * @param reader the source of chars
+   */
+  public BasicCodePointReader(Reader reader) {
+    this.reader = new PushbackReader(reader, 1);
+  }
+
+  public void setAlternationCodePoint(int cp) {
+    this.alternationCodePoint = cp;
+  }
+
+  public int getAlternationCodePoint() {
+    return alternationCodePoint;
+  }
+
+  /** Returns the number of chars (not code points) consumed since creation or reset(). */
+  public long getPosition() {
+    return position;
+  }
+
+  /**
+   * Reads the next code point. A lone or mismatched surrogate is returned as the
+   * alternation code point. The position advances only for chars that were
+   * actually consumed, so it never moves past the end of the stream.
+   *
+   * @return the code point, or -1 at end of stream
+   * @throws IOException if the wrapped reader fails
+   */
+  public int read() throws IOException {
+    if (eos) {
+      return -1;
+    }
+
+    int ci = reader.read();
+    if (ci < 0) {
+      // End of stream: nothing was consumed, so the position stays put.
+      eos = true;
+      return -1;
+    }
+    ++position;
+    char c = (char) ci;
+
+    if (Character.isHighSurrogate(c)) {
+      // Inspect the following char.
+      ci = reader.read();
+      if (ci < 0) {
+        // The stream ends with a high surrogate: return the alternation
+        // code point and raise the end-of-stream flag.
+        eos = true;
+        return alternationCodePoint;
+      }
+
+      char c2 = (char) ci;
+      if (Character.isLowSurrogate(c2)) {
+        // Valid surrogate pair: count the second char and combine the two.
+        ++position;
+        return Character.toCodePoint(c, c2);
+      } else {
+        // The char after the high surrogate is not a low surrogate:
+        // push it back and return the alternation code point.
+        reader.unread(c2);
+        return alternationCodePoint;
+      }
+    } else if (Character.isLowSurrogate(c)) {
+      // Lone low surrogate: return the alternation code point.
+      return alternationCodePoint;
+    } else {
+      // Ordinary BMP char: return it unchanged.
+      return c;
+    }
+  }
+
+  /** Resets the position and end-of-stream flag; the wrapped reader is left as is. */
+  public void reset() {
+    position = 0;
+    eos = false;
+  }
+}

Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\ja\tinysegmenter\BasicCodePointReader.java
___________________________________________________________________
Added:
svn:eol-style + native Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ja/tinysegmenter/TinySegmenterConstants.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/ja/tinysegmenter/TinySegmenterConstants.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/ja/tinysegmenter/TinySegmenterConstants.java (revision 0) @@ -0,0 +1,350 @@ +package org.apache.lucene.analysis.ja.tinysegmenter; + +import java.util.Collections; +import java.util.Map; +import java.util.HashMap; + +public class TinySegmenterConstants +{ + public static final int BIAS = -332; + public static final String[] BC1_KEYS = {"OH","II","HH","KH"}; + public static final Integer[] BC1_VALS = {-1378,2461,6,406}; + public static final String[] BC2_KEYS = {"AN","MK","HH","IA","KI","KK","HM","AA","HN","HO","IH","II","IK","AI","IO","MH","OO"}; + public static final Integer[] BC2_VALS = {-878,3334,-4070,1327,3831,-8741,-1711,-3267,4012,3761,-1184,-1332,1721,2744,5492,-3132,-2920}; + public static final String[] BC3_KEYS = {"MK","MM","HH","HI","HK","OA","KK","HN","HO","IH","OH"}; + public static final Integer[] BC3_VALS = {1079,4034,996,626,-721,-1652,2762,-1307,-836,-301,266}; + public static final String[] BP1_KEYS = {"BB","UB","OB","OO"}; + public static final Integer[] BP1_VALS = {295,352,304,-125}; + public static final String[] BP2_KEYS = {"BO","OO"}; + public static final Integer[] BP2_VALS = {60,-1762}; + public static final String[] BQ1_KEYS = {"BHM","OHI","OKH","OKK","BII","BOH","OIH","BIM","BOO","BMH","OKA","OOO","BHH","BNH"}; + public static final Integer[] BQ1_VALS = {1521,451,-1020,904,-1158,-91,-296,886,-2597,1208,1851,2965,1150,449}; + public static final String[] BQ2_KEYS = {"BKK","OHH","BHM","BKO","BIH","OHM","OIH","UHI","BHH","BHI"}; + public static final Integer[] BQ2_VALS = {-1720,-1139,466,864,-919,-181,153,-1146,118,-1159}; + public static final String[] 
BQ3_KEYS = {"OHH","OKH","OKI","BNN","BII","OHM","BOH","OKO","OII","BMH","OMH","OOO","BMM","BHH","BHI","BKI"}; + public static final Integer[] BQ3_VALS = {2174,1798,-793,998,-299,439,775,-2242,280,937,-2402,11699,8335,-792,2664,419}; + public static final String[] BQ4_KEYS = {"BKK","OHH","OHK","BIH","BII","BIK","ONN","BOO","OAH","BMI","BHH"}; + public static final Integer[] BQ4_VALS = {-1806,266,-2036,3761,-4654,1348,-973,-12396,926,-3385,-3895}; + public static final String[] BW1_KEYS = {"引き","から","いう","を見","平方","B1同","てい","たち","大阪","B1あ","ませ","取り","には","てき","すで","毎日","どこ","なん","さら","こと","まで","の中","そこ","いっ","がら","とみ","さん","にも","った","ない","」と","つい","ため","した","うん","本当","でき","、と","やむ","よっ","まま","して","、同","に対","亡く","B1同","」と","です","大き","B1あ","をし","あっ","まる","京都","こん","なっ","とい","いる",",と","れた","など",",同","の一","目指","うし","れで","では","それ","こう","にし","日本"}; + public static final Integer[] BW1_VALS = {-1336,3472,1743,731,-2314,542,805,1122,1497,1404,2448,-2784,1498,1249,-3399,-2113,3887,-1113,-4143,2083,1711,741,1977,-2055,600,1922,4573,1671,3463,5713,1682,-802,601,2641,665,-2423,1127,660,-1947,-2565,2600,1104,727,-912,-1886,542,1682,3445,-2604,1404,1860,1505,-2155,2558,-1262,3015,-4915,672,660,2369,7379,727,-501,-724,-4817,-913,844,-871,-790,2468,-195}; + public static final String[] BW2_KEYS = {"――","れば","とこ","に対","11","んだ","はい","くな","一部","委員","ので","でも","いう","のに","はが","んな","新聞","とと","のの","会社","同党","との","もい","めて","しい","はず","一方","を通","少な","しか","上が","され","とみ","−−","とも","ない","本人","った","さん","に関","なが","って","っと","手権","した","かし","らか","曜日","年度","して","その","しな","もの","一人","東京","がい","らし","米国","一日","など","にお","うか","日米","たい","なの","らに","大阪","にし","府県","かも","りし","社会","から","まし","かれ","ばれ","てい","たた","にな","ただ","たち","第に","われ","てき","たと","てく","なん","同日","..","まで","きた","たは","こと","然と","この","がら","りま","でい","によ","11","でき","に従","ては","立て","でし","です","まれ","れた","ても","とい","分の","のか","ろう","出て","日本","れて","年間","日新","朝鮮","させ"}; + public static final Integer[] BW2_VALS = 
{-5730,4114,-1746,-14943,-669,728,1073,-1597,-1051,-1250,-7059,-4203,-1609,-6041,-1033,-4115,-4066,-2279,-6125,-1116,970,720,2230,-3153,-1819,-2532,-1375,-11877,-1050,-545,-4479,13168,5168,-13175,-3941,-2488,-2697,4589,-3977,-11388,-1313,1647,-2094,-1982,5078,-1350,-944,-601,-8669,972,-3744,939,-10713,602,-1543,853,-1611,-4268,970,-6509,-1615,2490,3372,-1253,2614,-1897,-2471,2748,-2363,-602,651,-1276,-7194,-1316,4612,1813,6144,-662,2454,-3857,-786,-1612,7901,3640,1224,2551,3099,-913,-11822,-6621,1941,-939,-8392,-1384,-4193,-3198,1620,2666,-7236,-669,-1528,-4688,-3110,-990,-3828,-4761,5409,4270,-3065,1890,-7758,2093,6067,2163,-7068,849,-1626,-722,-2355,4533}; + public static final String[] BW3_KEYS = {"でに","市","るる","では","れば","日、","た.","とし","が、","す.","んだ","に、","いい","んで","どう","いえ","新聞","た。","あり","ある","いく","れる","との","す。","そう","しい","だ.","ず,","カ月","いた","いっ","大会","とも","さを","ない","った","だ。","かけ","って","ず、","した","なく","れ,","して","しな","かっ","らし","けど","カ月","れ、","かに","がき","の,","など","がけ","いる","たい","しま","いわ","会議","にし","がっ","の、","うち","社会","から","かり","うと","の子","まし","てい","は,","ます","にな","い.","てお","われ","には","まっ","られ","まで","たの","きた","し,","こと","は、","べき","この","い。","がら","がり","か.","だっ","し、","たり","たる","さい","始め","ずに","する","です","か。","まれ","日,","ころ","あた","れた","えと","が,","ても","とう","れて","入り","に,"}; + public static final Integer[] BW3_VALS = {-1482,965,3818,2295,-3246,974,8875,2266,1816,-1310,606,-1021,5308,798,4664,2079,-5055,8875,719,3846,3029,1091,541,-1310,428,-3714,4098,3426,990,2056,1883,2217,-3543,976,1796,-4748,4098,-743,300,3426,3562,-903,854,1449,2608,-4098,1479,1374,990,854,-669,-4855,-724,2135,-1127,5600,-594,1200,1527,860,1771,-913,-724,1117,2024,6520,-2670,4798,-1000,1113,6240,1337,6943,1906,-1185,855,-605,2644,-1549,6820,6154,812,1645,1557,7397,1337,2181,1542,-1185,-4977,-2064,2857,1004,1557,-1183,-853,-714,1681,841,6521,1437,2857,-793,974,-2757,-2194,1850,1454,1816,302,-1387,1375,1232,-1021}; + public static final String[] TC1_KEYS = 
{"HOM","MMH","AAA","IHI","OOI","HHH","IOH","HHM","IOI","HII","HOH","IOM"}; + public static final Integer[] TC1_VALS = {-331,187,1093,1169,-1832,1029,-142,580,-1015,998,-390,467}; + public static final String[] TC2_KEYS = {"IHI","OII","HMM","KKH","HHO","HII"}; + public static final Integer[] TC2_VALS = {-1965,-2649,-1154,703,2088,-1023}; + public static final String[] TC3_KEYS = {"HHH","HHI","KOK","IOI","IIH","AAA","KKA","IIM","MHH","OHO","KKH","KHH","MHM","MHO","IHH","IHI","MMH","IHO","HOH","NNH","HII","HIK","NNO"}; + public static final Integer[] TC3_VALS = {346,-341,-1009,-542,-825,-294,491,-1035,-2694,-3393,-1217,-1216,-457,123,128,-3041,-471,-1935,-1486,-1689,-1088,731,662}; + public static final String[] TC4_KEYS = {"MOM","HHH","HHI","HHK","HHM","IIH","HHN","III","HHO","KKA","IOO","MHH","IIO","MHI","KKK","IHH","MMH","IHO","HOH","MMM","HIH","HII","KAK"}; + public static final Integer[] TC4_VALS = {841,-203,1344,365,-122,321,182,1497,669,3386,54,-405,656,201,3065,695,-241,-2324,446,661,804,679,4845}; + public static final String[] TQ1_KEYS = {"BHIH","OHHH","BOHH","OIIH","BNHH","OHIH","BIHH","BHHH","BHHI","OAKK","BIII","BOOO","OIHI"}; + public static final Integer[] TQ1_VALS = {-132,281,225,-68,-744,249,60,-227,316,482,1595,-908,200}; + public static final String[] TQ2_KEYS = {"BIHH","BKAK","BOOO","BIII"}; + public static final Integer[] TQ2_VALS = {-1401,-543,-5591,-1033}; + public static final String[] TQ3_KEYS = {"BHIH","BHII","OHII","OKAK","OOII","BHHH","OHHH","OHHI","BHHM","BIIH","BIII","OIIH","OOHH","OKKA","BMHI","BMHM","OHMH","OKHH","BOMH","OIHH"}; + public static final Integer[] TQ3_VALS = {222,-504,997,2792,-685,478,346,1729,-1073,-116,-105,1344,110,679,-863,-464,481,587,620,623}; + public static final String[] TQ4_KEYS = {"BHII","OHHH","OHHI","OKAK","OIIH","OIII","OHHO","OHIH","BHHH","OAKK","BIIH","BIII","OAAA","OIHH","BHHM","OIHI"}; + public static final Integer[] TQ4_VALS = 
{-966,-294,2446,-8156,626,-4007,480,-1573,-721,180,-607,-2181,-2763,1935,-3604,-493}; + public static final String[] TW1_KEYS = {"東京都","につい"}; + public static final Integer[] TW1_VALS = {2026,-4681}; + public static final String[] TW2_KEYS = {"だって","しょう","として","ある程","大きな","その後","ともに","ころが","対して","もので","社会党","ていた","一気に","いった","初めて","同時に"}; + public static final Integer[] TW2_VALS = {-1049,3873,-4657,-2049,-1255,-4430,-4517,-2434,-2721,1882,-3216,1833,-792,-1256,-1512,-8097}; + public static final String[] TW3_KEYS = {"ので、","として","のもの","にとっ","いただ","につい","してい","ので,","十二月","れから","に当た"}; + public static final Integer[] TW3_VALS = {-727,-4314,-600,-5989,-1734,-5483,1314,-727,-2287,-3752,-6247}; + public static final String[] TW4_KEYS = {"からな","ました","という","いう.","ようと","よると","たが,","ている","してい","いう。","ません","たが、"}; + public static final Integer[] TW4_VALS = {-2348,5543,1349,8576,-4258,5865,1516,1538,2958,8576,1097,1516}; + public static final String[] UC1_KEYS = {"M","O","K","A"}; + public static final Integer[] UC1_VALS = {645,-505,93,484}; + public static final String[] UC2_KEYS = {"M","N","O","H","I","A"}; + public static final Integer[] UC2_VALS = {3987,5775,646,1059,409,819}; + public static final String[] UC3_KEYS = {"A","I"}; + public static final Integer[] UC3_VALS = {-1370,2311}; + public static final String[] UC4_KEYS = {"M","N","O","H","I","K","A"}; + public static final Integer[] UC4_VALS = {3565,3876,6646,1809,-1032,-3450,-2643}; + public static final String[] UC5_KEYS = {"M","O","H","I","K"}; + public static final Integer[] UC5_VALS = {539,-831,313,-1238,-799}; + public static final String[] UC6_KEYS = {"M","O","H","I","K"}; + public static final Integer[] UC6_VALS = {247,-387,-506,-253,87}; + public static final String[] UP1_KEYS = {"O"}; + public static final Integer[] UP1_VALS = {-214}; + public static final String[] UP2_KEYS = {"B","O"}; + public static final Integer[] UP2_VALS = {69,935}; + public static final String[] UP3_KEYS = {"B"}; + public static final 
Integer[] UP3_VALS = {189}; + public static final String[] UQ1_KEYS = {"BH","BI","BK","BN","BO","OH","OI","OK","OO"}; + public static final Integer[] UQ1_VALS = {21,-12,-99,142,-56,-95,477,410,-2422}; + public static final String[] UQ2_KEYS = {"BH","BI","OK"}; + public static final Integer[] UQ2_VALS = {216,113,1759}; + public static final String[] UQ3_KEYS = {"BH","BI","BK","BM","BN","BO","OI","BA","ON"}; + public static final Integer[] UQ3_VALS = {42,1913,-7198,3160,6427,14761,-827,-479,-3212}; + public static final String[] UW1_KEYS = {"京","あ","委","う","が","き","「","こ","・","大","区","市","、","国","午","で","と","ど",",","に","「","の","は","日","生","理","都","も","や","よ","ら","県","り","主","れ","を","ん","・"}; + public static final Integer[] UW1_VALS = {-268,-941,729,-127,-553,121,-463,505,-135,561,-912,-411,156,-460,871,-201,-547,-123,156,-789,-463,-185,-847,-141,-408,361,-718,-466,-470,182,-292,-386,208,-402,169,-446,-137,-135}; + public static final String[] UW2_KEYS = {"揺","市","も","会","や","保","よ","最","り","初","る","れ","文","第","入","を","ん","自","ア","朝",",","カ","キ","事","本","西","新","「","」","、","見","ッ","ッ","北","〇","ア","小","子","「","カ","」","目","キ","開","相","間","副","大","学","天","太","理","人","区","県","日","立","次","三","年","不","強","東","込","世","あ","行","い","う","政","お","か","が","手","く","こ","中","さ","ざ","明","し","発","実","す","米","せ","そ","た","だ","民","主","つ","て","果","で","気","と","ど","な","議","に","の","は","ひ","調","べ","ま"}; + public static final Integer[] UW2_VALS = {-1033,-813,-1263,978,-402,362,1639,-630,-579,-3025,-694,571,-1355,810,548,-2516,2095,-1353,-587,-1843,-829,306,568,492,-1650,-744,-1682,-645,3145,-829,-3874,831,831,-3414,892,-587,-2009,-1519,-645,306,3145,-1584,568,1758,-242,-1257,-1566,-1769,760,-865,-483,752,-123,-422,-1165,-1815,-763,-2378,-758,-1060,-2150,1067,-931,3041,-302,-538,838,505,134,1522,-502,1454,-856,-1519,-412,1141,-968,878,540,-1462,1529,529,1023,-675,509,300,-1011,188,1837,-180,-861,-949,-291,-665,-268,-1740,-981,1273,1063,1198,-1764,130,-409,-1273,1010,1261,600}; + public static 
final String[] UW3_KEYS = {"1","低","前","関","何","作","李","村","費","口","込","立","、","学","総","々","副","〇","日","旧","右",",","」","線","平","年","〓","一","森","知","東","国","各","下","合","海","広","非","同","安","米","指","世","力","的","能","両","氏","民","府","実","思","中","あ","い","度","う","性","え","お","か","昨","が","生","主","く","け","げ","家","こ","ご","さ","用","し","元","す","通","せ","そ","第","グ","た","ち","っ","つ","て","時","で","と","町","ど","な","に","動","の","は","務","党","ひ","保","私","ふ","へ","ほ","ま","全","み","め","公","も","六","や","共","よ","ら","車","り","る","れ","軍","わ","を","金","ん","業","物","建","1","円","予","二","ア","決","再","直","和","型","特","英","小","化","少","北","系","グ","省","外","約","選","ス","者","県","税","ッ","ト","無","級","人","区","戸","千","核","今","午","ム","政","他","協","ル","ロ","」","・","当","ン","員","以","ッ","・","調","ア","教","州","法","曜","ス","−","駅","郎","ト","数","ム","分","市","自","郡","ル","最","統","ロ","ン","部","文","月","雨","初","得","長","別","電","期","見","場","開","新","妻","間","財"}; + public static final Integer[] UW3_VALS = {-800,811,2286,-1282,4265,-361,3094,364,1777,483,-1504,-960,4889,-1356,1163,-2311,4437,5827,2099,5792,1233,4889,2670,1255,-1804,2416,-3573,-1619,2438,-1528,-805,642,3588,-1759,-241,-495,-1030,2066,3906,-423,7767,-3973,-2087,365,7313,725,3815,2613,-1694,1605,-1008,-1291,653,-2696,1006,1452,2342,1822,1983,-4864,-1163,-661,3271,-273,-758,1004,388,401,1078,-3552,-3116,-1058,914,-395,4858,584,-1136,3685,-5228,1201,1319,842,-521,-1444,-1081,6167,-1248,2318,1691,1215,-899,-2788,2745,-949,4056,4555,-1872,3593,-2171,-2439,4231,-1798,1199,-5516,-4384,1574,-120,1205,-3030,2323,755,-788,-1880,-202,727,1835,649,5905,2773,1375,-1207,6620,2163,-518,484,461,-2352,-800,5807,-1193,974,551,-1073,3095,-1835,-837,1389,-3850,785,-513,1327,-3102,-1038,3066,1319,792,-241,3663,-681,874,6457,6293,401,-1350,521,979,1384,2742,4646,-488,-2309,5156,792,-783,1109,-2013,1889,-1006,1591,2201,2670,-3794,-3885,278,4513,-1368,-1350,-3794,-562,551,-1479,1155,1868,-951,874,-1723,1620,1026,521,3222,1109,457,3197,-2869,4404,1591,-937,-4229,2201,278,1200,-1489,4125,2009,2475,1905,421,
1129,-1045,360,1044,1219,-1432,1764,2016,1302,-733}; + public static final String[] UW4_KEYS = {"般","前","体","子","作","回","込","立","、","。","学","総","副","〇","行","日","来","「",",","」",".","線","近","年","〓","島","一","国","賞","庁","合","警","米","署","園","議","力","的","能","率","定","氏","民","気","中","あ","い","う","性","え","地","お","か","が","き","生","ぎ","く","け","産","げ","こ","ご","さ","し","じ","す","ず","せ","そ","先","田","第","た","だ","ち","っ","つ","て","時","で","と","町","な","に","ぬ","動","ね","の","館","は","ば","務","党","ひ","び","ふ","へ","べ","ほ","ま","み","む","め","も","ゃ","や","士","共","ょ","よ","ら","車","り","―","る","れ","軍","ろ","わ","野","を","ん","業","道","物","寺","内","円","予","目","事","高","和","院","井","カ","小","化","系","球","省","済","コ","多","約","選","者","セ","県","大","ッ","校","ト","沢","人","区","支","改","首","領","際","所","メ","政","屋","ラ","輪","リ","協","ル","「","」","・","ン","谷","員","以","ッ","ー","川","・","教","ー","経","カ","器","コ","セ","側","山","郎","ト","題","メ","市","ラ","リ","ル","最","統","ン","文","後","空","月","会","初","長","都","感","電","銀","規","木","場","間","参","塁","方"}; + public static final Integer[] UW4_VALS = 
{-852,1623,-1286,-4802,530,1500,-3370,-2112,3930,3508,-1397,940,3879,4999,-792,1798,-442,1895,3930,3798,3508,-994,929,374,-5156,-2056,-2069,-619,730,-4556,-1834,-1184,2937,749,-1200,-244,-302,2586,-730,672,-1057,5388,-2716,-910,2210,4752,-3435,-640,553,-2514,866,2405,530,6006,-4482,-1286,-3821,-3788,-4376,-1101,-4734,2255,1979,2864,-843,-2506,-731,1251,181,4091,601,-2900,788,5034,5408,-3654,-5882,-1659,3994,1829,7410,4547,1826,5433,6499,1853,-740,1413,7396,-1984,8578,1940,-2715,-2006,4249,-4134,1345,6665,-744,1464,1051,-2082,-882,-5046,4169,-2666,2795,-1413,-1212,-1544,3351,-2922,-1481,-9726,-4841,-14896,-2613,1158,-4570,-1783,-1100,13150,-2352,-1043,-1291,-735,-809,584,788,782,922,-190,2120,-681,-2297,-1768,2145,1910,776,786,-1267,-3485,-543,1789,1067,2171,2596,2145,1287,2997,571,-724,-360,-403,-939,1036,4517,856,787,1749,-1659,-2604,-1566,-1635,2182,-1328,-881,-1433,-541,1013,-856,1895,3798,-4371,-3637,-1000,-910,544,-724,-11870,-2667,-4371,704,-11870,1146,2145,-851,1789,1287,4292,-1500,-4866,-403,-792,-1635,2771,-881,-541,-856,845,-1169,-3637,522,456,-867,-9066,950,1347,357,1192,916,-878,-2213,792,-485,-1410,-2344,1555,-2094,-856}; + public static final String[] UW5_KEYS = {"み","市","1","め","ゃ","会","党","ょ","務","り","る","E2","れ","嵐","田","わ","郎","月","を","ん","町","題","統","空","イ","席",",",".","館","新","「","長","、","。","査","イ","「","京","相","E2","間","]","大","学","省","社","区","県","ル","日","機","ル","者","年","ン","ン","選","あ","所","い","う","格","え","お","か","が","き","ぎ","く","員","げ","定","中","さ","し","語","す","挙","思","表","氏","だ","ち","的","っ","つ","て","1","で","と","ど","な","議","に","の","は","研","べ","告"}; + public static final Integer[] UW5_VALS = 
{502,-2991,-514,865,3350,-1153,-654,854,3519,-208,429,-32768,504,-1304,240,419,-368,-4353,-1264,327,-3912,2368,1955,-813,241,921,465,-299,-689,-1682,363,786,465,-299,932,241,363,722,1319,-32768,1191,-2762,-1296,-548,-1052,-278,-901,-4003,451,218,-1508,451,-2233,1763,-343,-343,-1018,1655,-814,331,-503,1356,1199,527,647,-421,1624,1971,312,2104,-983,1785,-871,-1537,-1371,-1073,-852,1618,872,663,-1347,-1186,1093,-3149,52,921,-18,-514,-850,-127,1682,-787,1219,-1224,-635,-578,-997,1001,848}; + public static final String[] UW6_KEYS = {"1","E1","あ","空","委","う","業","か","が","会","く","一","郎","こ","じ","区","す","学","E1","市","1","た","、","。","っ","連","て","で","と",",","な","に","後",".","の","は","福","相","中","広","も","社","員","ル","前","件","り","る","ン","ル","を","ン","者"}; + public static final Integer[] UW6_VALS = {-270,306,-307,-822,798,189,-697,241,-73,624,-121,-277,1082,-200,1782,1792,383,-960,306,887,-270,-428,227,808,573,463,-1014,101,-105,227,-253,-149,535,808,-417,-236,974,753,201,-695,-206,-507,-1212,-673,302,-800,187,-135,-496,-673,195,-496,1811}; + public static final Map BC1; + public static final Map BC2; + public static final Map BC3; + public static final Map BP1; + public static final Map BP2; + public static final Map BQ1; + public static final Map BQ2; + public static final Map BQ3; + public static final Map BQ4; + public static final Map BW1; + public static final Map BW2; + public static final Map BW3; + public static final Map TC1; + public static final Map TC2; + public static final Map TC3; + public static final Map TC4; + public static final Map TQ1; + public static final Map TQ2; + public static final Map TQ3; + public static final Map TQ4; + public static final Map TW1; + public static final Map TW2; + public static final Map TW3; + public static final Map TW4; + public static final Map UC1; + public static final Map UC2; + public static final Map UC3; + public static final Map UC4; + public static final Map UC5; + public static final Map UC6; + public static final Map 
UP1; + public static final Map UP2; + public static final Map UP3; + public static final Map UQ1; + public static final Map UQ2; + public static final Map UQ3; + public static final Map UW1; + public static final Map UW2; + public static final Map UW3; + public static final Map UW4; + public static final Map UW5; + public static final Map UW6; + static { + int i; + Map m; + m = new HashMap(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i(); + for (i=0; i + * TinySegmenter is freely distributable under the terms of a new BSD licence. 
+ * For details, see http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt + * + * Ported to Java by Kohei TAKETA + */ + +import static org.apache.lucene.analysis.ja.tinysegmenter.TinySegmenterConstants.*; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * TinySegmenterのJava移植版。 + * + * @author taketa + * + */ +public class TinySegmenter { + public static class CharInfo { + public int cp; + public String str; + public String ctype; + public long start; + public long end; + } + + public static class Token { + public String str; + public long start; + public long end; + + public boolean equals(Object obj) { + if (!(obj instanceof Token)) { + return false; + } + + Token another = (Token) obj; + return ((str == null ? another.str == null : str.equals(another.str)) + && (start == another.start) && (end == another.end)); + } + + public String toString() { + return ("(" + str + "," + Long.toString(start) + "," + Long.toString(end) + ")"); + } + } + + private static String getCharType(int cp) { + if (CHINESE_NUMBER_SET.contains(cp)) { + // [一二三四五六七八九十百千万億兆] + return "M"; + } else if ((0x4E00 <= cp && cp <= 0x9fa0) || cp == '々' || cp == '〆' + || cp == 'ヵ' || cp == 'ヶ') { + // [一-龠々〆ヵヶ] + return "H"; + } else if (0x3041 <= cp && cp <= 0x3093) { + // [ぁ-ん] + return "I"; + } else if ((0x30a1 <= cp && cp <= 0x30f4) || cp == 0x30fc + || (0xff71 <= cp && cp <= 0xff9e) || cp == 0xff70) { + // [ァ-ヴーア-ン゙ー] + return "K"; + } else if (('a' <= cp && cp <= 'z') || ('A' <= cp && cp <= 'Z') + || ('a' <= cp && cp <= 'z') || ('A' <= cp && cp <= 'Z')) { + // [a-zA-Za-zA-Z] + return "A"; + } else if (('0' <= cp && cp <= '9') || ('0' <= cp && cp <= '9')) { + // [0-90-9] + return "N"; + } else { + return "O"; + } + } + + private static List getCodePoints(String str) { + int count = str.codePointCount(0, str.length()); + List result = new 
ArrayList(count); + int charIndex = 0; + while (charIndex < str.length()) { + int cp = str.codePointAt(charIndex); + result.add(cp); + charIndex += Character.charCount(cp); + } + + return result; + } + + public static final int DEFAULT_BUFFER_SIZE = 4096; + public static final int DEFAULT_MAX_TOKEN_SIZE = 255; + + @SuppressWarnings("serial") + private static final Set CHINESE_NUMBER_SET = Collections + .unmodifiableSet(new HashSet() { + { + addAll(getCodePoints("一二三四五六七八九十百千万億兆")); + } + }); + + private CodePointReader reader; + private boolean eos = false; + private int eosCount = 0; + + private CharInfo[] buffer; + private int end = 0; + private int position = 0; + private boolean done = false; + private String p1 = "U"; + private String p2 = "U"; + private String p3 = "U"; + + private CharInfo[] tokenBuffer; + private int tokenPosition = 0; + + public TinySegmenter(CodePointReader reader) { + this(reader, DEFAULT_BUFFER_SIZE, DEFAULT_MAX_TOKEN_SIZE); + } + + /** + * コンストラクタ + * + * @param reader CodePointReaderオブジェクト。 + * @param bufferSize バッファのサイズ。7以上でなければならない。 + * @param maxTokenSize トークンの最大サイズ。2以上でなければならない。 + */ + public TinySegmenter(CodePointReader reader, int bufferSize, int maxTokenSize) { + assert (bufferSize > 6); + assert (maxTokenSize > 1); + + this.reader = reader; + this.buffer = new CharInfo[bufferSize]; + this.tokenBuffer = new CharInfo[maxTokenSize]; + } + + public CharInfo readChar() throws IOException { + CharInfo c = new CharInfo(); + + if (eos) { + c.start = -1; + c.end = -1; + c.cp = -1; + } else { + c.start = reader.getPosition(); + c.cp = reader.read(); + c.end = reader.getPosition(); + } + + if (c.cp < 0) { + eos = true; + switch (eosCount) { + case 0: + c.str = "E1"; + c.ctype = "O"; + ++eosCount; + break; + case 1: + c.str = "E2"; + c.ctype = "O"; + ++eosCount; + break; + case 2: + c.str = "E3"; + c.ctype = "O"; + ++eosCount; + break; + default: + return null; + } + } else { + c.str = new String(Character.toChars(c.cp)); + c.ctype = 
getCharType(c.cp); + } + + return c; + } + + private void initBuffer() { + CharInfo c; + + { + c = new CharInfo(); + c.cp = 0; + c.str = "B3"; + c.ctype = "O"; + c.start = -1; + c.end = -1; + buffer[0] = c; + + c = new CharInfo(); + c.cp = 0; + c.str = "B2"; + c.ctype = "O"; + c.start = -1; + c.end = -1; + buffer[1] = c; + + c = new CharInfo(); + c.cp = 0; + c.str = "B1"; + c.ctype = "O"; + c.start = -1; + c.end = -1; + buffer[2] = c; + } + end = 3; + position = 3; + } + + private int fillBuffer() throws IOException { + CharInfo c; + + // 末尾の6アイテムを、先頭にコピーする。 + int src = end - 6; + int dst = 0; + if (src < 0) { + src = 0; + } + while (src < end) { + buffer[dst++] = buffer[src++]; + } + + // end, positionをリセット + int start = dst; + end = dst; + position = 3; + + // 残りの部分にデータを読み込む + while (end < buffer.length) { + c = readChar(); + if (c != null) { + buffer[end++] = c; + } else { + break; + } + } + + return (end - start); + } + + public Token next() throws IOException { + if (done) { + return null; + } + + if (end <= 0) { + // 初回呼び出し + initBuffer(); + if (fillBuffer() > 3) { + // 最初の文字をtokenBufferに格納 + tokenBuffer[tokenPosition++] = buffer[position++]; + } else { + // 空のストリーム + done = true; + return null; + } + } + + Token token = null; + do { + while (position < end - 3) { + if (isBoundary()) { + // トークン境界と判定 + if (tokenPosition > 0) { + token = makeToken(); + tokenBuffer[tokenPosition++] = buffer[position++]; + break; + } + } else { + // トークン境界ではない + tokenBuffer[tokenPosition++] = buffer[position++]; + if (tokenPosition >= tokenBuffer.length) { + // バッファがいっぱいになったので、一旦トークンとして切り出す + token = makeToken(); + break; + } + } + } + } while (token == null && fillBuffer() > 0); + + if (token == null) { + // 最後のトークンを切り出す + done = true; + token = makeToken(); + } + + return token; + } + + private Token makeToken() { + Token token = new Token(); + StringBuilder builder = new StringBuilder(); + + token.start = tokenBuffer[0].start; + for (int i = 0; i < tokenPosition; ++i) { + 
builder.append(tokenBuffer[i].str); + token.end = tokenBuffer[i].end; + } + + token.str = builder.toString(); + + tokenPosition = 0; + + return token; + } + + private boolean isBoundary() { + int score = BIAS; + + CharInfo c1 = buffer[position - 3]; + CharInfo c2 = buffer[position - 2]; + CharInfo c3 = buffer[position - 1]; + CharInfo c4 = buffer[position]; + CharInfo c5 = buffer[position + 1]; + CharInfo c6 = buffer[position + 2]; + + score += getScore(UP1, p1); + score += getScore(UP2, p2); + score += getScore(UP3, p3); + score += getScore(BP1, p1 + p2); + score += getScore(BP2, p2 + p3); + score += getScore(UW1, c1.str); + score += getScore(UW2, c2.str); + score += getScore(UW3, c3.str); + score += getScore(UW4, c4.str); + score += getScore(UW5, c5.str); + score += getScore(UW6, c6.str); + score += getScore(BW1, c2.str + c3.str); + score += getScore(BW2, c3.str + c4.str); + score += getScore(BW3, c4.str + c5.str); + score += getScore(TW1, c1.str + c2.str + c3.str); + score += getScore(TW2, c2.str + c3.str + c4.str); + score += getScore(TW3, c3.str + c4.str + c5.str); + score += getScore(TW4, c4.str + c5.str + c6.str); + score += getScore(UC1, c1.ctype); + score += getScore(UC2, c2.ctype); + score += getScore(UC3, c3.ctype); + score += getScore(UC4, c4.ctype); + score += getScore(UC5, c5.ctype); + score += getScore(UC6, c6.ctype); + score += getScore(BC1, c2.ctype + c3.ctype); + score += getScore(BC2, c3.ctype + c4.ctype); + score += getScore(BC3, c4.ctype + c5.ctype); + score += getScore(TC1, c1.ctype + c2.ctype + c3.ctype); + score += getScore(TC2, c2.ctype + c3.ctype + c4.ctype); + score += getScore(TC3, c3.ctype + c4.ctype + c5.ctype); + score += getScore(TC4, c4.ctype + c5.ctype + c6.ctype); + score += getScore(UQ1, p1 + c1.ctype); + score += getScore(UQ2, p2 + c2.ctype); + score += getScore(UQ3, p3 + c3.ctype); + // score += getScore(UQ1, p3 + c3.ctype); // オリジナルコード。UQ3の誤り? 
+ score += getScore(BQ1, p2 + c2.ctype + c3.ctype); + score += getScore(BQ2, p2 + c3.ctype + c4.ctype); + score += getScore(BQ3, p3 + c2.ctype + c3.ctype); + score += getScore(BQ4, p3 + c3.ctype + c4.ctype); + score += getScore(TQ1, p2 + c1.ctype + c2.ctype + c3.ctype); + score += getScore(TQ2, p2 + c2.ctype + c3.ctype + c4.ctype); + score += getScore(TQ3, p3 + c1.ctype + c2.ctype + c3.ctype); + score += getScore(TQ4, p3 + c2.ctype + c3.ctype + c4.ctype); + + boolean result = false; + String p = "O"; + if (score > 0) { + p = "B"; + result = true; + } + p1 = p2; + p2 = p3; + p3 = p; + + return result; + } + + private int getScore(Map m, String key) { + Integer s = m.get(key); + return (s != null ? s : 0); + } +} Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\ja\tinysegmenter\TinySegmenter.java ___________________________________________________________________ Added: svn:eol-style + native