Index: solr/src/java/org/apache/solr/analysis/CJKWidthFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/CJKWidthFilterFactory.java (revision 0) +++ solr/src/java/org/apache/solr/analysis/CJKWidthFilterFactory.java (revision 0) @@ -0,0 +1,37 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cjk.CJKWidthFilter; + +/** + * Factory for {@link CJKWidthFilter}. + *
+ * <pre>&lt;fieldType name="text_jawidth" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.JapaneseTinyTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.CJKWidthFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */ +public class CJKWidthFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new CJKWidthFilter(input); + } +} Property changes on: solr\src\java\org\apache\solr\analysis\CJKWidthFilterFactory.java ___________________________________________________________________ Added: svn:eol-style + native Index: solr/src/java/org/apache/solr/analysis/JapaneseKatakanaStemFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/JapaneseKatakanaStemFilterFactory.java (revision 0) +++ solr/src/java/org/apache/solr/analysis/JapaneseKatakanaStemFilterFactory.java (revision 0) @@ -0,0 +1,38 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter; + +/** + * Factory for {@link JapaneseKatakanaStemFilter}. + *
+ * <pre>&lt;fieldType name="text_katakanastem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.JapaneseTinyTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.CJKWidthFilterFactory"/&gt;
+ *     &lt;filter class="solr.JapaneseKatakanaStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */ +public class JapaneseKatakanaStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new JapaneseKatakanaStemFilter(input); + } +} Property changes on: solr\src\java\org\apache\solr\analysis\JapaneseKatakanaStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style + native Index: solr/src/java/org/apache/solr/analysis/JapaneseTinyTokenizerFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/JapaneseTinyTokenizerFactory.java (revision 0) +++ solr/src/java/org/apache/solr/analysis/JapaneseTinyTokenizerFactory.java (revision 0) @@ -0,0 +1,38 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ja.JapaneseTinyTokenizer; +import java.io.Reader; + +/** + * Factory for {@link JapaneseTinyTokenizer}. + *
+ * <pre>&lt;fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.JapaneseTinyTokenizerFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */ +public class JapaneseTinyTokenizerFactory extends BaseTokenizerFactory { + public Tokenizer create(Reader in) { + return new JapaneseTinyTokenizer(in); + } +} + Property changes on: solr\src\java\org\apache\solr\analysis\JapaneseTinyTokenizerFactory.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java (revision 0) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java (revision 0) @@ -0,0 +1,47 @@ +package org.apache.lucene.analysis.ja; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Simple tests for {@link JapaneseKatakanaStemFilter} + */ +public class TestJapaneseKatakanaStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String field, Reader reader) { + Tokenizer tokenizer = new JapaneseTinyTokenizer(reader); + TokenStream stream = new JapaneseKatakanaStemFilter(tokenizer); + return new TokenStreamComponents(tokenizer, stream); + } + }; + + public void testBasics() throws IOException { + assertAnalyzesTo(analyzer, "スパゲッティー", + new String[] { "スパゲッティ" } + ); + } +} Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\ja\TestJapaneseKatakanaStemFilter.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/test/org/apache/lucene/analysis/ja/TestJapaneseTinyTokenizer.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/ja/TestJapaneseTinyTokenizer.java (revision 0) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/ja/TestJapaneseTinyTokenizer.java (revision 0) @@ -0,0 +1,87 @@ +package org.apache.lucene.analysis.ja; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +/** + * Simple tests for {@link JapaneseTinyTokenizer} + */ +public class TestJapaneseTinyTokenizer extends BaseTokenStreamTestCase { + + public void testBasics() throws Exception { + String str = "本日は晴天なり。 "; + StringReader reader = new StringReader(str); + JapaneseTinyTokenizer tokenizer = new JapaneseTinyTokenizer(reader); + assertTokenStreamContents(tokenizer, + new String[] {"本日", "は", "晴天", "なり"}, + new int[] {0, 2, 3, 5}, + new int[] {2, 3, 5, 7} + ); + } + + public void testEmptyStream() throws Exception { + String str = ""; + StringReader reader = new StringReader(str); + JapaneseTinyTokenizer tokenizer = new JapaneseTinyTokenizer(reader); + assertFalse(tokenizer.incrementToken()); + } + + public void testEnglishText() throws Exception { + String str = "This is a test. 
1234 Test"; + StringReader reader = new StringReader(str); + JapaneseTinyTokenizer tokenizer = new JapaneseTinyTokenizer(reader); + assertTokenStreamContents(tokenizer, + new String[] { "This", "is", "a", "test", "1234", "Test" }); + } + + /** blast some random strings through the analyzer */ + public void testRandomStrings() throws Exception { + Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new JapaneseTinyTokenizer(reader); + return new TokenStreamComponents(tokenizer, tokenizer); + } + }; + + checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER); + } + + public static void main(String args[]) throws Exception { + String str = "わずか25kバイトのソースコードで、日本語の新聞記事であれば文字単位で95%程度の精度で分かち書きが行えます。"; + JapaneseTinyTokenizer tokenizer = new JapaneseTinyTokenizer(new StringReader("")); + tokenizer.incrementToken(); + CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class); + long ms1 = System.currentTimeMillis(); + for (int i = 0; i < 100000; i++) { + tokenizer.reset(new StringReader(str)); + while (tokenizer.incrementToken()) { + } + } + long ms2 = System.currentTimeMillis(); + System.out.println(ms2 - ms1); + } +} Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\ja\TestJapaneseTinyTokenizer.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilter.java (revision 0) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilter.java (revision 0) @@ -0,0 +1,67 @@ +package org.apache.lucene.analysis.cjk; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +/** + * Tests for {@link CJKWidthFilter} + */ +public class TestCJKWidthFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(source, new CJKWidthFilter(source)); + } + }; + + /** + * Full-width ASCII forms normalized to half-width (basic latin) + */ + public void testFullWidthASCII() throws IOException { + assertAnalyzesTo(analyzer, "Test 1234", + new String[] { "Test", "1234" }); + } + + /** + * Half-width katakana forms normalized to standard katakana. + * A bit trickier in some cases, since half-width forms are decomposed + * and voice marks need to be recombined with a preceding base form. 
+ */ + public void testHalfWidthKana() throws IOException { + assertAnalyzesTo(analyzer, "カタカナ", + new String[] { "カタカナ" }); + assertAnalyzesTo(analyzer, "ヴィッツ", + new String[] { "ヴィッツ" }); + assertAnalyzesTo(analyzer, "パナソニック", + new String[] { "パナソニック" }); + } + + /** blast some random strings through the analyzer */ + public void testRandomStrings() throws Exception { + checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER); + } +} Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\cjk\TestCJKWidthFilter.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (revision 0) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (revision 0) @@ -0,0 +1,225 @@ +package org.apache.lucene.analysis.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; +import java.text.BreakIterator; +import java.util.Arrays; +import java.util.Locale; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; + +/** Basic tests for {@link SegmentingTokenizerBase} */ +public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase { + private Analyzer sentence = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new WholeSentenceTokenizer(reader); + return new TokenStreamComponents(tokenizer, tokenizer); + } + }; + + private Analyzer sentenceAndWord = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new SentenceAndWordTokenizer(reader); + return new TokenStreamComponents(tokenizer, tokenizer); + } + }; + + /** Some simple examples, just outputting the whole sentence boundaries as "terms" */ + public void testBasics() throws IOException { + assertAnalyzesTo(sentence, "The acronym for United States is U.S. but this doesn't end a sentence", + new String[] { "The acronym for United States is U.S. but this doesn't end a sentence"} + ); + assertAnalyzesTo(sentence, "He said, \"Are you going?\" John shook his head.", + new String[] { "He said, \"Are you going?\" ", + "John shook his head." 
} + ); + } + + /** Test a subclass that sets some custom attribute values */ + public void testCustomAttributes() throws IOException { + assertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\" John shook his head.", + new String[] { "He", "said", "Are", "you", "going", "John", "shook", "his", "head" }, + new int[] { 0, 3, 10, 14, 18, 26, 31, 37, 41 }, + new int[] { 2, 7, 13, 17, 23, 30, 36, 40, 45 }, + new int[] { 1, 1, 1, 1, 1, 2, 1, 1, 1 } + ); + } + + + /** Tests tokenstream reuse */ + public void testReuse() throws IOException { + assertAnalyzesToReuse(sentenceAndWord, "He said, \"Are you going?\"", + new String[] { "He", "said", "Are", "you", "going" }, + new int[] { 0, 3, 10, 14, 18 }, + new int[] { 2, 7, 13, 17, 23 }, + new int[] { 1, 1, 1, 1, 1,} + ); + assertAnalyzesToReuse(sentenceAndWord, "John shook his head.", + new String[] { "John", "shook", "his", "head" }, + new int[] { 0, 5, 11, 15 }, + new int[] { 4, 10, 14, 19 }, + new int[] { 1, 1, 1, 1 } + ); + } + + /** Tests TokenStream.end() */ + public void testEnd() throws IOException { + // BaseTokenStreamTestCase asserts that end() is set to our StringReader's length for us here. + // we add some junk whitespace to the end just to test it. + assertAnalyzesTo(sentenceAndWord, "John shook his head ", + new String[] { "John", "shook", "his", "head" } + ); + assertAnalyzesTo(sentenceAndWord, "John shook his head. 
", + new String[] { "John", "shook", "his", "head" } + ); + } + + /** Tests terms which span across boundaries */ + public void testHugeDoc() throws IOException { + StringBuilder sb = new StringBuilder(); + char whitespace[] = new char[4094]; + Arrays.fill(whitespace, '\n'); + sb.append(whitespace); + sb.append("testing 1234"); + String input = sb.toString(); + assertAnalyzesTo(sentenceAndWord, input, new String[] { "testing", "1234" }); + } + + /** Tests the handling of binary/malformed data */ + public void testHugeTerm() throws IOException { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 40960; i++) { + sb.append('a'); + } + String input = sb.toString(); + char token[] = new char[4096]; + Arrays.fill(token, 'a'); + String expectedToken = new String(token); + String expected[] = { + expectedToken, expectedToken, expectedToken, + expectedToken, expectedToken, expectedToken, + expectedToken, expectedToken, expectedToken, + expectedToken + }; + assertAnalyzesTo(sentence, input, expected); + } + + /** blast some random strings through the analyzer */ + public void testRandomStrings() throws Exception { + checkRandomData(random, sentence, 10000*RANDOM_MULTIPLIER); + checkRandomData(random, sentenceAndWord, 10000*RANDOM_MULTIPLIER); + } + + // some tokenizers for testing + + /** silly tokenizer that just returns whole sentences as tokens */ + static class WholeSentenceTokenizer extends SegmentingTokenizerBase { + int sentenceStart, sentenceEnd; + boolean hasSentence; + + private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + public WholeSentenceTokenizer(Reader input) { + super(input, BreakIterator.getSentenceInstance(new Locale(""))); + } + + @Override + protected void setNextSentence(int sentenceStart, int sentenceEnd) { + this.sentenceStart = sentenceStart; + this.sentenceEnd = sentenceEnd; + hasSentence = true; + } + + @Override + protected boolean 
incrementWord() { + if (hasSentence) { + hasSentence = false; + clearAttributes(); + termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart); + offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd); + return true; + } else { + return false; + } + } + } + + /** + * simple tokenizer, that bumps posinc + 1 for tokens after a + * sentence boundary to inhibit phrase queries without slop. + */ + static class SentenceAndWordTokenizer extends SegmentingTokenizerBase { + int sentenceStart, sentenceEnd; + int wordStart, wordEnd; + int posBoost = -1; // initially set to -1 so the first word in the document doesn't get a pos boost + + private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + + public SentenceAndWordTokenizer(Reader input) { + super(input, BreakIterator.getSentenceInstance(new Locale(""))); + } + + @Override + protected void setNextSentence(int sentenceStart, int sentenceEnd) { + this.wordStart = this.wordEnd = this.sentenceStart = sentenceStart; + this.sentenceEnd = sentenceEnd; + posBoost++; + } + + @Override + public void reset() throws IOException { + super.reset(); + posBoost = -1; + } + + @Override + protected boolean incrementWord() { + wordStart = wordEnd; + while (wordStart < sentenceEnd) { + if (Character.isLetterOrDigit(buffer[wordStart])) + break; + wordStart++; + } + + if (wordStart == sentenceEnd) return false; + + wordEnd = wordStart+1; + while (wordEnd < sentenceEnd && Character.isLetterOrDigit(buffer[wordEnd])) + wordEnd++; + + clearAttributes(); + termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart); + offsetAtt.setOffset(offset+wordStart, offset+wordEnd); + posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost); + posBoost = 0; + return true; + } + } +} Property changes on: 
modules\analysis\common\src\test\org\apache\lucene\analysis\util\TestSegmentingTokenizerBase.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java (revision 0) @@ -0,0 +1,78 @@ +package org.apache.lucene.analysis.ja; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // javadocs + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * Convert a katakana word to a normalized form by stemming a + * final KATAKANA-HIRAGANA PROLONGED SOUND MARK (U+30FC) at the end of the + * word. + *

+ * In general, most Japanese full-text search engines use more complicated + * methods which need dictionaries, which can be better than this filter in + * quality, but need a well-tuned dictionary. In contrast, this filter is + * simple and maintenance-free. + *

+ * Note: This filter does not support half-width katakana characters, so you + * should convert them with {@link CJKWidthFilter} first. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + */ +public final class JapaneseKatakanaStemFilter extends TokenFilter { + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); + + public JapaneseKatakanaStemFilter(TokenStream in) { + super(in); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAtt.isKeyword()) { + final char buffer[] = termAtt.buffer(); + final int length = termAtt.length(); + if (length > 3 && buffer[length-1] == '\u30FC' && isKatakanaString(buffer, length-1)) { + termAtt.setLength(length-1); + } + } + return true; + } else { + return false; + } + } + + private static boolean isKatakanaString(char s[], int length) { + for (int i = 0; i < length; i++) { + final char c = s[i]; + if (c < '\u3099' || c > '\u30FF' || c == '\u309F') // not katakana or (semi)-voiced sound marks + return false; + } + return true; + } +} Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\ja\JapaneseKatakanaStemFilter.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ja/JapaneseTinyTokenizer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/ja/JapaneseTinyTokenizer.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/ja/JapaneseTinyTokenizer.java (revision 0) @@ -0,0 +1,194 @@ +package org.apache.lucene.analysis.ja; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.lucene.analysis.ja.TinySegmenterConstants.*; + +import java.io.Reader; +import java.text.BreakIterator; +import java.util.Locale; + +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.util.SegmentingTokenizerBase; + +/** + * Tokenizer for Japanese text based on + * TinySegmenter. + *

+ * This tokenizer uses no dictionary for segmentation; instead, the algorithm is + * machine-learned. Text is segmented by sliding a six-character window across + * the sentence and combining the cost from features such as n-grams of characters + * and character categories. + *

+ * Some modifications from the original algorithm: + *

+ * Neither of these change the segmentation, for example, punctuation is still + * taken into context when computing the algorithm, just not produced as tokens. + */ +public final class JapaneseTinyTokenizer extends SegmentingTokenizerBase { + /** three context state variables (PU, PO or PB): what was decided at n-3, n-2 and n-1; reset per sentence and shifted by every isBoundary call */ + private int p1, p2, p3; + + /** sentence being analyzed, as positions in buffer: sentenceStart inclusive, sentenceEnd exclusive */ + private int sentenceStart, sentenceEnd; + + /** current word boundary info: the token under construction is buffer[start, end) */ + private int start, end; + + /** prototype Japanese sentence break iterator; each tokenizer instance hands a clone of it to the superclass */ + private static final BreakIterator proto = BreakIterator.getSentenceInstance(Locale.JAPAN); + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + /** Create a tokenizer working on the supplied reader, with its own clone of the sentence break iterator */ + public JapaneseTinyTokenizer(Reader input) { + super(input, (BreakIterator) proto.clone()); + } + + @Override + protected void setNextSentence(int sentenceStart, int sentenceEnd) { /* new sentence: forget the break context (PU) and put an empty token window at sentenceStart */ + p1 = p2 = p3 = PU; + this.sentenceStart = this.start = this.end = sentenceStart; + this.sentenceEnd = sentenceEnd; + } + + @Override + protected boolean incrementWord() { /* moves to the next token of the sentence; false when the sentence is used up or only a SKIP-status remainder is left */ + if (end >= sentenceEnd) + return false; + + start = end; + for (end++; end < sentenceEnd; end++) { + if (isBoundary(end)) { + switch(tokenStatus(start, end)) { + case SKIP: /* discard the text scanned so far; the next token starts here */ + start = end; + continue; + case CONTINUE: /* ignore this boundary and keep growing the token */ + continue; + case BREAK: /* emit buffer[start, end) */ + return outputToken(); + } + } + } + + // last token of the sentence: return it unless it's a skip token + return tokenStatus(start, end) == TokenStatus.SKIP ? 
false : outputToken(); + } + + private boolean outputToken() { /* publishes buffer[start, end) as the term, with offsets shifted by offset and run through correctOffset; always true */ + clearAttributes(); + termAtt.copyBuffer(buffer, start, end-start); + offsetAtt.setOffset(correctOffset(offset+start), correctOffset(offset+end)); + return true; + } + + private static enum TokenStatus { SKIP, CONTINUE, BREAK }; /* how incrementWord treats a model boundary: drop the text so far, ignore the boundary, or emit a token */ + + /** + * Decides what to do with a break found by the algorithm for buffer[start, end), judging by its first char: letter = BREAK, high surrogate = CONTINUE, + * digit = CONTINUE if buffer[end] is another digit inside the sentence else BREAK, anything else (e.g. punctuation) = SKIP. NOTE(review): only buffer[start] is examined, so a token that starts with a high surrogate never gets BREAK here -- confirm that is intended. + */ + private TokenStatus tokenStatus(int start, int end) { + final char ch = buffer[start]; + if (Character.isLetter(ch)) { + return TokenStatus.BREAK; + } else if (Character.isHighSurrogate(ch)) { // never break between surrogate pair + return TokenStatus.CONTINUE; + } else if (Character.isDigit(ch)) { // never break between runs of digits + return (end < sentenceEnd && Character.isDigit(buffer[end])) ? TokenStatus.CONTINUE : TokenStatus.BREAK; + } else { // skip punctuation-only tokens + return TokenStatus.SKIP; + } + } + + /** returns the char at pos, or a sentinel instead: B3, B2, B1 for the three positions before sentenceStart and E1, E2, E3 for sentenceEnd and the two positions after it */ + private int charAt(int pos) { + if (pos == sentenceStart-3) + return B3; + else if (pos == sentenceStart-2) + return B2; + else if (pos == sentenceStart-1) + return B1; + else if (pos == sentenceEnd) + return E1; + else if (pos == sentenceEnd+1) + return E2; + else if (pos == sentenceEnd+2) + return E3; + else + return buffer[pos]; + } + + /** true if there is a boundary at pos (between pos-1 and pos): adds BIAS to the feature costs of chars c1..c6 (pos-3..pos+2), their categories t1..t6 and the earlier decisions p1..p3, and breaks when the total is positive; side effect: shifts p1..p3 and stores this decision in p3 */ + private boolean isBoundary(int pos) { + final int c1 = charAt(pos-3); + final int t1 = charType(c1); + final int c2 = charAt(pos-2); + final int t2 = charType(c2); + final int c3 = charAt(pos-1); + final int t3 = charType(c3); + final int c4 = charAt(pos); + final int t4 = charType(c4); + final int c5 = charAt(pos+1); + final int t5 = charType(c5); + final int c6 = charAt(pos+2); + final int t6 = charType(c6); + + final int score = BIAS + // unigram context + + up1(p1) + up2(p2) + up3(p3) + // bigram context + + bp1(p1, p2) + 
bp2(p2, p3) + // unigram char + + uw1(c1) + uw2(c2) + uw3(c3) + uw4(c4) + uw5(c5) + uw6(c6) + // bigram char + + bw1(c2, c3) + bw2(c3, c4) + bw3(c4, c5) + // trigram char + + tw1(c1, c2, c3) + tw2(c2, c3, c4) + tw3(c3, c4, c5) + tw4(c4, c5, c6) + // unigram category + + uc1(t1) + uc2(t2) + uc3(t3) + uc4(t4) + uc5(t5) + uc6(t6) + // bigram category + + bc1(t2, t3) + bc2(t3, t4) + bc3(t4, t5) + // trigram category + + tc1(t1, t2, t3) + tc2(t2, t3, t4) + tc3(t3, t4, t5) + tc4(t4, t5, t6) + // unigram context+category + + uq1(p1, t1) + uq2(p2, t2) + uq3(p3, t3) + // bigram context+category + + bq1(p2, t2, t3) + bq2(p2, t3, t4) + bq3(p3, t2, t3) + bq4(p3, t3, t4) + // trigram context+category + + tq1(p2, t1, t2, t3) + tq2(p2, t2, t3, t4) + tq3(p3, t1, t2, t3) + tq4(p3, t2, t3, t4); + + // shift contextual state variables back one position + p1 = p2; + p2 = p3; + + if (score > 0) { + p3 = PB; // contextual state for n-1 was a break + return true; + } else { + p3 = PO; // not a break + return false; + } + } +} Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\ja\JapaneseTinyTokenizer.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ja/TinySegmenterConstants.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/ja/TinySegmenterConstants.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/ja/TinySegmenterConstants.java (revision 0) @@ -0,0 +1,1745 @@ +package org.apache.lucene.analysis.ja; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Constants and macros for {@link JapaneseTinyTokenizer}: contextual states, character categories, sentence-edge markers and per-feature cost lookups. + */ +class TinySegmenterConstants { + + /** contextual state: start of string */ + public static final int PU = 0; + /** contextual state: non-boundary */ + public static final int PO = 1; + /** contextual state: boundary */ + public static final int PB = 2; + + /** character category: other (category values stay below 8 so the cost lookups can pack them 3 bits apiece into a switch key) */ + public static final int O = 0; + /** character category: ideographic number */ + public static final int M = 1; + /** character category: ideographic */ + public static final int H = 2; + /** character category: hiragana */ + public static final int I = 3; + /** character category: katakana */ + public static final int K = 4; + /** character category: alphabetic */ + public static final int A = 5; + /** character category: numeric */ + public static final int N = 6; + + /** begin marker: sentence start-1 */ + public static final int B1 = 0xE000; + /** begin marker: sentence start-2 */ + public static final int B2 = 0xE001; + /** begin marker: sentence start-3; NOTE(review): 0xE002 is skipped between B2 and B3 -- confirm the cost tables expect 0xE003 here */ + public static final int B3 = 0xE003; + /** end marker: sentence end */ + public static final int E1 = 0xE004; + /** end marker: sentence end+1 */ + public static final int E2 = 0xE005; + /** end marker: sentence end+2 */ + public static final int E3 = 0xE006; + + /** returns the character category (O, M, H, I, K, A or N) for ch; individually listed code points are matched before the ranges, and anything unmatched, including the B1..E3 markers, is O */ + public static final int charType(int ch) { + switch(ch) { + 
case 0x4E00: + case 0x4E8C: + case 0x4E09: + case 0x56DB: + case 0x4E94: + case 0x516D: + case 0x4E03: + case 0x516B: + case 0x4E5D: + case 0x5341: + case 0x767E: + case 0x5343: + case 0x4E07: + case 0x5104: + case 0x5146: return M; + case 0x3005: + case 0x3006: + case 0x30F5: + case 0x30F6: return H; + case 0x30FC: + case 0xFF9E: + case 0xFF70: return K; + + default: + if (ch >= 0x4E00 && ch <= 0x9FA0) /* the M code points inside this range were already returned by the case labels above */ + return H; + else if (ch >= 0x3041 && ch <= 0x3093) + return I; + else if (ch >= 0x30A1 && ch <= 0x30F4) + return K; + else if (ch >= 0xFF71 && ch <= 0xFF9D) + return K; + else if (ch >= 0x0041 && ch <= 0x005A) /* ASCII A-Z */ + return A; + else if (ch >= 0x0061 && ch <= 0x007A) /* ASCII a-z */ + return A; + else if (ch >= 0xFF21 && ch <= 0xFF3A) /* fullwidth A-Z */ + return A; + else if (ch >= 0xFF41 && ch <= 0xFF5A) /* fullwidth a-z */ + return A; + else if (ch >= 0x0030 && ch <= 0x0039) /* ASCII 0-9 */ + return N; + else if (ch >= 0xFF10 && ch <= 0xFF19) /* fullwidth 0-9 */ + return N; + else + return O; + } + } + + /** bias: constant term of the score; a position is a break when the other costs sum to more than -BIAS */ + public static final int BIAS = -332; + + // for all cost functions below, n is the position of the potential break + + /** bigram category cost for (n-2, n-1); the two categories are packed 3 bits apiece into one switch key, and unlisted pairs cost 0 */ + public static final int bc1(int i0, int i1) { + switch(i0 <<3| i1) { + case O <<3| H: return -1378; + case I <<3| I: return 2461; + case H <<3| H: return 6; + case K <<3| H: return 406; + default: return 0; + } + } + + /** bigram category cost for (n-1, n) */ + public static final int bc2(int i0, int i1) { + switch(i0 <<3| i1) { + case A <<3| N: return -878; + case M <<3| K: return 3334; + case H <<3| H: return -4070; + case I <<3| A: return 1327; + case K <<3| I: return 3831; + case K <<3| K: return -8741; + case H <<3| M: return -1711; + case A <<3| A: return -3267; + case H <<3| N: return 4012; + case H <<3| O: return 3761; + case I <<3| H: return -1184; + case I <<3| I: return -1332; + case I <<3| K: return 1721; + case A <<3| I: return 2744; + case I <<3| O: return 5492; + case M <<3| H: return -3132; + case O <<3| O: 
return -2920; + default: return 0; + } + } + + /** bigram category cost for (n, n+1) */ + public static final int bc3(int i0, int i1) { + switch(i0 <<3| i1) { + case M <<3| K: return 1079; + case M <<3| M: return 4034; + case H <<3| H: return 996; + case H <<3| I: return 626; + case H <<3| K: return -721; + case O <<3| A: return -1652; + case K <<3| K: return 2762; + case H <<3| N: return -1307; + case H <<3| O: return -836; + case I <<3| H: return -301; + case O <<3| H: return 266; + default: return 0; + } + } + + /** bigram context cost for (n-3, n-2) */ + public static final int bp1(int i0, int i1) { + switch(i0 <<3| i1) { + case PB <<3| PB: return 295; + case PU <<3| PB: return 352; + case PO <<3| PB: return 304; + case PO <<3| PO: return -125; + default: return 0; + } + } + + /** bigram context cost for (n-2, n-1) */ + public static final int bp2(int i0, int i1) { + switch(i0 <<3| i1) { + case PB <<3| PO: return 60; + case PO <<3| PO: return -1762; + default: return 0; + } + } + + /** bigram category (n-2, n-1) with context (n-2) cost */ + public static final int bq1(int i0, int i1, int i2) { + switch(i0 <<6| i1 <<3| i2) { + case PB <<6| H <<3| M: return 1521; + case PO <<6| H <<3| I: return 451; + case PO <<6| K <<3| H: return -1020; + case PO <<6| K <<3| K: return 904; + case PB <<6| I <<3| I: return -1158; + case PB <<6| O <<3| H: return -91; + case PO <<6| I <<3| H: return -296; + case PB <<6| I <<3| M: return 886; + case PB <<6| O <<3| O: return -2597; + case PB <<6| M <<3| H: return 1208; + case PO <<6| K <<3| A: return 1851; + case PO <<6| O <<3| O: return 2965; + case PB <<6| H <<3| H: return 1150; + case PB <<6| N <<3| H: return 449; + default: return 0; + } + } + + /** bigram category (n-1, n) with context (n-2) cost */ + public static final int bq2(int i0, int i1, int i2) { + switch(i0 <<6| i1 <<3| i2) { + case PB <<6| K <<3| K: return -1720; + case PO <<6| H <<3| H: return -1139; + case PB <<6| H <<3| M: return 466; + case PB <<6| K <<3| O: return 
864; + case PB <<6| I <<3| H: return -919; + case PO <<6| H <<3| M: return -181; + case PO <<6| I <<3| H: return 153; + case PU <<6| H <<3| I: return -1146; + case PB <<6| H <<3| H: return 118; + case PB <<6| H <<3| I: return -1159; + default: return 0; + } + } + + /** bigram category (n-2, n-1) with context (n-1) cost */ + public static final int bq3(int i0, int i1, int i2) { + switch(i0 <<6| i1 <<3| i2) { + case PO <<6| H <<3| H: return 2174; + case PO <<6| K <<3| H: return 1798; + case PO <<6| K <<3| I: return -793; + case PB <<6| N <<3| N: return 998; + case PB <<6| I <<3| I: return -299; + case PO <<6| H <<3| M: return 439; + case PB <<6| O <<3| H: return 775; + case PO <<6| K <<3| O: return -2242; + case PO <<6| I <<3| I: return 280; + case PB <<6| M <<3| H: return 937; + case PO <<6| M <<3| H: return -2402; + case PO <<6| O <<3| O: return 11699; + case PB <<6| M <<3| M: return 8335; + case PB <<6| H <<3| H: return -792; + case PB <<6| H <<3| I: return 2664; + case PB <<6| K <<3| I: return 419; + default: return 0; + } + } + + /** bigram category (n-1, n) with context (n-1) cost */ + public static final int bq4(int i0, int i1, int i2) { + switch(i0 <<6| i1 <<3| i2) { + case PB <<6| K <<3| K: return -1806; + case PO <<6| H <<3| H: return 266; + case PO <<6| H <<3| K: return -2036; + case PB <<6| I <<3| H: return 3761; + case PB <<6| I <<3| I: return -4654; + case PB <<6| I <<3| K: return 1348; + case PO <<6| N <<3| N: return -973; + case PB <<6| O <<3| O: return -12396; + case PO <<6| A <<3| H: return 926; + case PB <<6| M <<3| I: return -3385; + case PB <<6| H <<3| H: return -3895; + default: return 0; + } + } + + /** bigram character cost for (n-2, n-1) */ + public static final int bw1(int i0, int i1) { + switch(i0 <<16| i1) { + case 0x5f15 <<16| 0x304d: return (i0 == 0x5f15 && i1 == 0x304d) ? -1336 : 0; + case 0x304b <<16| 0x3089: return (i0 == 0x304b && i1 == 0x3089) ? 3472 : 0; + case 0x3044 <<16| 0x3046: return (i0 == 0x3044 && i1 == 0x3046) ? 
1743 : 0; + case 0x3092 <<16| 0x898b: return (i0 == 0x3092 && i1 == 0x898b) ? 731 : 0; + case 0x5e73 <<16| 0x65b9: return (i0 == 0x5e73 && i1 == 0x65b9) ? -2314 : 0; + case 0xe000 <<16| 0x540c: return (i0 == 0xe000 && i1 == 0x540c) ? 542 : 0; + case 0x3066 <<16| 0x3044: return (i0 == 0x3066 && i1 == 0x3044) ? 805 : 0; + case 0x305f <<16| 0x3061: return (i0 == 0x305f && i1 == 0x3061) ? 1122 : 0; + case 0x5927 <<16| 0x962a: return (i0 == 0x5927 && i1 == 0x962a) ? 1497 : 0; + case 0x307e <<16| 0x305b: return (i0 == 0x307e && i1 == 0x305b) ? 2448 : 0; + case 0x53d6 <<16| 0x308a: return (i0 == 0x53d6 && i1 == 0x308a) ? -2784 : 0; + case 0x306b <<16| 0x306f: return (i0 == 0x306b && i1 == 0x306f) ? 1498 : 0; + case 0x3066 <<16| 0x304d: return (i0 == 0x3066 && i1 == 0x304d) ? 1249 : 0; + case 0x3059 <<16| 0x3067: return (i0 == 0x3059 && i1 == 0x3067) ? -3399 : 0; + case 0x6bce <<16| 0x65e5: return (i0 == 0x6bce && i1 == 0x65e5) ? -2113 : 0; + case 0x3069 <<16| 0x3053: return (i0 == 0x3069 && i1 == 0x3053) ? 3887 : 0; + case 0x306a <<16| 0x3093: return (i0 == 0x306a && i1 == 0x3093) ? -1113 : 0; + case 0x3055 <<16| 0x3089: return (i0 == 0x3055 && i1 == 0x3089) ? -4143 : 0; + case 0x3053 <<16| 0x3068: return (i0 == 0x3053 && i1 == 0x3068) ? 2083 : 0; + case 0x307e <<16| 0x3067: return (i0 == 0x307e && i1 == 0x3067) ? 1711 : 0; + case 0x306e <<16| 0x4e2d: return (i0 == 0x306e && i1 == 0x4e2d) ? 741 : 0; + case 0x305d <<16| 0x3053: return (i0 == 0x305d && i1 == 0x3053) ? 1977 : 0; + case 0x3044 <<16| 0x3063: return (i0 == 0x3044 && i1 == 0x3063) ? -2055 : 0; + case 0x304c <<16| 0x3089: return (i0 == 0x304c && i1 == 0x3089) ? 600 : 0; + case 0x3068 <<16| 0x307f: return (i0 == 0x3068 && i1 == 0x307f) ? 1922 : 0; + case 0x3055 <<16| 0x3093: return (i0 == 0x3055 && i1 == 0x3093) ? 4573 : 0; + case 0x306b <<16| 0x3082: return (i0 == 0x306b && i1 == 0x3082) ? 1671 : 0; + case 0x3063 <<16| 0x305f: return (i0 == 0x3063 && i1 == 0x305f) ? 
3463 : 0; + case 0x306a <<16| 0x3044: return (i0 == 0x306a && i1 == 0x3044) ? 5713 : 0; + case 0x300d <<16| 0x3068: return (i0 == 0x300d && i1 == 0x3068) ? 1682 : 0; + case 0x3064 <<16| 0x3044: return (i0 == 0x3064 && i1 == 0x3044) ? -802 : 0; + case 0x305f <<16| 0x3081: return (i0 == 0x305f && i1 == 0x3081) ? 601 : 0; + case 0x3057 <<16| 0x305f: return (i0 == 0x3057 && i1 == 0x305f) ? 2641 : 0; + case 0x3046 <<16| 0x3093: return (i0 == 0x3046 && i1 == 0x3093) ? 665 : 0; + case 0x672c <<16| 0x5f53: return (i0 == 0x672c && i1 == 0x5f53) ? -2423 : 0; + case 0x3067 <<16| 0x304d: return (i0 == 0x3067 && i1 == 0x304d) ? 1127 : 0; + case 0x3001 <<16| 0x3068: return (i0 == 0x3001 && i1 == 0x3068) ? 660 : 0; + case 0x3084 <<16| 0x3080: return (i0 == 0x3084 && i1 == 0x3080) ? -1947 : 0; + case 0x3088 <<16| 0x3063: return (i0 == 0x3088 && i1 == 0x3063) ? -2565 : 0; + case 0x307e <<16| 0x307e: return (i0 == 0x307e && i1 == 0x307e) ? 2600 : 0; + case 0x3057 <<16| 0x3066: return (i0 == 0x3057 && i1 == 0x3066) ? 1104 : 0; + case 0x3001 <<16| 0x540c: return (i0 == 0x3001 && i1 == 0x540c) ? 727 : 0; + case 0x306b <<16| 0x5bfe: return (i0 == 0x306b && i1 == 0x5bfe) ? -912 : 0; + case 0x4ea1 <<16| 0x304f: return (i0 == 0x4ea1 && i1 == 0x304f) ? -1886 : 0; + case 0xff63 <<16| 0x3068: return (i0 == 0xff63 && i1 == 0x3068) ? 1682 : 0; + case 0x3067 <<16| 0x3059: return (i0 == 0x3067 && i1 == 0x3059) ? 3445 : 0; + case 0x5927 <<16| 0x304d: return (i0 == 0x5927 && i1 == 0x304d) ? -2604 : 0; + case 0xe000 <<16| 0x3042: return (i0 == 0xe000 && i1 == 0x3042) ? 1404 : 0; + case 0x3092 <<16| 0x3057: return (i0 == 0x3092 && i1 == 0x3057) ? 1860 : 0; + case 0x3042 <<16| 0x3063: return (i0 == 0x3042 && i1 == 0x3063) ? 1505 : 0; + case 0x307e <<16| 0x308b: return (i0 == 0x307e && i1 == 0x308b) ? -2155 : 0; + case 0x4eac <<16| 0x90fd: return (i0 == 0x4eac && i1 == 0x90fd) ? 2558 : 0; + case 0x3053 <<16| 0x3093: return (i0 == 0x3053 && i1 == 0x3093) ? 
-1262 : 0; + case 0x306a <<16| 0x3063: return (i0 == 0x306a && i1 == 0x3063) ? 3015 : 0; + case 0x3068 <<16| 0x3044: return (i0 == 0x3068 && i1 == 0x3044) ? -4915 : 0; + case 0x3044 <<16| 0x308b: return (i0 == 0x3044 && i1 == 0x308b) ? 672 : 0; + case 0x002c <<16| 0x3068: return (i0 == 0x002c && i1 == 0x3068) ? 660 : 0; + case 0x308c <<16| 0x305f: return (i0 == 0x308c && i1 == 0x305f) ? 2369 : 0; + case 0x306a <<16| 0x3069: return (i0 == 0x306a && i1 == 0x3069) ? 7379 : 0; + case 0x002c <<16| 0x540c: return (i0 == 0x002c && i1 == 0x540c) ? 727 : 0; + case 0x306e <<16| 0x4e00: return (i0 == 0x306e && i1 == 0x4e00) ? -501 : 0; + case 0x76ee <<16| 0x6307: return (i0 == 0x76ee && i1 == 0x6307) ? -724 : 0; + case 0x3046 <<16| 0x3057: return (i0 == 0x3046 && i1 == 0x3057) ? -4817 : 0; + case 0x308c <<16| 0x3067: return (i0 == 0x308c && i1 == 0x3067) ? -913 : 0; + case 0x3067 <<16| 0x306f: return (i0 == 0x3067 && i1 == 0x306f) ? 844 : 0; + case 0x305d <<16| 0x308c: return (i0 == 0x305d && i1 == 0x308c) ? -871 : 0; + case 0x3053 <<16| 0x3046: return (i0 == 0x3053 && i1 == 0x3046) ? -790 : 0; + case 0x306b <<16| 0x3057: return (i0 == 0x306b && i1 == 0x3057) ? 2468 : 0; + case 0x65e5 <<16| 0x672c: return (i0 == 0x65e5 && i1 == 0x672c) ? -195 : 0; + default: return 0; + } + } + + /** bigram character cost for (n-1, n) */ + public static final int bw2(int i0, int i1) { + switch(i0 <<16| i1) { + case 0x2015 <<16| 0x2015: return (i0 == 0x2015 && i1 == 0x2015) ? -5730 : 0; + case 0x308c <<16| 0x3070: return (i0 == 0x308c && i1 == 0x3070) ? 4114 : 0; + case 0x3068 <<16| 0x3053: return (i0 == 0x3068 && i1 == 0x3053) ? -1746 : 0; + case 0x306b <<16| 0x5bfe: return (i0 == 0x306b && i1 == 0x5bfe) ? -14943 : 0; + case 0x0031 <<16| 0x0031: return (i0 == 0x0031 && i1 == 0x0031) ? -669 : 0; + case 0x3093 <<16| 0x3060: return (i0 == 0x3093 && i1 == 0x3060) ? 728 : 0; + case 0x306f <<16| 0x3044: return (i0 == 0x306f && i1 == 0x3044) ? 
1073 : 0; + case 0x304f <<16| 0x306a: return (i0 == 0x304f && i1 == 0x306a) ? -1597 : 0; + case 0x4e00 <<16| 0x90e8: return (i0 == 0x4e00 && i1 == 0x90e8) ? -1051 : 0; + case 0x59d4 <<16| 0x54e1: return (i0 == 0x59d4 && i1 == 0x54e1) ? -1250 : 0; + case 0x306e <<16| 0x3067: return (i0 == 0x306e && i1 == 0x3067) ? -7059 : 0; + case 0x3067 <<16| 0x3082: return (i0 == 0x3067 && i1 == 0x3082) ? -4203 : 0; + case 0x3044 <<16| 0x3046: return (i0 == 0x3044 && i1 == 0x3046) ? -1609 : 0; + case 0x306e <<16| 0x306b: return (i0 == 0x306e && i1 == 0x306b) ? -6041 : 0; + case 0x306f <<16| 0x304c: return (i0 == 0x306f && i1 == 0x304c) ? -1033 : 0; + case 0x3093 <<16| 0x306a: return (i0 == 0x3093 && i1 == 0x306a) ? -4115 : 0; + case 0x65b0 <<16| 0x805e: return (i0 == 0x65b0 && i1 == 0x805e) ? -4066 : 0; + case 0x3068 <<16| 0x3068: return (i0 == 0x3068 && i1 == 0x3068) ? -2279 : 0; + case 0x306e <<16| 0x306e: return (i0 == 0x306e && i1 == 0x306e) ? -6125 : 0; + case 0x4f1a <<16| 0x793e: return (i0 == 0x4f1a && i1 == 0x793e) ? -1116 : 0; + case 0x540c <<16| 0x515a: return (i0 == 0x540c && i1 == 0x515a) ? 970 : 0; + case 0x3068 <<16| 0x306e: return (i0 == 0x3068 && i1 == 0x306e) ? 720 : 0; + case 0x3082 <<16| 0x3044: return (i0 == 0x3082 && i1 == 0x3044) ? 2230 : 0; + case 0x3081 <<16| 0x3066: return (i0 == 0x3081 && i1 == 0x3066) ? -3153 : 0; + case 0x3057 <<16| 0x3044: return (i0 == 0x3057 && i1 == 0x3044) ? -1819 : 0; + case 0x306f <<16| 0x305a: return (i0 == 0x306f && i1 == 0x305a) ? -2532 : 0; + case 0x4e00 <<16| 0x65b9: return (i0 == 0x4e00 && i1 == 0x65b9) ? -1375 : 0; + case 0x3092 <<16| 0x901a: return (i0 == 0x3092 && i1 == 0x901a) ? -11877 : 0; + case 0x5c11 <<16| 0x306a: return (i0 == 0x5c11 && i1 == 0x306a) ? -1050 : 0; + case 0x3057 <<16| 0x304b: return (i0 == 0x3057 && i1 == 0x304b) ? -545 : 0; + case 0x4e0a <<16| 0x304c: return (i0 == 0x4e0a && i1 == 0x304c) ? -4479 : 0; + case 0x3055 <<16| 0x308c: return (i0 == 0x3055 && i1 == 0x308c) ? 
13168 : 0; + case 0x3068 <<16| 0x307f: return (i0 == 0x3068 && i1 == 0x307f) ? 5168 : 0; + case 0x2212 <<16| 0x2212: return (i0 == 0x2212 && i1 == 0x2212) ? -13175 : 0; + case 0x3068 <<16| 0x3082: return (i0 == 0x3068 && i1 == 0x3082) ? -3941 : 0; + case 0x306a <<16| 0x3044: return (i0 == 0x306a && i1 == 0x3044) ? -2488 : 0; + case 0x672c <<16| 0x4eba: return (i0 == 0x672c && i1 == 0x4eba) ? -2697 : 0; + case 0x3063 <<16| 0x305f: return (i0 == 0x3063 && i1 == 0x305f) ? 4589 : 0; + case 0x3055 <<16| 0x3093: return (i0 == 0x3055 && i1 == 0x3093) ? -3977 : 0; + case 0x306b <<16| 0x95a2: return (i0 == 0x306b && i1 == 0x95a2) ? -11388 : 0; + case 0x306a <<16| 0x304c: return (i0 == 0x306a && i1 == 0x304c) ? -1313 : 0; + case 0x3063 <<16| 0x3066: return (i0 == 0x3063 && i1 == 0x3066) ? 1647 : 0; + case 0x3063 <<16| 0x3068: return (i0 == 0x3063 && i1 == 0x3068) ? -2094 : 0; + case 0x624b <<16| 0x6a29: return (i0 == 0x624b && i1 == 0x6a29) ? -1982 : 0; + case 0x3057 <<16| 0x305f: return (i0 == 0x3057 && i1 == 0x305f) ? 5078 : 0; + case 0x304b <<16| 0x3057: return (i0 == 0x304b && i1 == 0x3057) ? -1350 : 0; + case 0x3089 <<16| 0x304b: return (i0 == 0x3089 && i1 == 0x304b) ? -944 : 0; + case 0x66dc <<16| 0x65e5: return (i0 == 0x66dc && i1 == 0x65e5) ? -601 : 0; + case 0x5e74 <<16| 0x5ea6: return (i0 == 0x5e74 && i1 == 0x5ea6) ? -8669 : 0; + case 0x3057 <<16| 0x3066: return (i0 == 0x3057 && i1 == 0x3066) ? 972 : 0; + case 0x305d <<16| 0x306e: return (i0 == 0x305d && i1 == 0x306e) ? -3744 : 0; + case 0x3057 <<16| 0x306a: return (i0 == 0x3057 && i1 == 0x306a) ? 939 : 0; + case 0x3082 <<16| 0x306e: return (i0 == 0x3082 && i1 == 0x306e) ? -10713 : 0; + case 0x4e00 <<16| 0x4eba: return (i0 == 0x4e00 && i1 == 0x4eba) ? 602 : 0; + case 0x6771 <<16| 0x4eac: return (i0 == 0x6771 && i1 == 0x4eac) ? -1543 : 0; + case 0x304c <<16| 0x3044: return (i0 == 0x304c && i1 == 0x3044) ? 853 : 0; + case 0x3089 <<16| 0x3057: return (i0 == 0x3089 && i1 == 0x3057) ? 
-1611 : 0; + case 0x7c73 <<16| 0x56fd: return (i0 == 0x7c73 && i1 == 0x56fd) ? -4268 : 0; + case 0x4e00 <<16| 0x65e5: return (i0 == 0x4e00 && i1 == 0x65e5) ? 970 : 0; + case 0x306a <<16| 0x3069: return (i0 == 0x306a && i1 == 0x3069) ? -6509 : 0; + case 0x306b <<16| 0x304a: return (i0 == 0x306b && i1 == 0x304a) ? -1615 : 0; + case 0x3046 <<16| 0x304b: return (i0 == 0x3046 && i1 == 0x304b) ? 2490 : 0; + case 0x65e5 <<16| 0x7c73: return (i0 == 0x65e5 && i1 == 0x7c73) ? 3372 : 0; + case 0x305f <<16| 0x3044: return (i0 == 0x305f && i1 == 0x3044) ? -1253 : 0; + case 0x306a <<16| 0x306e: return (i0 == 0x306a && i1 == 0x306e) ? 2614 : 0; + case 0x3089 <<16| 0x306b: return (i0 == 0x3089 && i1 == 0x306b) ? -1897 : 0; + case 0x5927 <<16| 0x962a: return (i0 == 0x5927 && i1 == 0x962a) ? -2471 : 0; + case 0x306b <<16| 0x3057: return (i0 == 0x306b && i1 == 0x3057) ? 2748 : 0; + case 0x5e9c <<16| 0x770c: return (i0 == 0x5e9c && i1 == 0x770c) ? -2363 : 0; + case 0x304b <<16| 0x3082: return (i0 == 0x304b && i1 == 0x3082) ? -602 : 0; + case 0x308a <<16| 0x3057: return (i0 == 0x308a && i1 == 0x3057) ? 651 : 0; + case 0x793e <<16| 0x4f1a: return (i0 == 0x793e && i1 == 0x4f1a) ? -1276 : 0; + case 0x304b <<16| 0x3089: return (i0 == 0x304b && i1 == 0x3089) ? -7194 : 0; + case 0x307e <<16| 0x3057: return (i0 == 0x307e && i1 == 0x3057) ? -1316 : 0; + case 0x304b <<16| 0x308c: return (i0 == 0x304b && i1 == 0x308c) ? 4612 : 0; + case 0x3070 <<16| 0x308c: return (i0 == 0x3070 && i1 == 0x308c) ? 1813 : 0; + case 0x3066 <<16| 0x3044: return (i0 == 0x3066 && i1 == 0x3044) ? 6144 : 0; + case 0x305f <<16| 0x305f: return (i0 == 0x305f && i1 == 0x305f) ? -662 : 0; + case 0x306b <<16| 0x306a: return (i0 == 0x306b && i1 == 0x306a) ? 2454 : 0; + case 0x305f <<16| 0x3060: return (i0 == 0x305f && i1 == 0x3060) ? -3857 : 0; + case 0x305f <<16| 0x3061: return (i0 == 0x305f && i1 == 0x3061) ? -786 : 0; + case 0x7b2c <<16| 0x306b: return (i0 == 0x7b2c && i1 == 0x306b) ? 
-1612 : 0; + case 0x308f <<16| 0x308c: return (i0 == 0x308f && i1 == 0x308c) ? 7901 : 0; + case 0x3066 <<16| 0x304d: return (i0 == 0x3066 && i1 == 0x304d) ? 3640 : 0; + case 0x305f <<16| 0x3068: return (i0 == 0x305f && i1 == 0x3068) ? 1224 : 0; + case 0x3066 <<16| 0x304f: return (i0 == 0x3066 && i1 == 0x304f) ? 2551 : 0; + case 0x306a <<16| 0x3093: return (i0 == 0x306a && i1 == 0x3093) ? 3099 : 0; + case 0x540c <<16| 0x65e5: return (i0 == 0x540c && i1 == 0x65e5) ? -913 : 0; + case 0x002e <<16| 0x002e: return (i0 == 0x002e && i1 == 0x002e) ? -11822 : 0; + case 0x307e <<16| 0x3067: return (i0 == 0x307e && i1 == 0x3067) ? -6621 : 0; + case 0x304d <<16| 0x305f: return (i0 == 0x304d && i1 == 0x305f) ? 1941 : 0; + case 0x305f <<16| 0x306f: return (i0 == 0x305f && i1 == 0x306f) ? -939 : 0; + case 0x3053 <<16| 0x3068: return (i0 == 0x3053 && i1 == 0x3068) ? -8392 : 0; + case 0x7136 <<16| 0x3068: return (i0 == 0x7136 && i1 == 0x3068) ? -1384 : 0; + case 0x3053 <<16| 0x306e: return (i0 == 0x3053 && i1 == 0x306e) ? -4193 : 0; + case 0x304c <<16| 0x3089: return (i0 == 0x304c && i1 == 0x3089) ? -3198 : 0; + case 0x308a <<16| 0x307e: return (i0 == 0x308a && i1 == 0x307e) ? 1620 : 0; + case 0x3067 <<16| 0x3044: return (i0 == 0x3067 && i1 == 0x3044) ? 2666 : 0; + case 0x306b <<16| 0x3088: return (i0 == 0x306b && i1 == 0x3088) ? -7236 : 0; + case 0xff11 <<16| 0xff11: return (i0 == 0xff11 && i1 == 0xff11) ? -669 : 0; + case 0x3067 <<16| 0x304d: return (i0 == 0x3067 && i1 == 0x304d) ? -1528 : 0; + case 0x306b <<16| 0x5f93: return (i0 == 0x306b && i1 == 0x5f93) ? -4688 : 0; + case 0x3066 <<16| 0x306f: return (i0 == 0x3066 && i1 == 0x306f) ? -3110 : 0; + case 0x7acb <<16| 0x3066: return (i0 == 0x7acb && i1 == 0x3066) ? -990 : 0; + case 0x3067 <<16| 0x3057: return (i0 == 0x3067 && i1 == 0x3057) ? -3828 : 0; + case 0x3067 <<16| 0x3059: return (i0 == 0x3067 && i1 == 0x3059) ? -4761 : 0; + case 0x307e <<16| 0x308c: return (i0 == 0x307e && i1 == 0x308c) ? 
5409 : 0; + case 0x308c <<16| 0x305f: return (i0 == 0x308c && i1 == 0x305f) ? 4270 : 0; + case 0x3066 <<16| 0x3082: return (i0 == 0x3066 && i1 == 0x3082) ? -3065 : 0; + case 0x3068 <<16| 0x3044: return (i0 == 0x3068 && i1 == 0x3044) ? 1890 : 0; + case 0x5206 <<16| 0x306e: return (i0 == 0x5206 && i1 == 0x306e) ? -7758 : 0; + case 0x306e <<16| 0x304b: return (i0 == 0x306e && i1 == 0x304b) ? 2093 : 0; + case 0x308d <<16| 0x3046: return (i0 == 0x308d && i1 == 0x3046) ? 6067 : 0; + case 0x51fa <<16| 0x3066: return (i0 == 0x51fa && i1 == 0x3066) ? 2163 : 0; + case 0x65e5 <<16| 0x672c: return (i0 == 0x65e5 && i1 == 0x672c) ? -7068 : 0; + case 0x308c <<16| 0x3066: return (i0 == 0x308c && i1 == 0x3066) ? 849 : 0; + case 0x5e74 <<16| 0x9593: return (i0 == 0x5e74 && i1 == 0x9593) ? -1626 : 0; + case 0x65e5 <<16| 0x65b0: return (i0 == 0x65e5 && i1 == 0x65b0) ? -722 : 0; + case 0x671d <<16| 0x9bae: return (i0 == 0x671d && i1 == 0x9bae) ? -2355 : 0; + case 0x3055 <<16| 0x305b: return (i0 == 0x3055 && i1 == 0x305b) ? 4533 : 0; + default: return 0; + } + } + + /** bigram character cost for (n, n+1) */ + public static final int bw3(int i0, int i1) { + switch(i0 <<16| i1) { + case 0x3067 <<16| 0x306b: return (i0 == 0x3067 && i1 == 0x306b) ? -1482 : 0; + case 0xff82 <<16| 0x5e02: return (i0 == 0xff82 && i1 == 0x5e02) ? 965 : 0; + case 0x308b <<16| 0x308b: return (i0 == 0x308b && i1 == 0x308b) ? 3818 : 0; + case 0x3067 <<16| 0x306f: return (i0 == 0x3067 && i1 == 0x306f) ? 2295 : 0; + case 0x308c <<16| 0x3070: return (i0 == 0x308c && i1 == 0x3070) ? -3246 : 0; + case 0x65e5 <<16| 0x3001: return (i0 == 0x65e5 && i1 == 0x3001) ? 974 : 0; + case 0x305f <<16| 0x002e: return (i0 == 0x305f && i1 == 0x002e) ? 8875 : 0; + case 0x3068 <<16| 0x3057: return (i0 == 0x3068 && i1 == 0x3057) ? 2266 : 0; + case 0x304c <<16| 0x3001: return (i0 == 0x304c && i1 == 0x3001) ? 1816 : 0; + case 0x3059 <<16| 0x002e: return (i0 == 0x3059 && i1 == 0x002e) ? 
-1310 : 0; + case 0x3093 <<16| 0x3060: return (i0 == 0x3093 && i1 == 0x3060) ? 606 : 0; + case 0x306b <<16| 0x3001: return (i0 == 0x306b && i1 == 0x3001) ? -1021 : 0; + case 0x3044 <<16| 0x3044: return (i0 == 0x3044 && i1 == 0x3044) ? 5308 : 0; + case 0x3093 <<16| 0x3067: return (i0 == 0x3093 && i1 == 0x3067) ? 798 : 0; + case 0x3069 <<16| 0x3046: return (i0 == 0x3069 && i1 == 0x3046) ? 4664 : 0; + case 0x3044 <<16| 0x3048: return (i0 == 0x3044 && i1 == 0x3048) ? 2079 : 0; + case 0x65b0 <<16| 0x805e: return (i0 == 0x65b0 && i1 == 0x805e) ? -5055 : 0; + case 0x305f <<16| 0x3002: return (i0 == 0x305f && i1 == 0x3002) ? 8875 : 0; + case 0x3042 <<16| 0x308a: return (i0 == 0x3042 && i1 == 0x308a) ? 719 : 0; + case 0x3042 <<16| 0x308b: return (i0 == 0x3042 && i1 == 0x308b) ? 3846 : 0; + case 0x3044 <<16| 0x304f: return (i0 == 0x3044 && i1 == 0x304f) ? 3029 : 0; + case 0x308c <<16| 0x308b: return (i0 == 0x308c && i1 == 0x308b) ? 1091 : 0; + case 0x3068 <<16| 0x306e: return (i0 == 0x3068 && i1 == 0x306e) ? 541 : 0; + case 0x3059 <<16| 0x3002: return (i0 == 0x3059 && i1 == 0x3002) ? -1310 : 0; + case 0x305d <<16| 0x3046: return (i0 == 0x305d && i1 == 0x3046) ? 428 : 0; + case 0x3057 <<16| 0x3044: return (i0 == 0x3057 && i1 == 0x3044) ? -3714 : 0; + case 0x3060 <<16| 0x002e: return (i0 == 0x3060 && i1 == 0x002e) ? 4098 : 0; + case 0x305a <<16| 0x002c: return (i0 == 0x305a && i1 == 0x002c) ? 3426 : 0; + case 0x30ab <<16| 0x6708: return (i0 == 0x30ab && i1 == 0x6708) ? 990 : 0; + case 0x3044 <<16| 0x305f: return (i0 == 0x3044 && i1 == 0x305f) ? 2056 : 0; + case 0x3044 <<16| 0x3063: return (i0 == 0x3044 && i1 == 0x3063) ? 1883 : 0; + case 0x5927 <<16| 0x4f1a: return (i0 == 0x5927 && i1 == 0x4f1a) ? 2217 : 0; + case 0x3068 <<16| 0x3082: return (i0 == 0x3068 && i1 == 0x3082) ? -3543 : 0; + case 0x3055 <<16| 0x3092: return (i0 == 0x3055 && i1 == 0x3092) ? 976 : 0; + case 0x306a <<16| 0x3044: return (i0 == 0x306a && i1 == 0x3044) ? 
1796 : 0; + case 0x3063 <<16| 0x305f: return (i0 == 0x3063 && i1 == 0x305f) ? -4748 : 0; + case 0x3060 <<16| 0x3002: return (i0 == 0x3060 && i1 == 0x3002) ? 4098 : 0; + case 0x304b <<16| 0x3051: return (i0 == 0x304b && i1 == 0x3051) ? -743 : 0; + case 0x3063 <<16| 0x3066: return (i0 == 0x3063 && i1 == 0x3066) ? 300 : 0; + case 0x305a <<16| 0x3001: return (i0 == 0x305a && i1 == 0x3001) ? 3426 : 0; + case 0x3057 <<16| 0x305f: return (i0 == 0x3057 && i1 == 0x305f) ? 3562 : 0; + case 0x306a <<16| 0x304f: return (i0 == 0x306a && i1 == 0x304f) ? -903 : 0; + case 0x308c <<16| 0x002c: return (i0 == 0x308c && i1 == 0x002c) ? 854 : 0; + case 0x3057 <<16| 0x3066: return (i0 == 0x3057 && i1 == 0x3066) ? 1449 : 0; + case 0x3057 <<16| 0x306a: return (i0 == 0x3057 && i1 == 0x306a) ? 2608 : 0; + case 0x304b <<16| 0x3063: return (i0 == 0x304b && i1 == 0x3063) ? -4098 : 0; + case 0x3089 <<16| 0x3057: return (i0 == 0x3089 && i1 == 0x3057) ? 1479 : 0; + case 0x3051 <<16| 0x3069: return (i0 == 0x3051 && i1 == 0x3069) ? 1374 : 0; + case 0xff76 <<16| 0x6708: return (i0 == 0xff76 && i1 == 0x6708) ? 990 : 0; + case 0x308c <<16| 0x3001: return (i0 == 0x308c && i1 == 0x3001) ? 854 : 0; + case 0x304b <<16| 0x306b: return (i0 == 0x304b && i1 == 0x306b) ? -669 : 0; + case 0x304c <<16| 0x304d: return (i0 == 0x304c && i1 == 0x304d) ? -4855 : 0; + case 0x306e <<16| 0x002c: return (i0 == 0x306e && i1 == 0x002c) ? -724 : 0; + case 0x306a <<16| 0x3069: return (i0 == 0x306a && i1 == 0x3069) ? 2135 : 0; + case 0x304c <<16| 0x3051: return (i0 == 0x304c && i1 == 0x3051) ? -1127 : 0; + case 0x3044 <<16| 0x308b: return (i0 == 0x3044 && i1 == 0x308b) ? 5600 : 0; + case 0x305f <<16| 0x3044: return (i0 == 0x305f && i1 == 0x3044) ? -594 : 0; + case 0x3057 <<16| 0x307e: return (i0 == 0x3057 && i1 == 0x307e) ? 1200 : 0; + case 0x3044 <<16| 0x308f: return (i0 == 0x3044 && i1 == 0x308f) ? 1527 : 0; + case 0x4f1a <<16| 0x8b70: return (i0 == 0x4f1a && i1 == 0x8b70) ? 
860 : 0; + case 0x306b <<16| 0x3057: return (i0 == 0x306b && i1 == 0x3057) ? 1771 : 0; + case 0x304c <<16| 0x3063: return (i0 == 0x304c && i1 == 0x3063) ? -913 : 0; + case 0x306e <<16| 0x3001: return (i0 == 0x306e && i1 == 0x3001) ? -724 : 0; + case 0x3046 <<16| 0x3061: return (i0 == 0x3046 && i1 == 0x3061) ? 1117 : 0; + case 0x793e <<16| 0x4f1a: return (i0 == 0x793e && i1 == 0x4f1a) ? 2024 : 0; + case 0x304b <<16| 0x3089: return (i0 == 0x304b && i1 == 0x3089) ? 6520 : 0; + case 0x304b <<16| 0x308a: return (i0 == 0x304b && i1 == 0x308a) ? -2670 : 0; + case 0x3046 <<16| 0x3068: return (i0 == 0x3046 && i1 == 0x3068) ? 4798 : 0; + case 0x306e <<16| 0x5b50: return (i0 == 0x306e && i1 == 0x5b50) ? -1000 : 0; + case 0x307e <<16| 0x3057: return (i0 == 0x307e && i1 == 0x3057) ? 1113 : 0; + case 0x3066 <<16| 0x3044: return (i0 == 0x3066 && i1 == 0x3044) ? 6240 : 0; + case 0x306f <<16| 0x002c: return (i0 == 0x306f && i1 == 0x002c) ? 1337 : 0; + case 0x307e <<16| 0x3059: return (i0 == 0x307e && i1 == 0x3059) ? 6943 : 0; + case 0x306b <<16| 0x306a: return (i0 == 0x306b && i1 == 0x306a) ? 1906 : 0; + case 0x3044 <<16| 0x002e: return (i0 == 0x3044 && i1 == 0x002e) ? -1185 : 0; + case 0x3066 <<16| 0x304a: return (i0 == 0x3066 && i1 == 0x304a) ? 855 : 0; + case 0x308f <<16| 0x308c: return (i0 == 0x308f && i1 == 0x308c) ? -605 : 0; + case 0x306b <<16| 0x306f: return (i0 == 0x306b && i1 == 0x306f) ? 2644 : 0; + case 0x307e <<16| 0x3063: return (i0 == 0x307e && i1 == 0x3063) ? -1549 : 0; + case 0x3089 <<16| 0x308c: return (i0 == 0x3089 && i1 == 0x308c) ? 6820 : 0; + case 0x307e <<16| 0x3067: return (i0 == 0x307e && i1 == 0x3067) ? 6154 : 0; + case 0x305f <<16| 0x306e: return (i0 == 0x305f && i1 == 0x306e) ? 812 : 0; + case 0x304d <<16| 0x305f: return (i0 == 0x304d && i1 == 0x305f) ? 1645 : 0; + case 0x3057 <<16| 0x002c: return (i0 == 0x3057 && i1 == 0x002c) ? 1557 : 0; + case 0x3053 <<16| 0x3068: return (i0 == 0x3053 && i1 == 0x3068) ? 
7397 : 0; + case 0x306f <<16| 0x3001: return (i0 == 0x306f && i1 == 0x3001) ? 1337 : 0; + case 0x3079 <<16| 0x304d: return (i0 == 0x3079 && i1 == 0x304d) ? 2181 : 0; + case 0x3053 <<16| 0x306e: return (i0 == 0x3053 && i1 == 0x306e) ? 1542 : 0; + case 0x3044 <<16| 0x3002: return (i0 == 0x3044 && i1 == 0x3002) ? -1185 : 0; + case 0x304c <<16| 0x3089: return (i0 == 0x304c && i1 == 0x3089) ? -4977 : 0; + case 0x304c <<16| 0x308a: return (i0 == 0x304c && i1 == 0x308a) ? -2064 : 0; + case 0x304b <<16| 0x002e: return (i0 == 0x304b && i1 == 0x002e) ? 2857 : 0; + case 0x3060 <<16| 0x3063: return (i0 == 0x3060 && i1 == 0x3063) ? 1004 : 0; + case 0x3057 <<16| 0x3001: return (i0 == 0x3057 && i1 == 0x3001) ? 1557 : 0; + case 0x305f <<16| 0x308a: return (i0 == 0x305f && i1 == 0x308a) ? -1183 : 0; + case 0x305f <<16| 0x308b: return (i0 == 0x305f && i1 == 0x308b) ? -853 : 0; + case 0x3055 <<16| 0x3044: return (i0 == 0x3055 && i1 == 0x3044) ? -714 : 0; + case 0x59cb <<16| 0x3081: return (i0 == 0x59cb && i1 == 0x3081) ? 1681 : 0; + case 0x305a <<16| 0x306b: return (i0 == 0x305a && i1 == 0x306b) ? 841 : 0; + case 0x3059 <<16| 0x308b: return (i0 == 0x3059 && i1 == 0x308b) ? 6521 : 0; + case 0x3067 <<16| 0x3059: return (i0 == 0x3067 && i1 == 0x3059) ? 1437 : 0; + case 0x304b <<16| 0x3002: return (i0 == 0x304b && i1 == 0x3002) ? 2857 : 0; + case 0x307e <<16| 0x308c: return (i0 == 0x307e && i1 == 0x308c) ? -793 : 0; + case 0x65e5 <<16| 0x002c: return (i0 == 0x65e5 && i1 == 0x002c) ? 974 : 0; + case 0x3053 <<16| 0x308d: return (i0 == 0x3053 && i1 == 0x308d) ? -2757 : 0; + case 0x3042 <<16| 0x305f: return (i0 == 0x3042 && i1 == 0x305f) ? -2194 : 0; + case 0x308c <<16| 0x305f: return (i0 == 0x308c && i1 == 0x305f) ? 1850 : 0; + case 0x3048 <<16| 0x3068: return (i0 == 0x3048 && i1 == 0x3068) ? 1454 : 0; + case 0x304c <<16| 0x002c: return (i0 == 0x304c && i1 == 0x002c) ? 1816 : 0; + case 0x3066 <<16| 0x3082: return (i0 == 0x3066 && i1 == 0x3082) ? 
302 : 0; + case 0x3068 <<16| 0x3046: return (i0 == 0x3068 && i1 == 0x3046) ? -1387 : 0; + case 0x308c <<16| 0x3066: return (i0 == 0x308c && i1 == 0x3066) ? 1375 : 0; + case 0x5165 <<16| 0x308a: return (i0 == 0x5165 && i1 == 0x308a) ? 1232 : 0; + case 0x306b <<16| 0x002c: return (i0 == 0x306b && i1 == 0x002c) ? -1021 : 0; + default: return 0; + } + } + + /** trigram category cost for (n-3, n-2, n-1) */ + public static final int tc1(int i0, int i1, int i2) { + switch(i0 <<6| i1 <<3| i2) { + case H <<6| O <<3| M: return -331; + case M <<6| M <<3| H: return 187; + case A <<6| A <<3| A: return 1093; + case I <<6| H <<3| I: return 1169; + case O <<6| O <<3| I: return -1832; + case H <<6| H <<3| H: return 1029; + case I <<6| O <<3| H: return -142; + case H <<6| H <<3| M: return 580; + case I <<6| O <<3| I: return -1015; + case H <<6| I <<3| I: return 998; + case H <<6| O <<3| H: return -390; + case I <<6| O <<3| M: return 467; + default: return 0; + } + } + + /** trigram category cost for (n-2, n-1, n) */ + public static final int tc2(int i0, int i1, int i2) { + switch(i0 <<6| i1 <<3| i2) { + case I <<6| H <<3| I: return -1965; + case O <<6| I <<3| I: return -2649; + case H <<6| M <<3| M: return -1154; + case K <<6| K <<3| H: return 703; + case H <<6| H <<3| O: return 2088; + case H <<6| I <<3| I: return -1023; + default: return 0; + } + } + + /** trigram category cost for (n-1, n, n+1) */ + public static final int tc3(int i0, int i1, int i2) { + switch(i0 <<6| i1 <<3| i2) { + case H <<6| H <<3| H: return 346; + case H <<6| H <<3| I: return -341; + case K <<6| O <<3| K: return -1009; + case I <<6| O <<3| I: return -542; + case I <<6| I <<3| H: return -825; + case A <<6| A <<3| A: return -294; + case K <<6| K <<3| A: return 491; + case I <<6| I <<3| M: return -1035; + case M <<6| H <<3| H: return -2694; + case O <<6| H <<3| O: return -3393; + case K <<6| K <<3| H: return -1217; + case K <<6| H <<3| H: return -1216; + case M <<6| H <<3| M: return -457; + case M <<6| H <<3| 
O: return 123; + case I <<6| H <<3| H: return 128; + case I <<6| H <<3| I: return -3041; + case M <<6| M <<3| H: return -471; + case I <<6| H <<3| O: return -1935; + case H <<6| O <<3| H: return -1486; + case N <<6| N <<3| H: return -1689; + case H <<6| I <<3| I: return -1088; + case H <<6| I <<3| K: return 731; + case N <<6| N <<3| O: return 662; + default: return 0; + } + } + + /** trigram category cost for (n, n+1, n+2) */ + public static final int tc4(int i0, int i1, int i2) { + switch(i0 <<6| i1 <<3| i2) { + case M <<6| O <<3| M: return 841; + case H <<6| H <<3| H: return -203; + case H <<6| H <<3| I: return 1344; + case H <<6| H <<3| K: return 365; + case H <<6| H <<3| M: return -122; + case I <<6| I <<3| H: return 321; + case H <<6| H <<3| N: return 182; + case I <<6| I <<3| I: return 1497; + case H <<6| H <<3| O: return 669; + case K <<6| K <<3| A: return 3386; + case I <<6| O <<3| O: return 54; + case M <<6| H <<3| H: return -405; + case I <<6| I <<3| O: return 656; + case M <<6| H <<3| I: return 201; + case K <<6| K <<3| K: return 3065; + case I <<6| H <<3| H: return 695; + case M <<6| M <<3| H: return -241; + case I <<6| H <<3| O: return -2324; + case H <<6| O <<3| H: return 446; + case M <<6| M <<3| M: return 661; + case H <<6| I <<3| H: return 804; + case H <<6| I <<3| I: return 679; + case K <<6| A <<3| K: return 4845; + default: return 0; + } + } + + /** trigram category (n-3, n-2, n-1) with context (n-2) cost */ + public static final int tq1(int i0, int i1, int i2, int i3) { + switch(i0 <<9| i1 <<6| i2 <<3| i3) { + case PB <<9| H <<6| I <<3| H: return -132; + case PO <<9| H <<6| H <<3| H: return 281; + case PB <<9| O <<6| H <<3| H: return 225; + case PO <<9| I <<6| I <<3| H: return -68; + case PB <<9| N <<6| H <<3| H: return -744; + case PO <<9| H <<6| I <<3| H: return 249; + case PB <<9| I <<6| H <<3| H: return 60; + case PB <<9| H <<6| H <<3| H: return -227; + case PB <<9| H <<6| H <<3| I: return 316; + case PO <<9| A <<6| K <<3| K: return 482; + 
case PB <<9| I <<6| I <<3| I: return 1595; + case PB <<9| O <<6| O <<3| O: return -908; + case PO <<9| I <<6| H <<3| I: return 200; + default: return 0; + } + } + + /** trigram category (n-2, n-1, n) with context (n-2) cost */ + public static final int tq2(int i0, int i1, int i2, int i3) { + switch(i0 <<9| i1 <<6| i2 <<3| i3) { + case PB <<9| I <<6| H <<3| H: return -1401; + case PB <<9| K <<6| A <<3| K: return -543; + case PB <<9| O <<6| O <<3| O: return -5591; + case PB <<9| I <<6| I <<3| I: return -1033; + default: return 0; + } + } + + /** trigram category (n-3, n-2, n-1) with context (n-1) cost */ + public static final int tq3(int i0, int i1, int i2, int i3) { + switch(i0 <<9| i1 <<6| i2 <<3| i3) { + case PB <<9| H <<6| I <<3| H: return 222; + case PB <<9| H <<6| I <<3| I: return -504; + case PO <<9| H <<6| I <<3| I: return 997; + case PO <<9| K <<6| A <<3| K: return 2792; + case PO <<9| O <<6| I <<3| I: return -685; + case PB <<9| H <<6| H <<3| H: return 478; + case PO <<9| H <<6| H <<3| H: return 346; + case PO <<9| H <<6| H <<3| I: return 1729; + case PB <<9| H <<6| H <<3| M: return -1073; + case PB <<9| I <<6| I <<3| H: return -116; + case PB <<9| I <<6| I <<3| I: return -105; + case PO <<9| I <<6| I <<3| H: return 1344; + case PO <<9| O <<6| H <<3| H: return 110; + case PO <<9| K <<6| K <<3| A: return 679; + case PB <<9| M <<6| H <<3| I: return -863; + case PB <<9| M <<6| H <<3| M: return -464; + case PO <<9| H <<6| M <<3| H: return 481; + case PO <<9| K <<6| H <<3| H: return 587; + case PB <<9| O <<6| M <<3| H: return 620; + case PO <<9| I <<6| H <<3| H: return 623; + default: return 0; + } + } + + /** trigram category (n-2, n-1, n) with context (n-1) cost */ + public static final int tq4(int i0, int i1, int i2, int i3) { + switch(i0 <<9| i1 <<6| i2 <<3| i3) { + case PB <<9| H <<6| I <<3| I: return -966; + case PO <<9| H <<6| H <<3| H: return -294; + case PO <<9| H <<6| H <<3| I: return 2446; + case PO <<9| K <<6| A <<3| K: return -8156; + case PO <<9| I 
<<6| I <<3| H: return 626; + case PO <<9| I <<6| I <<3| I: return -4007; + case PO <<9| H <<6| H <<3| O: return 480; + case PO <<9| H <<6| I <<3| H: return -1573; + case PB <<9| H <<6| H <<3| H: return -721; + case PO <<9| A <<6| K <<3| K: return 180; + case PB <<9| I <<6| I <<3| H: return -607; + case PB <<9| I <<6| I <<3| I: return -2181; + case PO <<9| A <<6| A <<3| A: return -2763; + case PO <<9| I <<6| H <<3| H: return 1935; + case PB <<9| H <<6| H <<3| M: return -3604; + case PO <<9| I <<6| H <<3| I: return -493; + default: return 0; + } + } + + /** trigram character cost for (n-3, n-2, n-1) */ + public static final int tw1(int i0, int i1, int i2) { + switch(i0 ^ i1 ^ i2) { + case 0x6771 ^ 0x4eac ^ 0x90fd: return (i0 == 0x6771 && i1 == 0x4eac && i2 == 0x90fd) ? 2026 : 0; + case 0x306b ^ 0x3064 ^ 0x3044: return (i0 == 0x306b && i1 == 0x3064 && i2 == 0x3044) ? -4681 : 0; + default: return 0; + } + } + + /** trigram character cost for (n-2, n-1, n) */ + public static final int tw2(int i0, int i1, int i2) { + switch(i0 ^ i1 ^ i2) { + case 0x3060 ^ 0x3063 ^ 0x3066: return (i0 == 0x3060 && i1 == 0x3063 && i2 == 0x3066) ? -1049 : 0; + case 0x3057 ^ 0x3087 ^ 0x3046: return (i0 == 0x3057 && i1 == 0x3087 && i2 == 0x3046) ? 3873 : 0; + case 0x3068 ^ 0x3057 ^ 0x3066: return (i0 == 0x3068 && i1 == 0x3057 && i2 == 0x3066) ? -4657 : 0; + case 0x3042 ^ 0x308b ^ 0x7a0b: return (i0 == 0x3042 && i1 == 0x308b && i2 == 0x7a0b) ? -2049 : 0; + case 0x5927 ^ 0x304d ^ 0x306a: return (i0 == 0x5927 && i1 == 0x304d && i2 == 0x306a) ? -1255 : 0; + case 0x305d ^ 0x306e ^ 0x5f8c: return (i0 == 0x305d && i1 == 0x306e && i2 == 0x5f8c) ? -4430 : 0; + case 0x3068 ^ 0x3082 ^ 0x306b: return (i0 == 0x3068 && i1 == 0x3082 && i2 == 0x306b) ? -4517 : 0; + case 0x3053 ^ 0x308d ^ 0x304c: return (i0 == 0x3053 && i1 == 0x308d && i2 == 0x304c) ? -2434 : 0; + case 0x5bfe ^ 0x3057 ^ 0x3066: return (i0 == 0x5bfe && i1 == 0x3057 && i2 == 0x3066) ? 
-2721 : 0; + case 0x3082 ^ 0x306e ^ 0x3067: return (i0 == 0x3082 && i1 == 0x306e && i2 == 0x3067) ? 1882 : 0; + case 0x793e ^ 0x4f1a ^ 0x515a: return (i0 == 0x793e && i1 == 0x4f1a && i2 == 0x515a) ? -3216 : 0; + case 0x3066 ^ 0x3044 ^ 0x305f: return (i0 == 0x3066 && i1 == 0x3044 && i2 == 0x305f) ? 1833 : 0; + case 0x4e00 ^ 0x6c17 ^ 0x306b: return (i0 == 0x4e00 && i1 == 0x6c17 && i2 == 0x306b) ? -792 : 0; + case 0x3044 ^ 0x3063 ^ 0x305f: return (i0 == 0x3044 && i1 == 0x3063 && i2 == 0x305f) ? -1256 : 0; + case 0x521d ^ 0x3081 ^ 0x3066: return (i0 == 0x521d && i1 == 0x3081 && i2 == 0x3066) ? -1512 : 0; + case 0x540c ^ 0x6642 ^ 0x306b: return (i0 == 0x540c && i1 == 0x6642 && i2 == 0x306b) ? -8097 : 0; + default: return 0; + } + } + + /** trigram character cost for (n-1, n, n+1) */ + public static final int tw3(int i0, int i1, int i2) { + switch(i0 ^ i1 ^ i2) { + case 0x306e ^ 0x3067 ^ 0x3001: return (i0 == 0x306e && i1 == 0x3067 && i2 == 0x3001) ? -727 : 0; + case 0x3068 ^ 0x3057 ^ 0x3066: return (i0 == 0x3068 && i1 == 0x3057 && i2 == 0x3066) ? -4314 : 0; + case 0x306e ^ 0x3082 ^ 0x306e: return (i0 == 0x306e && i1 == 0x3082 && i2 == 0x306e) ? -600 : 0; + case 0x306b ^ 0x3068 ^ 0x3063: return (i0 == 0x306b && i1 == 0x3068 && i2 == 0x3063) ? -5989 : 0; + case 0x3044 ^ 0x305f ^ 0x3060: return (i0 == 0x3044 && i1 == 0x305f && i2 == 0x3060) ? -1734 : 0; + case 0x306b ^ 0x3064 ^ 0x3044: return (i0 == 0x306b && i1 == 0x3064 && i2 == 0x3044) ? -5483 : 0; + case 0x3057 ^ 0x3066 ^ 0x3044: return (i0 == 0x3057 && i1 == 0x3066 && i2 == 0x3044) ? 1314 : 0; + case 0x306e ^ 0x3067 ^ 0x002c: return (i0 == 0x306e && i1 == 0x3067 && i2 == 0x002c) ? -727 : 0; + case 0x5341 ^ 0x4e8c ^ 0x6708: return (i0 == 0x5341 && i1 == 0x4e8c && i2 == 0x6708) ? -2287 : 0; + case 0x308c ^ 0x304b ^ 0x3089: return (i0 == 0x308c && i1 == 0x304b && i2 == 0x3089) ? -3752 : 0; + case 0x306b ^ 0x5f53 ^ 0x305f: return (i0 == 0x306b && i1 == 0x5f53 && i2 == 0x305f) ? 
-6247 : 0; + default: return 0; + } + } + + /** trigram character cost for (n, n+1, n+2) */ + public static final int tw4(int i0, int i1, int i2) { + switch(i0 ^ i1 ^ i2) { + case 0x304b ^ 0x3089 ^ 0x306a: return (i0 == 0x304b && i1 == 0x3089 && i2 == 0x306a) ? -2348 : 0; + case 0x307e ^ 0x3057 ^ 0x305f: return (i0 == 0x307e && i1 == 0x3057 && i2 == 0x305f) ? 5543 : 0; + case 0x3068 ^ 0x3044 ^ 0x3046: return (i0 == 0x3068 && i1 == 0x3044 && i2 == 0x3046) ? 1349 : 0; + case 0x3044 ^ 0x3046 ^ 0x002e: return (i0 == 0x3044 && i1 == 0x3046 && i2 == 0x002e) ? 8576 : 0; + case 0x3088 ^ 0x3046 ^ 0x3068: return (i0 == 0x3088 && i1 == 0x3046 && i2 == 0x3068) ? -4258 : 0; + case 0x3088 ^ 0x308b ^ 0x3068: return (i0 == 0x3088 && i1 == 0x308b && i2 == 0x3068) ? 5865 : 0; + case 0x305f ^ 0x304c ^ 0x002c: return (i0 == 0x305f && i1 == 0x304c && i2 == 0x002c) ? 1516 : 0; + case 0x3066 ^ 0x3044 ^ 0x308b: return (i0 == 0x3066 && i1 == 0x3044 && i2 == 0x308b) ? 1538 : 0; + case 0x3057 ^ 0x3066 ^ 0x3044: return (i0 == 0x3057 && i1 == 0x3066 && i2 == 0x3044) ? 2958 : 0; + case 0x3044 ^ 0x3046 ^ 0x3002: return (i0 == 0x3044 && i1 == 0x3046 && i2 == 0x3002) ? 8576 : 0; + case 0x307e ^ 0x305b ^ 0x3093: return (i0 == 0x307e && i1 == 0x305b && i2 == 0x3093) ? 1097 : 0; + case 0x305f ^ 0x304c ^ 0x3001: return (i0 == 0x305f && i1 == 0x304c && i2 == 0x3001) ? 
1516 : 0; + default: return 0; + } + } + + /** unigram category cost for n-3 */ + public static final int uc1(int i0) { + switch(i0) { + case M: return 645; + case O: return -505; + case K: return 93; + case A: return 484; + default: return 0; + } + } + + /** unigram category cost for n-2 */ + public static final int uc2(int i0) { + switch(i0) { + case M: return 3987; + case N: return 5775; + case O: return 646; + case H: return 1059; + case I: return 409; + case A: return 819; + default: return 0; + } + } + + /** unigram category cost for n-1 */ + public static final int uc3(int i0) { + switch(i0) { + case A: return -1370; + case I: return 2311; + default: return 0; + } + } + + /** unigram category cost for n */ + public static final int uc4(int i0) { + switch(i0) { + case M: return 3565; + case N: return 3876; + case O: return 6646; + case H: return 1809; + case I: return -1032; + case K: return -3450; + case A: return -2643; + default: return 0; + } + } + + /** unigram category cost for n+1 */ + public static final int uc5(int i0) { + switch(i0) { + case M: return 539; + case O: return -831; + case H: return 313; + case I: return -1238; + case K: return -799; + default: return 0; + } + } + + /** unigram category cost for n+2 */ + public static final int uc6(int i0) { + switch(i0) { + case M: return 247; + case O: return -387; + case H: return -506; + case I: return -253; + case K: return 87; + default: return 0; + } + } + + /** unigram context cost for n-3 */ + public static final int up1(int i0) { + switch(i0) { + case PO: return -214; + default: return 0; + } + } + + /** unigram context cost for n-2 */ + public static final int up2(int i0) { + switch(i0) { + case PB: return 69; + case PO: return 935; + default: return 0; + } + } + + /** unigram context cost for n-1 */ + public static final int up3(int i0) { + switch(i0) { + case PB: return 189; + default: return 0; + } + } + + /** unigram category (n-3) with context (n-3) cost */ + public static final int 
uq1(int i0, int i1) { + switch(i0 <<3| i1) { + case PB <<3| H: return 21; + case PB <<3| I: return -12; + case PB <<3| K: return -99; + case PB <<3| N: return 142; + case PB <<3| O: return -56; + case PO <<3| H: return -95; + case PO <<3| I: return 477; + case PO <<3| K: return 410; + case PO <<3| O: return -2422; + default: return 0; + } + } + + /** unigram category (n-2) with context (n-2) cost */ + public static final int uq2(int i0, int i1) { + switch(i0 <<3| i1) { + case PB <<3| H: return 216; + case PB <<3| I: return 113; + case PO <<3| K: return 1759; + default: return 0; + } + } + + /** unigram category (n-1) with context (n-1) cost */ + public static final int uq3(int i0, int i1) { + switch(i0 <<3| i1) { + case PB <<3| H: return 42; + case PB <<3| I: return 1913; + case PB <<3| K: return -7198; + case PB <<3| M: return 3160; + case PB <<3| N: return 6427; + case PB <<3| O: return 14761; + case PO <<3| I: return -827; + case PB <<3| A: return -479; + case PO <<3| N: return -3212; + default: return 0; + } + } + + /** unigram character cost for n-3 */ + public static final int uw1(int i0) { + switch(i0) { + case 0x4eac: return -268; + case 0x3042: return -941; + case 0x59d4: return 729; + case 0x3046: return -127; + case 0x304c: return -553; + case 0x304d: return 121; + case 0xff62: return -463; + case 0x3053: return 505; + case 0xff65: return -135; + case 0x5927: return 561; + case 0x533a: return -912; + case 0x5e02: return -411; + case 0x3001: return 156; + case 0x56fd: return -460; + case 0x5348: return 871; + case 0x3067: return -201; + case 0x3068: return -547; + case 0x3069: return -123; + case 0x002c: return 156; + case 0x306b: return -789; + case 0x300c: return -463; + case 0x306e: return -185; + case 0x306f: return -847; + case 0x65e5: return -141; + case 0x751f: return -408; + case 0x7406: return 361; + case 0x90fd: return -718; + case 0x3082: return -466; + case 0x3084: return -470; + case 0x3088: return 182; + case 0x3089: return -292; + case 
0x770c: return -386; + case 0x308a: return 208; + case 0x4e3b: return -402; + case 0x308c: return 169; + case 0x3092: return -446; + case 0x3093: return -137; + case 0x30fb: return -135; + default: return 0; + } + } + + /** unigram character cost for n-2 */ + public static final int uw2(int i0) { + switch(i0) { + case 0x63fa: return -1033; + case 0x5e02: return -813; + case 0x3082: return -1263; + case 0x4f1a: return 978; + case 0x3084: return -402; + case 0x4fdd: return 362; + case 0x3088: return 1639; + case 0x6700: return -630; + case 0x308a: return -579; + case 0x521d: return -3025; + case 0x308b: return -694; + case 0x308c: return 571; + case 0x6587: return -1355; + case 0x7b2c: return 810; + case 0x5165: return 548; + case 0x3092: return -2516; + case 0x3093: return 2095; + case 0x81ea: return -1353; + case 0x30a2: return -587; + case 0x671d: return -1843; + case 0x002c: return -829; + case 0x30ab: return 306; + case 0x30ad: return 568; + case 0x4e8b: return 492; + case 0x672c: return -1650; + case 0x897f: return -744; + case 0x65b0: return -1682; + case 0xff62: return -645; + case 0xff63: return 3145; + case 0x3001: return -829; + case 0x898b: return -3874; + case 0x30c3: return 831; + case 0xff6f: return 831; + case 0x5317: return -3414; + case 0x3007: return 892; + case 0xff71: return -587; + case 0x5c0f: return -2009; + case 0x5b50: return -1519; + case 0x300c: return -645; + case 0xff76: return 306; + case 0x300d: return 3145; + case 0x76ee: return -1584; + case 0xff77: return 568; + case 0x958b: return 1758; + case 0x76f8: return -242; + case 0x9593: return -1257; + case 0x526f: return -1566; + case 0x5927: return -1769; + case 0x5b66: return 760; + case 0x5929: return -865; + case 0x592a: return -483; + case 0x7406: return 752; + case 0x4eba: return -123; + case 0x533a: return -422; + case 0x770c: return -1165; + case 0x65e5: return -1815; + case 0x7acb: return -763; + case 0x6b21: return -2378; + case 0x4e09: return -758; + case 0x5e74: return -1060; 
+ case 0x4e0d: return -2150; + case 0x5f37: return 1067; + case 0x6771: return -931; + case 0x8fbc: return 3041; + case 0x4e16: return -302; + case 0x3042: return -538; + case 0x884c: return 838; + case 0x3044: return 505; + case 0x3046: return 134; + case 0x653f: return 1522; + case 0x304a: return -502; + case 0x304b: return 1454; + case 0x304c: return -856; + case 0x624b: return -1519; + case 0x304f: return -412; + case 0x3053: return 1141; + case 0x4e2d: return -968; + case 0x3055: return 878; + case 0x3056: return 540; + case 0x660e: return -1462; + case 0x3057: return 1529; + case 0x767a: return 529; + case 0x5b9f: return 1023; + case 0x3059: return -675; + case 0x7c73: return 509; + case 0x305b: return 300; + case 0x305d: return -1011; + case 0x305f: return 188; + case 0x3060: return 1837; + case 0x6c11: return -180; + case 0x4e3b: return -861; + case 0x3064: return -949; + case 0x3066: return -291; + case 0x679c: return -665; + case 0x3067: return -268; + case 0x6c17: return -1740; + case 0x3068: return -981; + case 0x3069: return 1273; + case 0x306a: return 1063; + case 0x8b70: return 1198; + case 0x306b: return -1764; + case 0x306e: return 130; + case 0x306f: return -409; + case 0x3072: return -1273; + case 0x8abf: return 1010; + case 0x3079: return 1261; + case 0x307e: return 600; + default: return 0; + } + } + + /** unigram character cost for n-1 */ + public static final int uw3(int i0) { + switch(i0) { + case 0x0031: return -800; + case 0x4f4e: return 811; + case 0x524d: return 2286; + case 0x95a2: return -1282; + case 0x4f55: return 4265; + case 0x4f5c: return -361; + case 0x674e: return 3094; + case 0x6751: return 364; + case 0x8cbb: return 1777; + case 0x53e3: return 483; + case 0x8fbc: return -1504; + case 0x7acb: return -960; + case 0x3001: return 4889; + case 0x5b66: return -1356; + case 0x7dcf: return 1163; + case 0x3005: return -2311; + case 0x526f: return 4437; + case 0x3007: return 5827; + case 0x65e5: return 2099; + case 0x65e7: return 5792; 
+ case 0x53f3: return 1233; + case 0x002c: return 4889; + case 0x300d: return 2670; + case 0x7dda: return 1255; + case 0x5e73: return -1804; + case 0x5e74: return 2416; + case 0x3013: return -3573; + case 0x4e00: return -1619; + case 0x68ee: return 2438; + case 0x77e5: return -1528; + case 0x6771: return -805; + case 0x56fd: return 642; + case 0x5404: return 3588; + case 0x4e0b: return -1759; + case 0x5408: return -241; + case 0x6d77: return -495; + case 0x5e83: return -1030; + case 0x975e: return 2066; + case 0x540c: return 3906; + case 0x5b89: return -423; + case 0x7c73: return 7767; + case 0x6307: return -3973; + case 0x4e16: return -2087; + case 0x529b: return 365; + case 0x7684: return 7313; + case 0x80fd: return 725; + case 0x4e21: return 3815; + case 0x6c0f: return 2613; + case 0x6c11: return -1694; + case 0x5e9c: return 1605; + case 0x5b9f: return -1008; + case 0x601d: return -1291; + case 0x4e2d: return 653; + case 0x3042: return -2696; + case 0x3044: return 1006; + case 0x5ea6: return 1452; + case 0x3046: return 2342; + case 0x6027: return 1822; + case 0x3048: return 1983; + case 0x304a: return -4864; + case 0x304b: return -1163; + case 0x6628: return -661; + case 0x304c: return 3271; + case 0x751f: return -273; + case 0x4e3b: return -758; + case 0x304f: return 1004; + case 0x3051: return 388; + case 0x3052: return 401; + case 0x5bb6: return 1078; + case 0x3053: return -3552; + case 0x3054: return -3116; + case 0x3055: return -1058; + case 0x7528: return 914; + case 0x3057: return -395; + case 0x5143: return 4858; + case 0x3059: return 584; + case 0x901a: return -1136; + case 0x305b: return 3685; + case 0x305d: return -5228; + case 0x7b2c: return 1201; + case 0x305f: return 842; + case 0x3061: return -521; + case 0x3063: return -1444; + case 0x3064: return -1081; + case 0x3066: return 6167; + case 0x6642: return -1248; + case 0x3067: return 2318; + case 0x3068: return 1691; + case 0x753a: return 1215; + case 0x3069: return -899; + case 0x306a: return 
-2788; + case 0x306b: return 2745; + case 0x52d5: return -949; + case 0x306e: return 4056; + case 0x306f: return 4555; + case 0x52d9: return -1872; + case 0x515a: return 3593; + case 0x3072: return -2171; + case 0x4fdd: return -2439; + case 0x79c1: return 4231; + case 0x3075: return -1798; + case 0x3078: return 1199; + case 0x307b: return -5516; + case 0x307e: return -4384; + case 0x5168: return 1574; + case 0x307f: return -120; + case 0x3081: return 1205; + case 0x516c: return -3030; + case 0x3082: return 2323; + case 0x516d: return 755; + case 0x3084: return -788; + case 0x5171: return -1880; + case 0x3088: return -202; + case 0x3089: return 727; + case 0x8eca: return 1835; + case 0x308a: return 649; + case 0x308b: return 5905; + case 0x308c: return 2773; + case 0x8ecd: return 1375; + case 0x308f: return -1207; + case 0x3092: return 6620; + case 0x91d1: return 2163; + case 0x3093: return -518; + case 0x696d: return 484; + case 0x7269: return 461; + case 0x5efa: return -2352; + case 0xff11: return -800; + case 0x5186: return 5807; + case 0x4e88: return -1193; + case 0x4e8c: return 974; + case 0x30a2: return 551; + case 0x6c7a: return -1073; + case 0x518d: return 3095; + case 0x76f4: return -1835; + case 0x548c: return -837; + case 0x578b: return 1389; + case 0x7279: return -3850; + case 0x82f1: return 785; + case 0x5c0f: return -513; + case 0x5316: return 1327; + case 0x5c11: return -3102; + case 0x5317: return -1038; + case 0x7cfb: return 3066; + case 0x30b0: return 1319; + case 0x7701: return 792; + case 0x5916: return -241; + case 0x7d04: return 3663; + case 0x9078: return -681; + case 0x30b9: return 874; + case 0x8005: return 6457; + case 0x770c: return 6293; + case 0x7a0e: return 401; + case 0x30c3: return -1350; + case 0x30c8: return 521; + case 0x7121: return 979; + case 0x7d1a: return 1384; + case 0x4eba: return 2742; + case 0x533a: return 4646; + case 0x6238: return -488; + case 0x5343: return -2309; + case 0x6838: return 5156; + case 0x4eca: return 792; 
+ case 0x5348: return -783; + case 0x30e0: return 1109; + case 0x653f: return -2013; + case 0x4ed6: return 1889; + case 0x5354: return -1006; + case 0x30eb: return 1591; + case 0x30ed: return 2201; + case 0xff63: return 2670; + case 0xff65: return -3794; + case 0x5f53: return -3885; + case 0x30f3: return 278; + case 0x54e1: return 4513; + case 0x4ee5: return -1368; + case 0xff6f: return -1350; + case 0x30fb: return -3794; + case 0x8abf: return -562; + case 0xff71: return 551; + case 0x6559: return -1479; + case 0x5dde: return 1155; + case 0x6cd5: return 1868; + case 0x66dc: return -951; + case 0xff7d: return 874; + case 0x2212: return -1723; + case 0x99c5: return 1620; + case 0x90ce: return 1026; + case 0xff84: return 521; + case 0x6570: return 3222; + case 0xff91: return 1109; + case 0x5206: return 457; + case 0x5e02: return 3197; + case 0x81ea: return -2869; + case 0x90e1: return 4404; + case 0xff99: return 1591; + case 0x6700: return -937; + case 0x7d71: return -4229; + case 0xff9b: return 2201; + case 0xff9d: return 278; + case 0x90e8: return 1200; + case 0x6587: return -1489; + case 0x6708: return 4125; + case 0x96e8: return 2009; + case 0x521d: return 2475; + case 0x5f97: return 1905; + case 0x9577: return 421; + case 0x5225: return 1129; + case 0x96fb: return -1045; + case 0x671f: return 360; + case 0x898b: return 1044; + case 0x5834: return 1219; + case 0x958b: return -1432; + case 0x65b0: return 1764; + case 0x59bb: return 2016; + case 0x9593: return 1302; + case 0x8ca1: return -733; + default: return 0; + } + } + + /** unigram character cost for n */ + public static final int uw4(int i0) { + switch(i0) { + case 0x822c: return -852; + case 0x524d: return 1623; + case 0x4f53: return -1286; + case 0x5b50: return -4802; + case 0x4f5c: return 530; + case 0x56de: return 1500; + case 0x8fbc: return -3370; + case 0x7acb: return -2112; + case 0x3001: return 3930; + case 0x3002: return 3508; + case 0x5b66: return -1397; + case 0x7dcf: return 940; + case 0x526f: 
return 3879; + case 0x3007: return 4999; + case 0x884c: return -792; + case 0x65e5: return 1798; + case 0x6765: return -442; + case 0x300c: return 1895; + case 0x002c: return 3930; + case 0x300d: return 3798; + case 0x002e: return 3508; + case 0x7dda: return -994; + case 0x8fd1: return 929; + case 0x5e74: return 374; + case 0x3013: return -5156; + case 0x5cf6: return -2056; + case 0x4e00: return -2069; + case 0x56fd: return -619; + case 0x8cde: return 730; + case 0x5e81: return -4556; + case 0x5408: return -1834; + case 0x8b66: return -1184; + case 0x7c73: return 2937; + case 0x7f72: return 749; + case 0x5712: return -1200; + case 0x8b70: return -244; + case 0x529b: return -302; + case 0x7684: return 2586; + case 0x80fd: return -730; + case 0x7387: return 672; + case 0x5b9a: return -1057; + case 0x6c0f: return 5388; + case 0x6c11: return -2716; + case 0x6c17: return -910; + case 0x4e2d: return 2210; + case 0x3042: return 4752; + case 0x3044: return -3435; + case 0x3046: return -640; + case 0x6027: return 553; + case 0x3048: return -2514; + case 0x5730: return 866; + case 0x304a: return 2405; + case 0x304b: return 530; + case 0x304c: return 6006; + case 0x304d: return -4482; + case 0x751f: return -1286; + case 0x304e: return -3821; + case 0x304f: return -3788; + case 0x3051: return -4376; + case 0x7523: return -1101; + case 0x3052: return -4734; + case 0x3053: return 2255; + case 0x3054: return 1979; + case 0x3055: return 2864; + case 0x3057: return -843; + case 0x3058: return -2506; + case 0x3059: return -731; + case 0x305a: return 1251; + case 0x305b: return 181; + case 0x305d: return 4091; + case 0x5148: return 601; + case 0x7530: return -2900; + case 0x7b2c: return 788; + case 0x305f: return 5034; + case 0x3060: return 5408; + case 0x3061: return -3654; + case 0x3063: return -5882; + case 0x3064: return -1659; + case 0x3066: return 3994; + case 0x6642: return 1829; + case 0x3067: return 7410; + case 0x3068: return 4547; + case 0x753a: return 1826; + case 0x306a: 
return 5433; + case 0x306b: return 6499; + case 0x306c: return 1853; + case 0x52d5: return -740; + case 0x306d: return 1413; + case 0x306e: return 7396; + case 0x9928: return -1984; + case 0x306f: return 8578; + case 0x3070: return 1940; + case 0x52d9: return -2715; + case 0x515a: return -2006; + case 0x3072: return 4249; + case 0x3073: return -4134; + case 0x3075: return 1345; + case 0x3078: return 6665; + case 0x3079: return -744; + case 0x307b: return 1464; + case 0x307e: return 1051; + case 0x307f: return -2082; + case 0x3080: return -882; + case 0x3081: return -5046; + case 0x3082: return 4169; + case 0x3083: return -2666; + case 0x3084: return 2795; + case 0x58eb: return -1413; + case 0x5171: return -1212; + case 0x3087: return -1544; + case 0x3088: return 3351; + case 0x3089: return -2922; + case 0x8eca: return -1481; + case 0x308a: return -9726; + case 0x2015: return -4841; + case 0x308b: return -14896; + case 0x308c: return -2613; + case 0x8ecd: return 1158; + case 0x308d: return -4570; + case 0x308f: return -1783; + case 0x91ce: return -1100; + case 0x3092: return 13150; + case 0x3093: return -2352; + case 0x696d: return -1043; + case 0x9053: return -1291; + case 0x7269: return -735; + case 0x5bfa: return -809; + case 0x5185: return 584; + case 0x5186: return 788; + case 0x4e88: return 782; + case 0x76ee: return 922; + case 0x4e8b: return -190; + case 0x9ad8: return 2120; + case 0x548c: return -681; + case 0x9662: return -2297; + case 0x4e95: return -1768; + case 0x30ab: return 2145; + case 0x5c0f: return 1910; + case 0x5316: return 776; + case 0x7cfb: return 786; + case 0x7403: return -1267; + case 0x7701: return -3485; + case 0x6e08: return -543; + case 0x30b3: return 1789; + case 0x591a: return 1067; + case 0x7d04: return 2171; + case 0x9078: return 2596; + case 0x8005: return 2145; + case 0x30bb: return 1287; + case 0x770c: return 2997; + case 0x5927: return 571; + case 0x30c3: return -724; + case 0x6821: return -360; + case 0x30c8: return -403; + 
case 0x6ca2: return -939; + case 0x4eba: return 1036; + case 0x533a: return 4517; + case 0x652f: return 856; + case 0x6539: return 787; + case 0x9996: return 1749; + case 0x9818: return -1659; + case 0x969b: return -2604; + case 0x6240: return -1566; + case 0x30e1: return -1635; + case 0x653f: return 2182; + case 0x5c4b: return -1328; + case 0x30e9: return -881; + case 0x8f2a: return -1433; + case 0x30ea: return -541; + case 0x5354: return 1013; + case 0x30eb: return -856; + case 0xff62: return 1895; + case 0xff63: return 3798; + case 0xff65: return -4371; + case 0x30f3: return -3637; + case 0x8c37: return -1000; + case 0x54e1: return -910; + case 0x4ee5: return 544; + case 0xff6f: return -724; + case 0xff70: return -11870; + case 0x5ddd: return -2667; + case 0x30fb: return -4371; + case 0x6559: return 704; + case 0x30fc: return -11870; + case 0x7d4c: return 1146; + case 0xff76: return 2145; + case 0x5668: return -851; + case 0xff7a: return 1789; + case 0xff7e: return 1287; + case 0x5074: return 4292; + case 0x5c71: return -1500; + case 0x90ce: return -4866; + case 0xff84: return -403; + case 0x984c: return -792; + case 0xff92: return -1635; + case 0x5e02: return 2771; + case 0xff97: return -881; + case 0xff98: return -541; + case 0xff99: return -856; + case 0x6700: return 845; + case 0x7d71: return -1169; + case 0xff9d: return -3637; + case 0x6587: return 522; + case 0x5f8c: return 456; + case 0x7a7a: return -867; + case 0x6708: return -9066; + case 0x4f1a: return 950; + case 0x521d: return 1347; + case 0x9577: return 357; + case 0x90fd: return 1192; + case 0x611f: return 916; + case 0x96fb: return -878; + case 0x9280: return -2213; + case 0x898f: return 792; + case 0x6728: return -485; + case 0x5834: return -1410; + case 0x9593: return -2344; + case 0x53c2: return 1555; + case 0x5841: return -2094; + case 0x65b9: return -856; + default: return 0; + } + } + + /** unigram character cost for n+1 */ + public static final int uw5(int i0) { + switch(i0) { + case 
0x307f: return 502; + case 0x5e02: return -2991; + case 0x0031: return -514; + case 0x3081: return 865; + case 0x3083: return 3350; + case 0x4f1a: return -1153; + case 0x515a: return -654; + case 0x3087: return 854; + case 0x52d9: return 3519; + case 0x308a: return -208; + case 0x308b: return 429; + case 0x308c: return 504; + case 0x5d50: return -1304; + case 0x7530: return 240; + case 0x308f: return 419; + case 0x90ce: return -368; + case 0x6708: return -4353; + case 0x3092: return -1264; + case 0x3093: return 327; + case 0x753a: return -3912; + case 0x984c: return 2368; + case 0x7d71: return 1955; + case 0x7a7a: return -813; + case 0x30a4: return 241; + case 0x5e2d: return 921; + case 0x002c: return 465; + case 0x002e: return -299; + case 0x9928: return -689; + case 0x65b0: return -1682; + case 0xff62: return 363; + case 0x9577: return 786; + case 0x3001: return 465; + case 0x3002: return -299; + case 0x67fb: return 932; + case 0xff72: return 241; + case 0x300c: return 363; + case 0x4eac: return 722; + case 0x76f8: return 1319; + case 0xe005: return -32768; + case 0x9593: return 1191; + case 0x005d: return -2762; + case 0x5927: return -1296; + case 0x5b66: return -548; + case 0x7701: return -1052; + case 0x793e: return -278; + case 0x533a: return -901; + case 0x770c: return -4003; + case 0x30eb: return 451; + case 0x65e5: return 218; + case 0x6a5f: return -1508; + case 0xff99: return 451; + case 0x8005: return -2233; + case 0x5e74: return 1763; + case 0xff9d: return -343; + case 0x30f3: return -343; + case 0x9078: return -1018; + case 0x3042: return 1655; + case 0x6240: return -814; + case 0x3044: return 331; + case 0x3046: return -503; + case 0x683c: return 1356; + case 0x3048: return 1199; + case 0x304a: return 527; + case 0x304b: return 647; + case 0x304c: return -421; + case 0x304d: return 1624; + case 0x304e: return 1971; + case 0x304f: return 312; + case 0x54e1: return 2104; + case 0x3052: return -983; + case 0x5b9a: return 1785; + case 0x4e2d: return -871; 
+ case 0x3055: return -1537; + case 0x3057: return -1371; + case 0x8a9e: return -1073; + case 0x3059: return -852; + case 0x6319: return 1618; + case 0x601d: return 872; + case 0x8868: return 663; + case 0x6c0f: return -1347; + case 0x3060: return -1186; + case 0x3061: return 1093; + case 0x7684: return -3149; + case 0x3063: return 52; + case 0x3064: return 921; + case 0x3066: return -18; + case 0xff11: return -514; + case 0x3067: return -850; + case 0x3068: return -127; + case 0x3069: return 1682; + case 0x306a: return -787; + case 0x8b70: return 1219; + case 0x306b: return -1224; + case 0x306e: return -635; + case 0x306f: return -578; + case 0x7814: return -997; + case 0x3079: return 1001; + case 0x544a: return 848; + default: return 0; + } + } + + /** unigram character cost for n+2: a fixed table lookup keyed by the character value {@code i0}; characters without a table entry cost 0 */ + public static final int uw6(int i0) { + switch(i0) { + case 0x0031: return -270; + case 0xe004: return 306; + case 0x3042: return -307; + case 0x7a7a: return -822; + case 0x59d4: return 798; + case 0x3046: return 189; + case 0x696d: return -697; + case 0x304b: return 241; + case 0x304c: return -73; + case 0x4f1a: return 624; + case 0x304f: return -121; + case 0x4e00: return -277; + case 0x90ce: return 1082; + case 0x3053: return -200; + case 0x3058: return 1782; + case 0x533a: return 1792; + case 0x3059: return 383; + case 0x5b66: return -960; + case 0x5e02: return 887; + case 0xff11: return -270; + case 0x305f: return -428; + case 0x3001: return 227; + case 0x3002: return 808; + case 0x3063: return 573; + case 0x9023: return 463; + case 0x3066: return -1014; + case 0x3067: return 101; + case 0x3068: return -105; + case 0x002c: return 227; + case 0x306a: return -253; + case 0x306b: return -149; + case 0x5f8c: return 535; + case 0x002e: return 808; + case 0x306e: return -417; + case 0x306f: return -236; + case 0x798f: return 974; + case 0x76f8: return 753; + case 0x4e2d: return 201; + case 0x5e83: return -695; + case 0x3082: return -206; + case 0x793e: return -507; + case 0x54e1: 
return -1212; + case 0xff99: return -673; + case 0x524d: return 302; + case 0x4ef6: return -800; + case 0x308a: return 187; + case 0x308b: return -135; + case 0xff9d: return -496; + case 0x30eb: return -673; + case 0x3092: return 195; + case 0x30f3: return -496; + case 0x8005: return 1811; + default: return 0; + } + } +} Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\ja\TinySegmenterConstants.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ja/package.html =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/ja/package.html (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/ja/package.html (revision 0) @@ -0,0 +1,22 @@ + + + + +Analysis components for Japanese. + + Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\ja\package.html ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilter.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilter.java (revision 0) @@ -0,0 +1,112 @@ +package org.apache.lucene.analysis.cjk; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.StemmerUtil; + +/** + * A {@link TokenFilter} that normalizes CJK width differences: + * + *

+ * NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD
+ * Unicode normalization. See the normalization support in the ICU package
+ * for full normalization.
+ */
+public final class CJKWidthFilter extends TokenFilter {
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  /* halfwidth kana mappings: 0xFF65-0xFF9F
+   *
+   * note: 0xFF9E and 0xFF9F are only mapped to 0x3099 and 0x309A
+   * as a fallback when they cannot properly combine with a preceding
+   * character into a composed form.
+   */
+  private static final char KANA_NORM[] = new char[] {
+    0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
+    0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
+    0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
+    0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
+    0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
+    0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
+    0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A
+  };
+
+  /** Creates a width-normalizing filter over the given stream. */
+  public CJKWidthFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Advances the wrapped stream and rewrites the term buffer in place:
+   * fullwidth ASCII variants (0xFF01-0xFF5E) are shifted down by 0xFEE0 and
+   * halfwidth katakana (0xFF65-0xFF9F) are replaced through {@link #KANA_NORM}.
+   * A halfwidth voice mark (0xFF9E/0xFF9F) that can be merged into the
+   * preceding character is removed from the term, shortening it by one.
+   *
+   * @return false once the wrapped stream has no more tokens
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      char text[] = termAtt.buffer();
+      int length = termAtt.length();
+      for (int i = 0; i < length; i++) {
+        final char ch = text[i];
+        if (ch >= 0xFF01 && ch <= 0xFF5E) {
+          // Fullwidth ASCII variants
+          text[i] -= 0xFEE0;
+        } else if (ch >= 0xFF65 && ch <= 0xFF9F) {
+          // Halfwidth Katakana variants
+          if ((ch == 0xFF9E || ch == 0xFF9F) && i > 0 && combine(text, i, length, ch)) {
+            // the mark was folded into text[i-1]: drop it and step back so the
+            // loop increment lands on the character that slid into slot i
+            length = StemmerUtil.delete(text, i--, length);
+          } else {
+            text[i] = KANA_NORM[ch - 0xFF65];
+          }
+        }
+      }
+      termAtt.setLength(length);
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /* kana combining diffs: 0x30A6-0x30FD
+   *
+   * entry [c - 0x30A6] is the amount added to katakana c to obtain its
+   * voiced form; 0 means the voiced mark cannot combine with c.
+   */
+  private static final byte KANA_COMBINE_VOICED[] = new byte[] {
+    78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+    0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
+    0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+  };
+
+  /* same layout as KANA_COMBINE_VOICED, for the half-voiced mark (0xFF9F) */
+  private static final byte KANA_COMBINE_HALF_VOICED[] = new byte[] {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
+    0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+  };
+
+  /**
+   * Tries to merge the voice mark {@code ch} at {@code pos} into the
+   * character before it, rewriting {@code text[pos-1]} on success.
+   * The caller must guarantee {@code pos > 0}.
+   *
+   * @return true if we successfully combined the voice mark
+   */
+  private static boolean combine(char text[], int pos, int length, char ch) {
+    final char prev = text[pos-1];
+    if (prev < 0x30A6 || prev > 0x30FD) {
+      return false; // outside the range covered by the combining tables
+    }
+    final byte diff = (ch == 0xFF9F)
+      ? KANA_COMBINE_HALF_VOICED[prev - 0x30A6]
+      : KANA_COMBINE_VOICED[prev - 0x30A6];
+    text[pos-1] += diff;
+    return diff != 0;
+  }
+}

Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\cjk\CJKWidthFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java (revision 0)
@@ -0,0 +1,284 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import java.text.BreakIterator; +import java.text.CharacterIterator; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * Breaks text into sentences with a {@link BreakIterator} and + * allows subclasses to decompose these sentences into words. + *

+ * This can be used by subclasses that need sentence context + * for tokenization purposes, such as CJK segmenters. + *

+ * Additionally it can be used by subclasses that want to mark
+ * sentence boundaries (with a custom attribute, extra token, position
+ * increment, etc) for downstream processing.
+ *
+ * @lucene.experimental
+ */
+public abstract class SegmentingTokenizerBase extends Tokenizer {
+  protected static final int BUFFERMAX = 4096;
+  protected final char buffer[] = new char[BUFFERMAX];
+  /** true length of text in the buffer */
+  private int length = 0;
+  /** length in buffer that can be evaluated safely, up to a safe end point */
+  private int usableLength = 0;
+  /** accumulated offset of previous buffers for this reader, for offsetAtt */
+  protected int offset = 0;
+
+  private final BreakIterator iterator;
+  private final CharArrayIterator wrapper = new CharArrayIterator();
+
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+  /**
+   * Construct a new SegmentingTokenizerBase from the given Reader, using
+   * the provided BreakIterator for sentence segmentation.
+   * <p>
+   * Note that you should never share BreakIterators across different
+   * TokenStreams, instead a newly created or cloned one should always
+   * be provided to this constructor.
+   */
+  public SegmentingTokenizerBase(Reader input, BreakIterator iterator) {
+    super(input);
+    this.iterator = iterator;
+  }
+
+  /**
+   * Produces the next word: first from the current sentence, then from the
+   * following sentences of the buffer, refilling the buffer from the reader
+   * whenever it runs out of sentences.
+   *
+   * @return false once the reader is exhausted and no word is left
+   */
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (length == 0 || !incrementWord()) {
+      while (!incrementSentence()) {
+        refill();
+        if (length <= 0) // no more chars to read
+          return false;
+      }
+    }
+
+    return true;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    wrapper.setText(buffer, 0, 0);
+    iterator.setText(wrapper);
+    length = usableLength = offset = 0;
+  }
+
+  @Override
+  public void reset(Reader input) throws IOException {
+    this.input = input;
+    reset();
+  }
+
+  @Override
+  public final void end() throws IOException {
+    final int finalOffset = correctOffset(length < 0 ? offset : offset + length);
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
+
+  /**
+   * Returns the last unambiguous break position in the text
+   * (the index just past the last safe-end character), or -1 if
+   * the buffer contains no such character.
+   */
+  private int findSafeEnd() {
+    for (int i = length - 1; i >= 0; i--)
+      if (isSafeEnd(buffer[i]))
+        return i + 1;
+    return -1;
+  }
+
+  /** For sentence tokenization, these are the unambiguous break positions. */
+  protected boolean isSafeEnd(char ch) {
+    switch(ch) {
+      case 0x000D:
+      case 0x000A:
+      case 0x0085:
+      case 0x2028:
+      case 0x2029:
+        return true;
+      default:
+        return false;
+    }
+  }
+
+  /**
+   * Refill the buffer, accumulating the offset and setting usableLength to the
+   * last unambiguous break position
+   */
+  private void refill() throws IOException {
+    offset += usableLength;
+    int leftover = length - usableLength;
+    // move the unprocessed tail to the front before reading behind it
+    System.arraycopy(buffer, usableLength, buffer, 0, leftover);
+    int requested = buffer.length - leftover;
+    // A single Reader.read call may legally return fewer chars than requested
+    // while more input is still pending, so keep reading until the buffer is
+    // full or the reader is really exhausted. Otherwise a short read would be
+    // mistaken for end of input and the text cut at an arbitrary position.
+    int returned = readFully(input, buffer, leftover, requested);
+    length = returned + leftover;
+    if (returned < requested) /* reader has been emptied, process the rest */
+      usableLength = length;
+    else { /* still more data to be read, find a safe-stopping place */
+      usableLength = findSafeEnd();
+      if (usableLength < 0)
+        usableLength = length; /*
+                                * more than BUFFERMAX of text without breaks,
+                                * gonna possibly truncate tokens
+                                */
+    }
+
+    wrapper.setText(buffer, 0, Math.max(0, usableLength));
+    iterator.setText(wrapper);
+  }
+
+  /**
+   * Reads from {@code input} into {@code buffer} starting at {@code offset}
+   * until {@code length} chars have been stored or the reader reports end of
+   * stream.
+   *
+   * @return the number of chars actually stored; never negative, and less
+   *         than {@code length} only if the end of the stream was reached
+   */
+  private static int readFully(Reader input, char[] buffer, int offset, int length) throws IOException {
+    int total = 0;
+    while (total < length) {
+      final int count = input.read(buffer, offset + total, length - total);
+      if (count < 0) // end of stream
+        break;
+      total += count;
+    }
+    return total;
+  }
+
+  /**
+   * return true if there is a token from the buffer, or false if it is
+   * exhausted.
+   */
+  private boolean incrementSentence() throws IOException {
+    if (length == 0) // we must refill the buffer
+      return false;
+
+    while (true) {
+      int start = iterator.current();
+
+      if (start == BreakIterator.DONE)
+        return false; // BreakIterator exhausted
+
+      // find the next set of boundaries
+      int end = iterator.next();
+
+      if (end == BreakIterator.DONE)
+        return false; // BreakIterator exhausted
+
+      setNextSentence(start, end);
+      if (incrementWord()) {
+        return true;
+      }
+    }
+  }
+
+  /** Provides the next input sentence for analysis */
+  protected abstract void setNextSentence(int sentenceStart, int sentenceEnd);
+  /** Returns true if another word is available */
+  protected abstract boolean incrementWord();
+
+  /** A CharacterIterator used internally for sentence breaks */
+  static class CharArrayIterator implements CharacterIterator {
+    private char array[];
+    private int start;
+    private int index;
+    private int length;
+    private int limit;
+
+    public char [] getText() {
+      return array;
+    }
+
+    public int getStart() {
+      return start;
+    }
+
+    public int getLength() {
+      return length;
+    }
+
+    /**
+     * Set a new region of text to be examined by this iterator
+     *
+     * @param array text buffer to examine
+     * @param start offset into buffer
+     * @param length maximum length to examine
+     */
+    void setText(final char array[], int start, int length) {
+      this.array = array;
+      this.start = start;
+      this.index = start;
+      this.length = length;
+      this.limit = start + length;
+    }
+
+    public char current() {
+      return (index == limit) ? DONE : jreBugWorkaround(array[index]);
+    }
+
+    // on modern jres, supplementary codepoints with [:Sentence_Break=Format:]
+    // trigger a bug in RulebasedBreakIterator! work around this for now
+    // by lying about all surrogates to the sentence tokenizer, instead
+    // we treat them all as SContinue so we won't break around them.
+    final char jreBugWorkaround(char ch) {
+      return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch;
+    }
+
+    public char first() {
+      index = start;
+      return current();
+    }
+
+    // indexes exposed through the CharacterIterator API are relative to
+    // 'start': they run from 0 to 'length'
+    public int getBeginIndex() {
+      return 0;
+    }
+
+    public int getEndIndex() {
+      return length;
+    }
+
+    public int getIndex() {
+      return index - start;
+    }
+
+    public char last() {
+      index = (limit == start) ? limit : limit - 1;
+      return current();
+    }
+
+    public char next() {
+      if (++index >= limit) {
+        index = limit;
+        return DONE;
+      } else {
+        return current();
+      }
+    }
+
+    public char previous() {
+      if (--index < start) {
+        index = start;
+        return DONE;
+      } else {
+        return current();
+      }
+    }
+
+    public char setIndex(int position) {
+      if (position < getBeginIndex() || position > getEndIndex())
+        throw new IllegalArgumentException("Illegal Position: " + position);
+      index = start + position;
+      return current();
+    }
+
+    @Override
+    public Object clone() {
+      CharArrayIterator clone = new CharArrayIterator();
+      clone.setText(array, start, length);
+      clone.index = index;
+      return clone;
+    }
+  }
+}

Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\util\SegmentingTokenizerBase.java
___________________________________________________________________
Added: svn:eol-style
   + native