Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java (revision 0) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java (working copy) @@ -0,0 +1,516 @@ +package org.apache.lucene.analysis.ja; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.math.BigDecimal; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * Normalizes Japanese numbers + */ +public class JapaneseNumberFilter extends TokenFilter { + + private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAttr = addAttribute(OffsetAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + private static char NO_NUMERAL = Character.MAX_VALUE; + + private static char[] numerals; + + private static char[] exponents; + + private StringBuilder numeral; + + private State state; + + private int startOffset; + + private int endOffset; + + static { + numerals = new char[0x10000]; + for (int i = 0; i < numerals.length; i++) { + numerals[i] = NO_NUMERAL; + } + numerals['〇'] = 0; // 〇 U+3007 0 + numerals['一'] = 1; // 一 U+4E00 1 + numerals['二'] = 2; // 二 U+4E8C 2 + numerals['三'] = 3; // 三 U+4E09 3 + numerals['四'] = 4; // 四 U+56DB 4 + numerals['五'] = 5; // 五 U+4E94 5 + numerals['六'] = 6; // 六 U+516D 6 + numerals['七'] = 7; // 七 U+4E03 7 + numerals['八'] = 8; // 八 U+516B 8 + numerals['九'] = 9; // 九 U+4E5D 9 + + exponents = new char[0x10000]; + for (int i = 0; i < exponents.length; i++) { + exponents[i] = 0; + } + exponents['十'] = 1; // 十 U+5341 10 + exponents['百'] = 2; // 百 U+767E 100 + exponents['千'] = 3; // 千 U+5343 1,000 + exponents['万'] = 4; // 万 U+4E07 10,000 + exponents['億'] = 8; // 億 U+5104 100,000,000 + exponents['兆'] = 12; // 兆 U+5146 1,000,000,000,000 + exponents['京'] = 16; // 京 U+4EAC 10,000,000,000,000,000 + exponents['垓'] = 20; // 垓 U+5793 100,000,000,000,000,000,000 + } + + public JapaneseNumberFilter(TokenStream input) { + super(input); + } + + @Override + public final boolean incrementToken() throws IOException { + + // Emit previously captured token we read past earlier + if (state != null) { + restoreState(state); + state = null; + return true; + } + + if (!input.incrementToken()) { + return false; + } + + if (keywordAttr.isKeyword()) { + return true; + } + + boolean moreTokens = true; + boolean composedToken = false; + String term = termAttr.toString(); + + while (moreTokens && isNumeral(term)) { + if (!composedToken) { + startOffset = offsetAttr.startOffset(); + } + + composedToken = true; + endOffset = offsetAttr.endOffset(); + numeral.append(term); + + moreTokens = input.incrementToken(); + + if (moreTokens) { + term = termAttr.toString(); + } + } + + if (composedToken) { + + if (moreTokens) { + // We have read past all numerals and there are still tokens left, so + // capture the state of this token and emit it on our next incrementToken() + state = captureState(); + } + + String normalizedNumber = normalizeNumber(numeral.toString()); + + termAttr.setEmpty(); + termAttr.append(normalizedNumber); + offsetAttr.setOffset(startOffset, endOffset); + + numeral = new StringBuilder(); + return true; + } + + return moreTokens; + } + + @Override + public void reset() throws IOException { + super.reset(); + startOffset = 0; + endOffset = 0; + state = null; + numeral = new StringBuilder(); + } + + /** + * Normalizes a Japanese number + * + * @param number number or normalize + * @return normalized number, or number to normalize on error (no op) + */ + public String normalizeNumber(String number) { + try { + BigDecimal normalizedNumber = parseNumber(new NumberBuffer(number)); + if (normalizedNumber == null) { + return number; + } + return normalizedNumber.toBigIntegerExact().toString(); + + } catch (NumberFormatException | ArithmeticException e) { + // Return the source number in case of error, i.e. malformed input + return number; + } + } + + /** + * Parses a Japanese number + * + * @param buffer buffer to parse + * @return parsed number, or null on error or end of input + */ + private BigDecimal parseNumber(NumberBuffer buffer) { + BigDecimal sum = BigDecimal.ZERO; + BigDecimal result = parseLargePair(buffer); + + if (result == null) { + return null; + } + + while (result != null) { + sum = sum.add(result); + result = parseLargePair(buffer); + } + + return sum; + } + + /** + * Parses a pair of large numbers, i.e. large kanji factor is 10,000(万)or larger + * + * @param buffer buffer to parse + * @return parsed pair, or null on error or end of input + */ + private BigDecimal parseLargePair(NumberBuffer buffer) { + BigDecimal first = parseMediumNumber(buffer); + BigDecimal second = parseLargeKanjiNumeral(buffer); + + if (first == null && second == null) { + return null; + } + + if (second == null) { + // If there's no second factor, we return the first one + // This can happen if we our number is smaller than 10,000 (万) + return first; + } + + if (first == null) { + // If there's no first factor, just return the second one, + // which is the same as multiplying by 1, i.e. with 万 + return second; + } + + return first.multiply(second); + } + + /** + * Parses a "medium sized" number, typically less than 10,000(万), but might be larger + * due to a larger factor from {link parseBasicNumber}. + * + * @param buffer buffer to parse + * @return parsed number, or null on error or end of input + */ + private BigDecimal parseMediumNumber(NumberBuffer buffer) { + BigDecimal sum = BigDecimal.ZERO; + BigDecimal result = parseMediumPair(buffer); + + if (result == null) { + return null; + } + + while (result != null) { + sum = sum.add(result); + result = parseMediumPair(buffer); + } + + return sum; + } + + /** + * Parses a pair of "medium sized" numbers, i.e. large kanji factor is at most 1,000(千) + * + * @param buffer buffer to parse + * @return parsed pair, or null on error or end of input + */ + private BigDecimal parseMediumPair(NumberBuffer buffer) { + + BigDecimal first = parseBasicNumber(buffer); + BigDecimal second = parseMediumKanjiNumeral(buffer); + + if (first == null && second == null) { + return null; + } + + if (second == null) { + // If there's no second factor, we return the first one + // This can happen if we just have a plain number such as 五 + return first; + } + + if (first == null) { + // If there's no first factor, just return the second one, + // which is the same as multiplying by 1, i.e. with 千 + return second; + } + + // Return factors multiplied + return first.multiply(second); + } + + /** + * Parse a basic number, which is a sequence of Arabic numbers or a sequence or 0-9 kanji numerals (〇 to 九). + * + * @param buffer buffer to parse + * @return parsed number, or null on error or end of input + */ + private BigDecimal parseBasicNumber(NumberBuffer buffer) { + StringBuilder builder = new StringBuilder(); + int i = buffer.position(); + + while (i < buffer.length()) { + char c = buffer.charAt(i); + + if (isArabicNumeral(c)) { + // Arabic numerals; 0 to 9 or 0 to 9 (full-width) + builder.append(arabicNumeralValue(c)); + } else if (isKanjiNumeral(c)) { + // Kanji numerals; 〇, 一, 二, 三, 四, 五, 六, 七, 八, or 九 + builder.append(kanjiNumeralValue(c)); + } else if (isDecimalPoint(c)) { + builder.append("."); + } else if (isThousandSeparator(c)) { + // Just skip and move to the next character + } else { + // We don't have an Arabic nor kanji numeral, nor separation or punctuation, so we'll stop. + break; + } + + i++; + buffer.advance(); + } + + if (builder.length() == 0) { + // We didn't build anything, so we don't have a number + return null; + } + + return new BigDecimal(builder.toString()); + } + + /** + * Parse large kanji numerals (ten thousands or larger) + * + * @param buffer buffer to parse + * @return parsed number, or null on error or end of input + */ + public BigDecimal parseLargeKanjiNumeral(NumberBuffer buffer) { + int i = buffer.position(); + + if (i >= buffer.length()) { + return null; + } + + char c = buffer.charAt(i); + int power = exponents[c]; + + if (power > 3) { + buffer.advance(); + return BigDecimal.TEN.pow(power); + } + + return null; + } + + /** + * Parse medium kanji numerals (tens, hundreds or thousands) + * + * @param buffer buffer to parse + * @return parsed number or null on error + */ + public BigDecimal parseMediumKanjiNumeral(NumberBuffer buffer) { + int i = buffer.position(); + + if (i >= buffer.length()) { + return null; + } + + char c = buffer.charAt(i); + int power = exponents[c]; + + if (1 <= power && power <= 3) { + buffer.advance(); + return BigDecimal.TEN.pow(power); + } + + return null; + } + + /** + * Numeral predicate + * + * @param input string to test + * @return true if and only if input is a numeral + */ + public boolean isNumeral(String input) { + for (int i = 0; i < input.length(); i++) { + if (!isNumeral(input.charAt(i))) { + return false; + } + } + return true; + } + + /** + * Numeral predicate + * + * @param c character to test + * @return true if and only if c is a numeral + */ + public boolean isNumeral(char c) { + return isArabicNumeral(c) || isKanjiNumeral(c) || exponents[c] > 0 || isDecimalPoint(c) || isThousandSeparator(c); + } + + /** + * Arabic numeral predicate. Both half-width and full-width characters are supported + * + * @param c character to test + * @return true if and only if c is an Arabic numeral + */ + public boolean isArabicNumeral(char c) { + return isHalfWidthArabicNumeral(c) || isFullWidthArabicNumeral(c); + } + + /** + * Arabic half-width numeral predicate + * + * @param c character to test + * @return true if and only if c is a half-width Arabic numeral + */ + private boolean isHalfWidthArabicNumeral(char c) { + // 0 U+0030 - 9 U+0039 + return '0' <= c && c <= '9'; + } + + /** + * Arabic full-width numeral predicate + * + * @param c character to test + * @return true if and only if c is a full-width Arabic numeral + */ + private boolean isFullWidthArabicNumeral(char c) { + // 0 U+FF10 - 9 U+FF19 + return '0' <= c && c <= '9'; + } + + /** + * Returns the numeric value for the specified character Arabic numeral. + * Behavior is undefined if a non-Arabic numeral is provided + * + * @param c arabic numeral character + * @return numeral value + */ + private int arabicNumeralValue(char c) { + int offset; + if (isHalfWidthArabicNumeral(c)) { + offset = '0'; + } else { + offset = '0'; + } + return c - offset; + } + + /** + * Kanji numeral predicate that tests if the provided character is one of 〇, 一, 二, 三, 四, 五, 六, 七, 八, or 九. + * Larger number kanji gives a false value. + * + * @param c character to test + * @return true if and only is character is one of 〇, 一, 二, 三, 四, 五, 六, 七, 八, or 九 (0 to 9) + */ + private boolean isKanjiNumeral(char c) { + return numerals[c] != NO_NUMERAL; + } + + /** + * Returns the value for the provided kanji numeral. Only numeric values for the characters where + * {link isKanjiNumeral} return true are supported - behavior is undefined for other characters. + * + * @param c kanji numeral character + * @return numeral value + * @see #isKanjiNumeral(char) + */ + private int kanjiNumeralValue(char c) { + return numerals[c]; + } + + /** + * Decimal point predicate + * + * @param c character to test + * @return true if and only if c is a decimal point + */ + private boolean isDecimalPoint(char c) { + return c == '.' // U+002E FULL STOP + || c == '.'; // U+FF0E FULLWIDTH FULL STOP + } + + /** + * Thousand separator predicate + * + * @param c character to test + * @return true if and only if c is a thousand separator predicate + */ + private boolean isThousandSeparator(char c) { + return c == ',' // U+002C COMMA + || c == ','; // U+FF0C FULLWIDTH COMMA + } + + /** + * Buffer that holds a Japanese number string and a position index used as a parsed-to marker + */ + public static class NumberBuffer { + + private int position; + + private String string; + + public NumberBuffer(String string) { + this.string = string; + this.position = 0; + } + + public char charAt(int index) { + return string.charAt(index); + } + + public int length() { + return string.length(); + } + + public void advance() { + position++; + } + + public int position() { + return position; + } + } +} \ No newline at end of file Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilter.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilter.java (revision 0) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilter.java (working copy) @@ -0,0 +1,187 @@ +package org.apache.lucene.analysis.ja; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Random; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.junit.Ignore; +import org.junit.Test; + +public class TestJapaneseNumberFilter extends BaseTokenStreamTestCase { + + private Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.DEFAULT_MODE); + return new TokenStreamComponents(tokenizer, new JapaneseNumberFilter(tokenizer)); + } + }; + + @Test + public void testBasics() throws IOException { + + assertAnalyzesTo(analyzer, "本日十万二千五百円のワインを買った", + new String[]{"本日", "102500", "円", "の", "ワイン", "を", "買っ", "た"}, + new int[]{0, 2, 8, 9, 10, 13, 14, 16}, + new int[]{2, 8, 9, 10, 13, 14, 16, 17} + ); + + assertAnalyzesTo(analyzer, "昨日のお寿司は10万円でした。", + new String[]{"昨日", "の", "お", "寿司", "は", "100000", "円", "でし", "た", "。"}, + new int[]{0, 2, 3, 4, 6, 7, 10, 11, 13, 14}, + new int[]{2, 3, 4, 6, 7, 10, 11, 13, 14, 15} + ); + + assertAnalyzesTo(analyzer, "アティリカの資本金は600万円です", + new String[]{"アティリカ", "の", "資本", "金", "は", "6000000", "円", "です"}, + new int[]{0, 5, 6, 8, 9, 10, 14, 15}, + new int[]{5, 6, 8, 9, 10, 14, 15, 17} + ); + } + + @Test + public void testVariants() throws IOException { + // Test variants of three + assertAnalyzesTo(analyzer, "3", new String[]{"3"}); + assertAnalyzesTo(analyzer, "3", new String[]{"3"}); + assertAnalyzesTo(analyzer, "三", new String[]{"3"}); + + // Test three variations with trailing zero + assertAnalyzesTo(analyzer, "03", new String[]{"3"}); + assertAnalyzesTo(analyzer, "03", new String[]{"3"}); + assertAnalyzesTo(analyzer, "〇三", new String[]{"3"}); + assertAnalyzesTo(analyzer, "003", new String[]{"3"}); + assertAnalyzesTo(analyzer, "003", new String[]{"3"}); + assertAnalyzesTo(analyzer, "〇〇三", new String[]{"3"}); + + // Test thousand variants + assertAnalyzesTo(analyzer, "千", new String[]{"1000"}); + assertAnalyzesTo(analyzer, "1千", new String[]{"1000"}); + assertAnalyzesTo(analyzer, "1千", new String[]{"1000"}); + assertAnalyzesTo(analyzer, "一千", new String[]{"1000"}); + assertAnalyzesTo(analyzer, "一〇〇〇", new String[]{"1000"}); + assertAnalyzesTo(analyzer, "10百", new String[]{"1000"}); // Strange, but supported + } + + @Test + public void testLargeVariants() throws IOException { + // Test large numbers + assertAnalyzesTo(analyzer, "三五七八九", new String[]{"35789"}); + assertAnalyzesTo(analyzer, "六百二万五千一", new String[]{"6025001"}); + assertAnalyzesTo(analyzer, "兆六百万五千一", new String[]{"1000006005001"}); + assertAnalyzesTo(analyzer, "十兆六百万五千一", new String[]{"10000006005001"}); + assertAnalyzesTo(analyzer, "一京一", new String[]{"10000000000000001"}); + assertAnalyzesTo(analyzer, "十京十", new String[]{"100000000000000010"}); + assertAnalyzesTo(analyzer, "垓京兆億万千百十一", new String[]{"100010001000100011111"}); + } + + @Test + public void testMixed() throws IOException { + // Test mixed numbers + assertAnalyzesTo(analyzer, "三千2百2十三", new String[]{"3223"}); + assertAnalyzesTo(analyzer, "32二三", new String[]{"3223"}); + } + + @Test + public void testFunny() throws IOException { + // Test some oddities for inconsistent input + assertAnalyzesTo(analyzer, "十十", new String[]{"20"}); // 100? + assertAnalyzesTo(analyzer, "百百百", new String[]{"300"}); // 10,000? + assertAnalyzesTo(analyzer, "千千千千", new String[]{"4000"}); // 1,000,000,000,000? + } + + @Test + public void testKanjiArabic() throws IOException { + // Test kanji numerals used as Arabic numbers (with head zero) + assertAnalyzesTo(analyzer, "〇一二三四五六七八九九八七六五四三二一〇", + new String[]{"1234567899876543210"} + ); + + // I'm Bond, James "normalized" Bond... + assertAnalyzesTo(analyzer, "〇〇七", new String[]{"7"}); + } + + @Test + public void testDecimalPunctuation() throws IOException { + // Test Arabic numbers with punctuation, i.e. 3.2 thousands + assertAnalyzesTo(analyzer, "3.2千円", + new String[]{"3200", "円"} + ); + } + + @Test + public void testThousandSeparator() throws IOException { + assertAnalyzesTo(analyzer, "4,647", + new String[]{"4647"} + ); + } + + @Test + public void testEmpty() throws IOException { + assertAnalyzesTo(analyzer, "", new String[]{}); + } + + @Test + public void testRandomStrings() throws Exception { + checkRandomData(random(), analyzer, 1000 * RANDOM_MULTIPLIER); + } + + // nocommit + @Ignore("Fails using: ant test -Dtestcase=TestJapaneseNumberFilter -Dtests.method=testRandomHugeStrings -Dtests.seed=C8544B8473D5EE13") + @Test + public void testRandomHugeStrings() throws Exception { + checkRandomData(random(), analyzer, 50 * RANDOM_MULTIPLIER, 8192); + } + + // nocommit + @Ignore("Temporarily ignore - attempt to reproduce the above error") + @Test + public void testLargeRandomIssue() throws Exception { + String input = "\u3002\u3023\u3000\u302e\u3006\u3005 \u300b\u3019\u3007\u302e\u3029\u3039\u3023\u3033\u302b\u300c \u04b5 hkrld tzqfwfy clkaa wyvzrc mBTbdQEoE \u20ed\u20d2\u20f8\u20f7\u20f5\u20d8\u20ee w\u947b^\uc98b\uda5f\udc60*t\uda0e\udc0dK\u0000\u0e5a \u0738\u0721\u0716\u071a\u071f\u0738\u074d\u072c \ua920\ua907\ua91d\ua924\ua90c \ud834\udcab\ud834\udcd9 \u886a\u06c5 \u4dd6\u4dde\u4dd8\u4dd1 \u2d50\u2d77\u2d6e\u2d75\u2d54\u2d5e\u2d39\u2d3d bbytzpc lra bdjwzqju \ud9f2\udd86\uf622\ued2aA\ue628\u03c1es\u0008 \ud83c\udd25\ud83c\udd4b\ud83c\udd9f\ud83c\udd8a ojbkxvjlh kojdbdrmvsky \u31f6\u31f9\u31f9\u31f2\u31fd\u31f2\u31f1\u31f1\u31fa\u31fb\u31f7 \u1cfb\u1ce6\u1ce7\u1cfe\u1cd3 fafq nguigfj i \u2525\u2531\u2544\u254e\u251d\u252c\u2571\u2565\u2523\u257f zn vjsowkzwb dhctlky ; behphqbbuczz \ua5f2\ua616\ua60b\ua63a \u9ba5\u01ab\uda71\udf70^\u5d8f\u03fa\u04e9\uf79c\ud999\udecc\u0b83\uefb1 hnvhpus \u02f8P\uef44\u0e2c?\u032f\u9cb2\u0404 tqvsfkajlvsd zmOxDd \u0cd4\u0ccb\u0cc3\u0cb8\u0cc8\u0caa\u0ccf\u0cf6 Es\u060f\uf85d\u0d4d\udb64\uddba\u0123 \u27fa\u27fb\u27fd\u27f5\u27f2\u27f8\u27f7\u27f1\u27fd yjrpntl \ud8d6\udc13&\u2628*\uf534\u0787 \u01140\u017a\ue114aX\u3af5\n ikxh ?>< / ]?du(.p)r(r[rw \u180b\u183c\u1874\u1890\u18a9 wxeihok 1\ud959\udf77\u0548\u00a1k\u03a9\uf8fc\uda59\udfa0\u0234\uc9e4 ypkmdqa \ud800\udf91\ud800\udf91 uwbcpqde \udaba\udc53\ufd53\u0353z\ud877\udf4c\u0776\ufcba\ufc6a\u0667\uf303 \u31f4\u31fc\u31fe\u31f3\u31fe\u31fe\u31f9\u31f3 \u13ef\u13eb\u13ee\u13af\u13a0\u13a4 \u00da\u00e6\u009a\u0086\u00e9\u00c8\u00c5\u00bc\u008b\u00dd\u00e2\u0094\u00ff zhnygq dkmfcqkutoge \uff57\uff58 mppkjcghtb lnvozvjuon `\u03c9$w\ueb09\uf169V&\ue8d4\uea2a \u0652\uf942\uf5a2u\ud810\udfea\u0187 'Q\u0014%\u001c/\u0008G< \ua639\ua540\ua62f\ua570\ua5f5\ua5d3\ua536 \u0556\uf2ee\u00af2\u0002\u07eb\ud9e3\udf7f07 mgxkk lqqyinqyif zd{0,5}| jegtiicqpy djicnknwsk ll gtVmbQtE \u00ce\u000c\ufe46\uf95d bviqpzpcrd rwfmpdb \ud15c\uda49\ude98\uf43f\u010a\u0571\uf8a9\u1445v cypre btnmzdzspbm \ud800\udcf7\ud800\udce9\ud800\udcf0\ud800\udc9b \ud843\udd3a\ud84a\udf38\ud846\ude8e\ud846\udece \ud802\udc50\ud802\udc47\ud802\udc55\ud802\udc59\ud802\udc4b \ud9d2\udc5b\u02f9d\u02b5\uad7b\u05f2\u07cb\uf79f8e \uf487\u68bfk\uf3d0\uf117\uc5f5\u000e\u491f\u06ef\u0163 {0,5}]iw] \u8189_\udb2d\uddb4\ufb7d\ud955\uddeck\ue1b0\u9199 \u0009\u0181\u01f8\uf0a6\u4cc0oj \u0756\ud87a\udd19C\ud60d\udaa6\udd62 izt h\ufe62\uda2f\udf2d\u04f4\ue8a8' zntuczvetvd \u291a\u297f \u0eaa\u0ea4\u0ee5\u0e97\u0eb1\u0eff\u0ed1 \uf0ff\ufb65\ueed6\u0378\u5c14\u9137\u322c\u0115\ubfd7\ub191\udbfc\udcfd \u7361q\uf57e\u898a\uf1f5\ucd7dp\ua861 \u49f9\u1251\u03ff\ubb0b\u0003O\ubdc5 ogwxrn ' >& gsotft eegebmcgxbz \u07f6b\u0342 v\u06fb\u9601\uf653\u31f4 \uf4bc\uf252\uf334J\u0322\ud9f2\ude37

 wujgj \\\"

&#< bzycgltoyy \u2482\u249b\u248f\u2485\u249f\u2494\u24e0\u24de fubjxymhq \u31a9\u31ba\u31ac\u31af\u31b7\u31aa \ud801\udc9a\ud801\udc90\ud801\udca2\ud801\udc93\ud801\udc97\ud801\udc90 -xy rgq < esocbo \u245b\u2440\u245f"; + BaseTokenStreamTestCase.checkAnalysisConsistency(new Random(), analyzer, true, input, true); + } + + private void printTokenDetails(Analyzer analyzer, String text) throws IOException { + TokenStream stream = analyzer.tokenStream("dummy", text); + stream.reset(); + + CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); + OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class); + TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class); + + System.out.println("text: " + text); + + while (stream.incrementToken()) { + System.out.println( + "term: " + termAttr + + "\ttype: " + typeAttr.type() + + "\tstart offset: " + offsetAttr.startOffset() + + "\tend offset: " + offsetAttr.endOffset() + ); + } + stream.close(); + } +}