Index: lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
===================================================================
--- lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java	(revision 0)
+++ lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java	(revision 0)
@@ -0,0 +1,121 @@
+package org.apache.lucene.analysis.icu;
+
+
+import java.io.*;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.util.Version;
+import org.junit.Test;
+
+import com.ibm.icu.text.Normalizer2;
+
+/**
+ * Tests {@link ICUNormalizer2CharFilter}: normalized output, corrected token offsets and
+ * consistency on random input.
+ */
+public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
+
+  /**
+   * Reads the filter in 10-char chunks; after every chunk the output so far must equal the
+   * reference normalization of the input prefix that {@code correctOffset} maps it back to.
+   */
+  @Test
+  public void testNormalization() throws IOException {
+    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
+    Normalizer2 referenceNormalizer = Normalizer2.getInstance(null, "nfkc_cf",
+        Normalizer2.Mode.COMPOSE);
+    String expectedOutput = referenceNormalizer.normalize(input);
+
+    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
+        ICUNormalizer2CharFilter.Form.NFKC_CF);
+    char[] tempBuff = new char[10];
+    StringBuilder output = new StringBuilder();
+    while (true) {
+      int length = reader.read(tempBuff);
+      if (length == -1) {
+        break;
+      }
+      output.append(tempBuff, 0, length);
+      // Expected value goes first so that a failure message labels both sides correctly.
+      assertEquals(
+          referenceNormalizer.normalize(input.substring(0,
+              reader.correctOffset(output.length()))),
+          output.toString());
+    }
+
+    assertEquals(expectedOutput, output.toString());
+  }
+
+  /** A zero-length read returns 0 and must not consume or lose any input. */
+  @Test
+  public void testZeroLengthRead() throws IOException {
+    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader("abc"),
+        ICUNormalizer2CharFilter.Form.NFKC_CF);
+    char[] buffer = new char[4];
+    assertEquals(0, reader.read(buffer, 0, 0));
+
+    StringBuilder output = new StringBuilder();
+    for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+      output.append(buffer, 0, n);
+    }
+    assertEquals("abc", output.toString());
+  }
+
+  /** Whitespace-separated tokens: checks NFKC output and the corrected start/end offsets. */
+  @Test
+  public void testTokenStream() throws IOException {
+    // '℃', '№', '㈱', '㌘', 'サ'+'<<', 'ソ'+'<<', '㌰'+'<<'
+    // ('<<' is the combining voiced sound mark U+3099, escaped below so it stays visible)
+    String input = "℃ № ㈱ ㌘ サ\u3099 ソ\u3099 ㌰\u3099";
+
+    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
+        ICUNormalizer2CharFilter.Form.NFKC);
+
+    TokenStream tokenStream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+
+    assertTokenStreamContents(tokenStream,
+        new String[] {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"},
+        new int[] {0, 2, 4, 6, 8, 11, 14},
+        new int[] {1, 3, 5, 7, 10, 13, 16},
+        input.length());
+  }
+
+  /** Single-character n-grams: checks NFKC_CF output and the offset of every output char. */
+  @Test
+  public void testTokenStream2() throws IOException {
+    // '㌰', '<<', '5', '℃', '№', '㈱', '㌘', 'サ', '<<', 'ソ', '<<'
+    // ('<<' is the combining voiced sound mark U+3099, escaped below so it stays visible)
+    String input = "㌰\u3099" + "5℃№㈱㌘" + "サ\u3099ソ\u3099";
+
+    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
+        ICUNormalizer2CharFilter.Form.NFKC_CF);
+
+    TokenStream tokenStream = new NGramTokenizer(Version.LUCENE_50, reader, 1,1);
+
+    assertTokenStreamContents(tokenStream,
+        new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
+        new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
+        new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
+        input.length()
+    );
+  }
+
+  /** Runs random text through the filter via the shared random-data checker. */
+  @Test
+  public void testRandomStrings() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        return new Analyzer.TokenStreamComponents(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
+      }
+
+      @Override
+      protected Reader initReader(String fieldName, Reader reader) {
+        return new ICUNormalizer2CharFilter(reader, ICUNormalizer2CharFilter.Form.NFKC_CF);
+      }
+    };
+
+    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
+  }
+
+}
Index: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
===================================================================
--- lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java	(revision 0)
+++ lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java	(revision 0)
@@ -0,0 +1,338 @@
+package org.apache.lucene.analysis.icu;
+
+import java.io.IOException;
+import java.io.Reader;
+import static java.lang.Math.*;
+
+import org.apache.lucene.analysis.charfilter.BaseCharFilter;
+import org.apache.lucene.analysis.CharFilter;
+
+import com.ibm.icu.text.Normalizer2;
+
+/**
+ * Char filter that normalizes its input with an ICU {@link Normalizer2} and records offset
+ * corrections wherever normalization changes the text length.
+ * <p>
+ * Input is read in chunks of {@code IO_BUFFER_SIZE} chars and normalized piecewise, each
+ * piece ending at a position the normalizer reports as a boundary.
+ * <p>
+ * NOTE(review): the boundary scan works on UTF-16 chars rather than code points, and a piece
+ * can end at the end of a read chunk. Confirm that surrogate pairs and combining marks that
+ * straddle such a split still normalize as they would in one pass.
+ */
+public class ICUNormalizer2CharFilter extends BaseCharFilter {
+
+  /** Predefined normalization forms, each bound to the ICU normalizer that implements it. */
+  public enum Form {
+    NFC(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE)),
+
+    NFD(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE)),
+
+    NFKC(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE)),
+
+    NFKC_CF(Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)),
+
+    NFKD(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE));
+
+    private final Normalizer2 normalizer;
+
+    private Form(Normalizer2 normalizer) {
+      this.normalizer = normalizer;
+    }
+  }
+
+  /** Number of chars requested from the wrapped reader per read. */
+  private static final int IO_BUFFER_SIZE = 128;
+
+  private final Normalizer2 normalizer;
+  /** Chars read from the wrapped reader that have not been normalized yet. */
+  private final StringBuilder inputBuffer;
+  /** Normalized chars that have not been handed to the caller yet. */
+  private final StringBuilder resultBuffer;
+  /** Scratch array for reads from the wrapped reader. */
+  private final char[] tmpBuffer = new char[IO_BUFFER_SIZE];
+
+  /** True once the wrapped reader has reported end of stream. */
+  private boolean inputFinished;
+  /** True once the quick check was tried; cleared after a normalized piece or empty input. */
+  private boolean afterQuickCheckYes;
+  /** Index in inputBuffer up to which the boundary scan has already advanced. */
+  private int checkedInputBoundary;
+  /** Number of normalized chars produced so far; the position used for offset corrections. */
+  private int charCount;
+
+  /**
+   * Creates a filter using the default form, NFKC_CF.
+   */
+  public ICUNormalizer2CharFilter(CharFilter in) {
+    this(in, Form.NFKC_CF);
+  }
+
+  /** Creates a filter that normalizes {@code in} to the given predefined form. */
+  public ICUNormalizer2CharFilter(Reader in, Form form) {
+    this(in, form.normalizer);
+  }
+
+  /**
+   * Creates a filter that normalizes {@code in} with the given normalizer.
+   *
+   * @throws NullPointerException if {@code normalizer} is null
+   */
+  public ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer) {
+    super(in);
+    if (normalizer == null) throw new NullPointerException("normalizer == null");
+
+    this.normalizer = normalizer;
+    inputBuffer = new StringBuilder();
+    resultBuffer = new StringBuilder();
+    resetFields();
+  }
+
+  /** Same as the {@code Reader} variant, for an input that is already a char filter. */
+  public ICUNormalizer2CharFilter(CharFilter in, Form form) {
+    this(in, form.normalizer);
+  }
+
+  /**
+   * Same as the {@code Reader} variant, for an input that is already a char filter.
+   *
+   * @throws NullPointerException if {@code normalizer} is null
+   */
+  public ICUNormalizer2CharFilter(CharFilter in, Normalizer2 normalizer) {
+    super(in);
+    if (normalizer == null) throw new NullPointerException("normalizer == null");
+
+    this.normalizer = normalizer;
+    inputBuffer = new StringBuilder();
+    resultBuffer = new StringBuilder();
+    resetFields();
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    resetFields();
+  }
+
+  /** Clears both buffers and all per-stream state. */
+  private void resetFields() {
+    inputBuffer.setLength(0);
+    checkedInputBoundary = 0;
+    resultBuffer.setLength(0);
+    inputFinished = false;
+    afterQuickCheckYes = false;
+    charCount = 0;
+  }
+
+  @Override
+  public void close() throws IOException {
+    resetFields();
+    super.close();
+  }
+
+  /**
+   * Reads normalized chars into {@code cbuf}.
+   *
+   * @return the number of chars stored, 0 if {@code len} is 0, or -1 once the input is
+   *         exhausted and everything buffered has been returned
+   * @throws IllegalArgumentException if {@code off} or {@code len} is negative, or if
+   *         {@code off} is not a valid index in {@code cbuf} for a non-empty read
+   */
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    if (off < 0) throw new IllegalArgumentException("off < 0");
+    if (len < 0) throw new IllegalArgumentException("len < 0");
+    // java.io.Reader contract: a zero-length request reads nothing and returns 0.
+    if (len == 0) return 0;
+    if (off >= cbuf.length) throw new IllegalArgumentException(
+        "off >= cbuf.length");
+
+    while (!inputFinished || inputBuffer.length() > 0 || resultBuffer.length() > 0) {
+      int retLen;
+
+      // Hand out already-normalized text first.
+      if (resultBuffer.length() > 0) {
+        retLen = outputFromResultBuffer(cbuf, off, len);
+        if (retLen > 0) {
+          return retLen;
+        }
+      }
+
+      // Otherwise try to normalize what is already buffered.
+      int resLen = readAndNormalizeFromInput();
+      if (resLen > 0) {
+        retLen = outputFromResultBuffer(cbuf, off, len);
+        if (retLen > 0) {
+          return retLen;
+        }
+      }
+
+      // No output possible yet: pull more input and try again.
+      readInputToBuffer();
+    }
+
+    return -1;
+  }
+
+  /**
+   * Appends the next chunk from the wrapped reader to the input buffer.
+   *
+   * @return the number of chars appended; 0 at end of stream, which also sets inputFinished
+   */
+  private int readInputToBuffer() throws IOException {
+    final int len = input.read(tmpBuffer);
+    if (len == -1) {
+      inputFinished = true;
+      return 0;
+    }
+    inputBuffer.append(tmpBuffer, 0, len);
+    return len;
+  }
+
+  /**
+   * Moves text from the input buffer to the result buffer. The prefix that passes the quick
+   * check is copied as-is; if that yields nothing, the next piece up to a boundary is normalized.
+   *
+   * @return the number of chars appended to the result buffer
+   */
+  private int readAndNormalizeFromInput() {
+    if (inputBuffer.length() <= 0) {
+      afterQuickCheckYes = false;
+      return 0;
+    }
+    if (!afterQuickCheckYes) {
+      int resLen = readFromInputWhileSpanQuickCheckYes();
+      afterQuickCheckYes = true;
+      if (resLen > 0) return resLen;
+    }
+    int resLen = readFromIoNormalizeUptoBoundary();
+    if (resLen > 0) {
+      afterQuickCheckYes = false;
+    }
+    return resLen;
+  }
+
+  /**
+   * Copies the leading span for which {@code spanQuickCheckYes} answers yes straight to the
+   * result buffer; the text length is unchanged, so no offset correction is recorded.
+   *
+   * @return the number of chars copied
+   */
+  private int readFromInputWhileSpanQuickCheckYes() {
+    int end = normalizer.spanQuickCheckYes(inputBuffer);
+    if (end > 0) {
+      resultBuffer.append(inputBuffer.subSequence(0, end));
+      inputBuffer.delete(0, end);
+      checkedInputBoundary = max(checkedInputBoundary - end, 0);
+      charCount += end;
+    }
+    return end;
+  }
+
+  /**
+   * Advances the boundary scan through the input buffer and, once a boundary is found,
+   * normalizes everything before it into the result buffer.
+   *
+   * @return the number of chars appended; 0 if no boundary was found or nothing was produced
+   */
+  private int readFromIoNormalizeUptoBoundary() {
+    if (inputBuffer.length() <= 0) {
+      return 0;
+    }
+
+    boolean foundBoundary = false;
+    final int bufLen = inputBuffer.length();
+
+    while (checkedInputBoundary < bufLen - 1) {
+      ++checkedInputBoundary;
+      if (normalizer.hasBoundaryBefore(inputBuffer
+          .charAt(checkedInputBoundary))) {
+        foundBoundary = true;
+        break;
+      } else if (normalizer.hasBoundaryAfter(inputBuffer
+          .charAt(checkedInputBoundary - 1))) {
+        foundBoundary = true;
+        break;
+      }
+    }
+    // The last buffered char can only close a piece if a boundary follows it or no more input
+    // will arrive.
+    if (checkedInputBoundary == bufLen - 1) {
+      if (normalizer.hasBoundaryAfter(inputBuffer.charAt(checkedInputBoundary))
+          || inputFinished) {
+        foundBoundary = true;
+        ++checkedInputBoundary;
+      }
+    }
+    if (!foundBoundary) {
+      return 0;
+    }
+
+    return normalizeInputUpto(checkedInputBoundary);
+  }
+
+  /**
+   * Normalizes the first {@code length} chars of the input buffer into the result buffer,
+   * removes them from the input buffer and records the resulting offset change.
+   *
+   * @return the number of chars the result buffer grew by
+   */
+  private int normalizeInputUpto(final int length) {
+    final int destOrigLen = resultBuffer.length();
+    normalizer.normalizeSecondAndAppend(resultBuffer,
+        inputBuffer.subSequence(0, length));
+    inputBuffer.delete(0, length);
+    checkedInputBoundary = max(checkedInputBoundary - length, 0);
+    final int resultLength = resultBuffer.length() - destOrigLen;
+    recordOffsetDiff(length, resultLength);
+    return resultLength;
+  }
+
+  /**
+   * Records an offset correction when a normalized piece differs in length from its input,
+   * then advances the output position by {@code outputLength}.
+   */
+  private void recordOffsetDiff(int inputLength, int outputLength) {
+    if (inputLength == outputLength) {
+      charCount += outputLength;
+      return;
+    }
+    final int diff = inputLength - outputLength;
+    final int cumuDiff = getLastCumulativeDiff();
+    if (diff < 0) {
+      // Output is longer: step the correction down once per extra output char.
+      for (int i = 1; i <= -diff; ++i) {
+        addOffCorrectMap(charCount + i, cumuDiff - i);
+      }
+    } else {
+      // Output is shorter: a single correction accounts for all dropped input chars.
+      addOffCorrectMap(charCount + min(1, outputLength), cumuDiff + diff);
+    }
+    charCount += outputLength;
+  }
+
+  /**
+   * Moves up to {@code len} chars from the front of the result buffer into {@code cbuf}.
+   *
+   * @return the number of chars moved
+   */
+  private int outputFromResultBuffer(char[] cbuf, int begin, int len) {
+    len = min(resultBuffer.length(), len);
+    resultBuffer.getChars(0, len, cbuf, begin);
+    if (len > 0) {
+      resultBuffer.delete(0, len);
+    }
+    return len;
+  }
+
+  @Override
+  public boolean markSupported() {
+    return false;
+  }
+
+  @Override
+  public void mark(int readAheadLimit) throws IOException {
+    throw new IOException("mark() not supported");
+  }
+
+}