Index: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java =================================================================== --- lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java (revision 0) +++ lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java (working copy) @@ -0,0 +1,224 @@ +package org.apache.lucene.analysis.icu; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.charfilter.BaseCharFilter; + +import com.ibm.icu.text.Normalizer2; + +/** + * Normalize token text with ICU's {@link Normalizer2}. + */ +public final class ICUNormalizer2CharFilter extends BaseCharFilter { + + private static final int IO_BUFFER_SIZE = 128; + + private final Normalizer2 normalizer; + private final StringBuilder inputBuffer; + private final StringBuilder resultBuffer; + + private boolean inputFinished; + private boolean afterQuickCheckYes; + private int checkedInputBoundary; + private int charCount; + + /** + * Create a new Normalizer2CharFilter that combines NFKC normalization, Case + * Folding, and removes Default Ignorables (NFKC_Casefold) + */ + public ICUNormalizer2CharFilter(Reader in) { + this(in, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)); + } + + /** + * Create a new Normalizer2CharFilter with the specified Normalizer2 + * @param in text + * @param normalizer normalizer to use + */ + public ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer) { + super(in); + if (normalizer == null) throw new NullPointerException("normalizer == null"); + + this.normalizer = normalizer; + inputBuffer = new StringBuilder(); + resultBuffer = new StringBuilder(); + resetFields(); + } + + @Override + public void reset() throws IOException { + super.reset(); + resetFields(); + } + + private void resetFields() { + inputBuffer.delete(0, inputBuffer.length()); + checkedInputBoundary = 0; + resultBuffer.delete(0, resultBuffer.length()); + inputFinished = false; + afterQuickCheckYes = false; + charCount = 0; + } + + @Override + public void close() throws IOException { + resetFields(); + super.close(); + } + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + if (off < 0) throw new IllegalArgumentException("off < 0"); + if (off >= cbuf.length) throw new IllegalArgumentException( + "off >= cbuf.length"); + if (len <= 0) throw new IllegalArgumentException("len <= 0"); + + while (!inputFinished || inputBuffer.length() > 0 || resultBuffer.length() > 0) { + int retLen; + + if (resultBuffer.length() > 0) { + retLen = outputFromResultBuffer(cbuf, off, len); + if (retLen > 0) { + return retLen; + } + } + + int resLen = readAndNormalizeFromInput(); + if (resLen > 0) { + retLen = outputFromResultBuffer(cbuf, off, len); + if (retLen > 0) { + return retLen; + } + } + + readInputToBuffer(); + } + + return -1; + } + + private final char[] tmpBuffer = new char[IO_BUFFER_SIZE]; + + private int readInputToBuffer() throws IOException { + final int len = input.read(tmpBuffer); + if (len == -1) { + inputFinished = true; + return 0; + } + inputBuffer.append(tmpBuffer, 0, len); + if (len >= 2 && normalizer.hasBoundaryAfter(tmpBuffer[len-1]) && !Character.isHighSurrogate(tmpBuffer[len-1])) { + return len; + } else return len + readInputToBuffer(); + } + + private int readAndNormalizeFromInput() { + if (inputBuffer.length() <= 0) { + afterQuickCheckYes = false; + return 0; + } + if (!afterQuickCheckYes) { + int resLen = readFromInputWhileSpanQuickCheckYes(); + afterQuickCheckYes = true; + if (resLen > 0) return resLen; + } + int resLen = readFromIoNormalizeUptoBoundary(resultBuffer); + if(resLen > 0){ + afterQuickCheckYes = false; + } + return resLen; + } + + private int readFromInputWhileSpanQuickCheckYes() { + int end = normalizer.spanQuickCheckYes(inputBuffer); + if (end > 0) { + resultBuffer.append(inputBuffer.subSequence(0, end)); + inputBuffer.delete(0, end); + checkedInputBoundary = Math.max(checkedInputBoundary - end, 0); + charCount += end; + } + return end; + } + + private int readFromIoNormalizeUptoBoundary(StringBuilder dest) { + if (inputBuffer.length() <= 0) { + return 0; + } + + boolean foundBoundary = false; + final int bufLen = inputBuffer.length(); + + while (checkedInputBoundary <= bufLen - 1) { + int charLen = Character.charCount(inputBuffer.codePointAt(checkedInputBoundary)); + checkedInputBoundary += charLen; + if (checkedInputBoundary < bufLen && normalizer.hasBoundaryBefore(inputBuffer + .codePointAt(checkedInputBoundary))) { + foundBoundary = true; + break; + } + } + if (!foundBoundary && inputFinished && checkedInputBoundary >= inputBuffer.length()) { + foundBoundary = true; + } + + if (!foundBoundary) { + return 0; + } + + return normalizeInputUpto(checkedInputBoundary); + } + + private int normalizeInputUpto(final int length) { + final int destOrigLen = resultBuffer.length(); + normalizer.normalizeSecondAndAppend(resultBuffer, + inputBuffer.subSequence(0, length)); + inputBuffer.delete(0, length); + checkedInputBoundary = Math.max(checkedInputBoundary - length, 0); + final int resultLength = resultBuffer.length() - destOrigLen; + recordOffsetDiff(length, resultLength); + return resultLength; + } + + private void recordOffsetDiff(int inputLength, int outputLength) { + if (inputLength == outputLength) { + charCount += outputLength; + return; + } + final int diff = inputLength - outputLength; + final int cumuDiff = getLastCumulativeDiff(); + if (diff < 0) { + for (int i = 1; i <= -diff; ++i) { + addOffCorrectMap(charCount + i, cumuDiff - i); + } + } else { + addOffCorrectMap(charCount + Math.min(1, outputLength), cumuDiff + diff); + } + charCount += outputLength; + } + + private int outputFromResultBuffer(char[] cbuf, int begin, int len) { + len = Math.min(resultBuffer.length(), len); + resultBuffer.getChars(0, len, cbuf, begin); + if (len > 0) { + resultBuffer.delete(0, len); + } + return len; + } +} \ No newline at end of file Index: lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java =================================================================== --- lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java (revision 0) +++ lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java (working copy) @@ -0,0 +1,211 @@ +package org.apache.lucene.analysis.icu; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.CharFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ngram.NGramTokenizer; +import org.apache.lucene.util.Version; +import org.apache.lucene.util._TestUtil; +import org.junit.Test; + +import com.ibm.icu.text.Normalizer2; + +public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase { + + @Test + public void testNormalization() throws IOException { + String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि"; + Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE); + String expectedOutput = normalizer.normalize(input); + + CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer); + char[] tempBuff = new char[10]; + StringBuilder output = new StringBuilder(); + while (true) { + int length = reader.read(tempBuff); + if (length == -1) { + break; + } + output.append(tempBuff, 0, length); + assertEquals(output.toString(), normalizer.normalize(input.substring(0, reader.correctOffset(output.length())))); + } + + assertEquals(expectedOutput, output.toString()); + } + + @Test + public void testTokenStream() throws IOException { + // '℃', '№', '㈱', '㌘', 'サ'+'<<', 'ソ'+'<<', '㌰'+'<<' + String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙"; + + CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), + Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE)); + + TokenStream tokenStream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + + assertTokenStreamContents(tokenStream, + new String[] {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"}, + new int[] {0, 2, 4, 6, 8, 11, 14}, + new int[] {1, 3, 5, 7, 10, 13, 16}, + input.length()); + } + + @Test + public void testTokenStream2() throws IOException { + // '㌰', '<<'゙, '5', '℃', '№', '㈱', '㌘', 'サ', '<<', 'ソ', '<<' + String input = "㌰゙5℃№㈱㌘ザゾ"; + + CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), + Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)); + + TokenStream tokenStream = new NGramTokenizer(Version.LUCENE_50, reader, 1,1); + + assertTokenStreamContents(tokenStream, + new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"}, + new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9}, + new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11}, + input.length() + ); + } + + public void doTestMode(final Normalizer2 normalizer, int maxLength, int iterations) throws IOException { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + return new TokenStreamComponents(new MockTokenizer(reader, MockTokenizer.KEYWORD, false)); + } + + @Override + protected Reader initReader(String fieldName, Reader reader) { + return new ICUNormalizer2CharFilter(reader, normalizer); + } + }; + + for (int i = 0; i < iterations; i++) { + String input = _TestUtil.randomUnicodeString(random(), maxLength); + if (input.length() == 0) { + continue; + } + String normalized = normalizer.normalize(input); + if (normalized.length() == 0) { + continue; // MockTokenizer doesnt tokenize empty string... + } + checkOneTerm(a, input, normalized); + } + } + + public void testNFC() throws Exception { + doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE), 20, 10000); + } + + public void testNFCHuge() throws Exception { + doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE), 8192, 1000); + } + + public void testNFD() throws Exception { + doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE), 20, 10000); + } + + public void testNFDHuge() throws Exception { + doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE), 8192, 1000); + } + + public void testNFKC() throws Exception { + doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE), 20, 10000); + } + + public void testNFKCHuge() throws Exception { + doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE), 8192, 1000); + } + + public void testNFKD() throws Exception { + doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE), 20, 10000); + } + + public void testNFKDHuge() throws Exception { + doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE), 8192, 1000); + } + + public void testNFKC_CF() throws Exception { + doTestMode(Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE), 20, 10000); + } + + public void testNFKC_CFHuge() throws Exception { + doTestMode(Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE), 8192, 1000); + } + + public void testRandomStrings() throws IOException { + // nfkc_cf + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + return new TokenStreamComponents(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + } + + @Override + protected Reader initReader(String fieldName, Reader reader) { + return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)); + } + }; + checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER); + // huge strings + checkRandomData(random(), a, 128*RANDOM_MULTIPLIER, 8192); + + // nfkd + a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + return new TokenStreamComponents(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + } + + @Override + protected Reader initReader(String fieldName, Reader reader) { + return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE)); + } + }; + checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER); + // huge strings + checkRandomData(random(), a, 128*RANDOM_MULTIPLIER, 8192); + } + + public void testCuriousString() throws Exception { + String text = "\udb40\udc3d\uf273\ue960\u06c8\ud955\udc13\ub7fc\u0692 \u2089\u207b\u2073\u2075"; + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + return new TokenStreamComponents(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + } + + @Override + protected Reader initReader(String fieldName, Reader reader) { + return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)); + } + }; + for (int i = 0; i < 10000; i++) { + checkAnalysisConsistency(random(), a, false, text); + } + } +}