Index: contrib/icu/build.xml
===================================================================
--- contrib/icu/build.xml (revision 0)
+++ contrib/icu/build.xml (revision 0)
@@ -0,0 +1,51 @@
+
+
Index: contrib/icu/src/java/org/apache/lucene/icu/ICUCaseFoldingFilter.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/ICUCaseFoldingFilter.java (revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/ICUCaseFoldingFilter.java (revision 0)
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.ArrayUtil;
+
+import com.ibm.icu.impl.UCaseProps;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.Normalizer;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * A {@link TokenFilter} that case-folds text for Unicode default caseless matching.
+ * <p>
+ * By default, the following processing is performed:
+ * <p>
+ * Default caseless matching, or case-folding, is more than just conversion to lowercase.
+ * For example, it handles cases such as the Greek sigma, so that "Μάϊος" and "ΜΆΪΟΣ" will match correctly.
+ * <p>
+ * Case-folding is still only an approximation of the language-specific rules governing case.
+ * If the specific language is known, consider using {@link ICUCollationKeyFilter} and indexing collation keys instead.
+ * This filter performs the "full" case-folding specified in the Unicode standard, and this may change the length of the term.
+ * For example, the German ß is case-folded to the string 'ss'.
+ * <p>
+ * This filter respects both canonical and compatibility equivalence.
+ * The default caseless matching in the Unicode standard respects canonical equivalence.
+ * Additional logic has been added to this filter to respect compatibility equivalence, avoiding an extra normalization.
+ * This logic also ensures the output is closed under all compatibility forms.
+ * You do not need to normalize before folding, or fold and normalize twice.
+ * Closure means all of the below conditions are true:
+ * <p>
+ * This filter does not preserve normalization forms.
+ * Instead, suggested usage is to first case-fold, then normalize.
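+ * For example, a chain following that suggestion might look as below (a sketch; it uses the other filters in this patch, and NFKC is only an illustrative choice of mode):
+ * <pre>
+ *   TokenStream ts = new ICUCaseFoldingFilter(input, Normalizer.NFKC); // fold first...
+ *   ts = new ICUNormalizationFilter(ts, Normalizer.NFKC);              // ...then normalize
+ * </pre>
+ * <p>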
+ * For more details, see Unicode Standard sections 5.18: Caseless Matching + * and 3.13: Default Caseless Matching + */ + +public class ICUCaseFoldingFilter extends TokenFilter { + // this is true for NFKC or NFKD. When set, the FC_NFKC_Closure mappings from DerivedNormalizationProps.txt will be applied. + private final boolean nfkcClosure; + + // case-folding output buffer, will be resized if necessary. + private char buffer[] = new char[4096]; + + //new api term attribute, will be updated with folded text. + private TermAttribute termAtt; + + /** + * Create a new ICUCaseFoldingFilter, operating on the provided input stream. + * Output will be case-folded and closed under the supplied normalization mode. + * @param input {@link TokenStream} to filter + * @param mode Normalization mode hint, will ensure output is closed under that mode. + */ + public ICUCaseFoldingFilter(TokenStream input, Normalizer.Mode mode) { + super(input); + nfkcClosure = (mode == Normalizer.NFKC || mode == Normalizer.NFKD); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + } + + /* + * Algorithm: + * Up-front, the set of BMP codepoints that are case-insensitive are calculated. + * For these, and for basic latin, perform simple lower-case, as its equivalent to case-folding. + * + * If a surrogate or case-sensitive (outside of basic latin) character is encountered, bail and proceed down the slower path. + */ + + public boolean incrementToken() throws IOException { + + if (input.incrementToken()) { + final char src[] = termAtt.termBuffer(); + final int length = termAtt.termLength(); + + for (int i = 0; i < length; i++) { + final char ch = src[i]; + + if (ch <= 0x7F /* basic latin */ || + (!UTF16.isSurrogate(ch) && insensitive.contains(ch))) { + src[i] = Character.toLowerCase(ch); // either basic latin or a BMP-case-insensitive codepoint + } else { + + /* + * a case-sensitive codepoint outside of basic latin, or a surrogate has been encountered. + * bail out completely and invoke the 'slow' case folding algorithm. + */ + + final int requiredLength = length << 2; // Max expansion factor: 3x for case folding. 4x for case folding + NFKC closure + + if (buffer.length < requiredLength) + buffer = new char[ArrayUtil.getNextSize(requiredLength)]; + + final int newLength = fold(src, length, buffer, nfkcClosure); + termAtt.setTermBuffer(buffer, 0, newLength); + return true; + } + } + + return true; + } else { + return false; + } + } + + public Token next(final Token reusableToken) throws IOException { + assert reusableToken != null; + + Token nextToken = input.next(reusableToken); + if (nextToken != null) { + final char src[] = nextToken.termBuffer(); + final int length = nextToken.termLength(); + + for (int i = 0; i < length; i++) { + final char ch = src[i]; + + if (ch <= 0x7F /* basic latin */ || + (!UTF16.isSurrogate(ch) && insensitive.contains(ch))) { + src[i] = Character.toLowerCase(ch); // either basic latin or a BMP-case-insensitive codepoint + } else { + + /* + * a case-sensitive codepoint outside of basic latin, or a surrogate has been encountered. + * bail out completely and invoke the 'slow' case folding algorithm. + */ + + final int requiredLength = length << 2; // Max expansion factor: 3x for case folding. 
4x for case folding + NFKC closure + + if (buffer.length < requiredLength) + buffer = new char[ArrayUtil.getNextSize(requiredLength)]; + + final int newLength = fold(src, length, buffer, nfkcClosure); + nextToken.setTermBuffer(buffer, 0, newLength); + return nextToken; + } + } + + return nextToken; + } else { + return null; + } + } + + /* + * What ICU should have, some way to do full case folding on char[] + * Behind the scenes this is still bad for the single-many case (look at UCaseProps.java) + * For really good performance, its not hard to create IntTrie data files from CaseFolding.txt and DerivedNormalizationProps.txt + * But this would introduce a maintenance hassle: would have to update data files whenever the Unicode standard is updated. + * Finally, there is a ticket to improve this in ICU: http://bugs.icu-project.org/trac/ticket/5072 + * + * Add a function that performs string normalization and case folding according to the Unicode Standard, chapter 5 + * "Implementation Guidelines", section "Case Mappings". The availability of such a function would help avoid errors + * by users who are unaware of the complications involved. There should be a parameter for the output normalization form. + * For "K" forms, the FC_NFKC_Closure should be applied as well. + * + * Until this functionality is implemented, this looks to be the best overall tradeoff. + * When it is, the logic below should be removed! + */ + + // Low-level unicode case properties functionality + private static final UCaseProps caseProps; + + /* + * Output buffer for when a single codepoint folds to multiple codepoints, required by UCaseProps. + * Fortunately, this is only used for the case where the length of a string is increased. + * Unfortunately, behind the scenes UCaseProps appends to this buffer with 'new String(xxx)'... + */ + private final StringBuffer foldOut = new StringBuffer(4); + + // Output buffer for when a codepoint has an FC_NFKC_Closure mapping + private final char replacement[] = new char[4]; + + // set of case-insensitive BMP characters. + private static final UnicodeSet insensitive; + + static { + + /* + * In the ICU UCharacter implementation there is some logic here involving getDummy(). + * This is supposedly for the case in which the case-properties data cannot be loaded into memory. + * + * In this case, there are bigger problems if there isn't enough memory for the JVM to even lowercase! + */ + + try { + caseProps = UCaseProps.getSingleton(); + } catch(IOException e) { + throw new RuntimeException(e); + } + + /* + * Perhaps too conservative, but still over 60,000 BMP code points. + * This is nice because it is correct even if FC_NFKC_Closure mapping is being applied (NFKD/NFKC case) + * If the codepoint has NFKC_Quick_Check=Yes, then it cannot have an FC_NFKC_Closure mapping. + * Substract from this set any of these that are case-sensitive. + */ + + insensitive = new UnicodeSet("[[:NFKC_Quick_Check=Yes:]-[:Case_Sensitive=True:]]"); + insensitive.compact(); + insensitive.freeze(); + } + + /** + * Folds characters from src to dst, optionally applying FC_NFKC_Closure mappings. + * The output buffer must have enough storage to contain the folded text. + * + * There is no such maximum limit specified in the standard, but a maximum expansion factor of 3x is specified here: + * http://unicode.org/reports/tr36/tr36-6.html#Buffer_Overflows + * + * When applying FC_NFKC_Closure mappings, this expansion factor increases to 4x. 
+ * + * @param src input buffer + * @param length input length + * @param dst output buffer + * @param closure true if FC_NFKC_Closure mappings should be applied + * @return length of folded output. + */ + private int fold(char src[], int length, char dst[], boolean closure) { + int codepoint; + int folded; + int closed; + int newlen = 0; + + for (int i = 0; i < length; i += UTF16.getCharCount(codepoint)) { + codepoint = UTF16.charAt(src, 0, length, i); + + /* + * If closure mappings are requested, and a mapping is applied, then case-folding is complete for this codepoint. + */ + + if (closure) { + closed = Normalizer.getFC_NFKC_Closure(codepoint, replacement); + if (closed > 0) { // apply the closure mapping, no need to apply toFullFolding. + System.arraycopy(replacement, 0, dst, newlen, closed); + newlen += closed; + continue; + } + } + + /* + * The UCaseProps toFullFolding has several possible return values for input codepoint x: + * ~x: This means x is already case-folded. + * x <= MAX_STRING_LENGTH (31): This means x maps to multiple codepoints, which are placed in the supplied StringBuffer. + * In this case x represents the length of that sequence. + * x > MAX_STRING_LENGTH: x is the case-folded single-codepoint result. + */ + + folded = caseProps.toFullFolding(codepoint, foldOut, UCharacter.FOLD_CASE_DEFAULT); + + if (folded < 0) { // codepoint is already folded + newlen += UCharacter.toChars(codepoint, dst, newlen); + } else if (folded <= UCaseProps.MAX_STRING_LENGTH) { // codepoint folds to multiple codepoints + foldOut.getChars(0, folded, dst, newlen); + foldOut.setLength(0); + newlen += folded; + } else if (folded < UTF16.SUPPLEMENTARY_MIN_VALUE) { // an attempt to optimize the BMP case. + dst[newlen++] = (char)folded; + } else { + newlen += UCharacter.toChars(folded, dst, newlen); + } + } + + return newlen; + } + +} Index: contrib/icu/src/java/org/apache/lucene/icu/ICUDigitFoldingFilter.java =================================================================== --- contrib/icu/src/java/org/apache/lucene/icu/ICUDigitFoldingFilter.java (revision 0) +++ contrib/icu/src/java/org/apache/lucene/icu/ICUDigitFoldingFilter.java (revision 0) @@ -0,0 +1,131 @@ +package org.apache.lucene.icu; + +import java.io.IOException; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.util.ArrayUtil; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.UTF16; + +/** + * A {@link TokenFilter} that folds numeric digits to their ASCII form. + *+ * There are many different representations of numeric digits for different scripts in Unicode. + * This filter folds numeric digits to their ASCII form. + *
+ * For example, '৭০৬' will be folded to '706' + * + */ + +public class ICUDigitFoldingFilter extends TokenFilter { + + private char buffer[] = new char[4096]; + private TermAttribute termAtt; + + public ICUDigitFoldingFilter(TokenStream input) { + super(input); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + } + + /* + * Algorithm: + * Native digits are typically rare, as are surrogates... + * Fast path is to simply verify the text has no surrogates or digits outside of basic latin. + * + * If a surrogate or native digit (outside of basic latin) character is encountered, bail and proceed down the slower path. + */ + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + final char src[] = termAtt.termBuffer(); + final int length = termAtt.termLength(); + + for (int i = 0; i < length; i++) { + final char ch = src[i]; + + if (ch > 0x7F && (UTF16.isSurrogate(ch) || UCharacter.isDigit(ch))) { + + /* + * char is a surrogate or digit outside of basic latin + * bail out completely and invoke the slow folding algorithm + */ + + if (buffer.length < length) + buffer = new char[ArrayUtil.getNextSize(length)]; + + final int newLength = foldNumerics(src, length, buffer); + + termAtt.setTermBuffer(buffer, 0, newLength); + return true; + } + } + + return true; + } else { + return false; + } + } + + public Token next(final Token reusableToken) throws IOException { + assert reusableToken != null; + + Token nextToken = input.next(reusableToken); + + if (nextToken != null) { + final char src[] = nextToken.termBuffer(); + final int length = nextToken.termLength(); + + for (int i = 0; i < length; i++) { + final char ch = src[i]; + + if (ch > 0x7F && (UTF16.isSurrogate(ch) || UCharacter.isDigit(ch))) { + + /* + * char is a surrogate or digit outside of basic latin + * bail out completely and invoke the slow folding algorithm + */ + + if (buffer.length < length) + buffer = new char[ArrayUtil.getNextSize(length)]; + + final int newLength = foldNumerics(src, length, buffer); + + nextToken.setTermBuffer(buffer, 0, newLength); + return nextToken; + } + } + + return nextToken; + } else { + return null; + } + } + + +/* + * There are no closure issues here. 
+ * For all codepoints, NFKC(foldNumeric(x)) = NFKC(foldNumeric(NFKC(foldNumeric(x)))) + */ + + private int foldNumerics(char src[], int length, char dst[]) { + int codepoint; + int folded; + int newlen = 0; + + for (int i = 0; i < length; i += UTF16.getCharCount(codepoint)) { + codepoint = UTF16.charAt(src, 0, length, i); + + if (UCharacter.isDigit(codepoint)) + folded = UCharacter.forDigit(UCharacter.getNumericValue(codepoint), 10); + else + folded = codepoint; + + newlen += UCharacter.toChars(folded, dst, newlen); + } + return newlen; + } +} Index: contrib/icu/src/java/org/apache/lucene/icu/ICUFormatFilter.java =================================================================== --- contrib/icu/src/java/org/apache/lucene/icu/ICUFormatFilter.java (revision 0) +++ contrib/icu/src/java/org/apache/lucene/icu/ICUFormatFilter.java (revision 0) @@ -0,0 +1,135 @@ +package org.apache.lucene.icu; + +import java.io.IOException; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.util.ArrayUtil; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; + +/** + * A {@link TokenFilter} that removes codepoints that affect the formatting and display of text. + *+ * Some codepoints in Unicode exist only to alter the formatting or display of text. + * This filter removes Format and Variation Selector codepoints. + *
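+ * For example (a sketch; U+200C ZERO WIDTH NON-JOINER carries General_Category=Format):
+ * <pre>
+ *   // the invisible joiner control is removed:
+ *   // "x\u200Cy"  =>  "xy"
+ * </pre>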
+ * TODO: more docs + */ +public class ICUFormatFilter extends TokenFilter { + + private char buffer[] = new char[4096]; + private TermAttribute termAtt; + + public ICUFormatFilter(TokenStream input) { + super(input); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + } + + /* + * Algorithm: + * Format/Variation Selectors are typically rare, as are surrogates... + * Fast path is to simply verify the text has no surrogates or Format/Variation Selectors + * + * If a surrogate or Format/Variation character is encountered, bail and proceed down the slower path. + * + * The basic latin range is excluded because it contains no format codepoints, + * and because its much faster to exclude it than to execute the UnicodeSet.contains() binary search. + * + */ + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + final char src[] = termAtt.termBuffer(); + final int length = termAtt.termLength(); + + for (int i = 0; i < length; i++) { + final char ch = src[i]; + + if (ch > 0x7F && (UTF16.isSurrogate(ch) || formatSet.contains(ch))) { + + /* + * char is a surrogate or format codepoint + * bail out completely and invoke the slow removal algorithm + */ + + if (buffer.length < length) + buffer = new char[ArrayUtil.getNextSize(length)]; + + final int newLength = removeFormat(src, length, buffer); + + termAtt.setTermBuffer(buffer, 0, newLength); + return true; + } + } + + return true; + } else { + return false; + } + } + + public Token next(final Token reusableToken) throws IOException { + assert reusableToken != null; + + Token nextToken = input.next(reusableToken); + + if (nextToken != null) { + final char src[] = nextToken.termBuffer(); + final int length = nextToken.termLength(); + + for (int i = 0; i < length; i++) { + final char ch = src[i]; + + if (ch > 0x7F && (UTF16.isSurrogate(ch) || formatSet.contains(ch))) { + + /* + * char is a surrogate or format codepoint + * bail out completely and invoke the slow removal algorithm + */ + + if (buffer.length < length) + buffer = new char[ArrayUtil.getNextSize(length)]; + + final int newLength = removeFormat(src, length, buffer); + + nextToken.setTermBuffer(buffer, 0, newLength); + return nextToken; + } + } + + return nextToken; + } else { + return null; + } + } + + private static final UnicodeSet formatSet; + + static { + formatSet = new UnicodeSet("[[:General_Category=Format:][:Variation_Selector=True:]]"); + formatSet.compact(); + formatSet.freeze(); + } + +/* + * TODO: validate closure for NFKC/NFKD + */ + private static int removeFormat(char src[], int length, char dst[]) { + int codepoint; + int newlen = 0; + + for (int i = 0; i < length; i += UTF16.getCharCount(codepoint)) { + codepoint = UTF16.charAt(src, 0, length, i); + + if (!formatSet.contains(codepoint)) + newlen += UCharacter.toChars(codepoint, dst, newlen); + } + return newlen; + } +} + Index: contrib/icu/src/java/org/apache/lucene/icu/ICUNormalizationFilter.java =================================================================== --- contrib/icu/src/java/org/apache/lucene/icu/ICUNormalizationFilter.java (revision 0) +++ contrib/icu/src/java/org/apache/lucene/icu/ICUNormalizationFilter.java (revision 0) @@ -0,0 +1,265 @@ +package org.apache.lucene.icu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.util.ArrayUtil; + +import org.apache.lucene.analysis.ASCIIFoldingFilter; // for javadoc only, example of a filter that does not respect canonical equivalence. + +import com.ibm.icu.text.Normalizer; + +/** + * A {@link TokenFilter} that performs Unicode text normalization. + *+ * Normalization standardizes different forms of the same character in Unicode. + * For any search application, it is essential to apply normalization to at least ensure canonical equivalence. + * For example, a Vietnamese input method on one operating system might represent the character ằ as one codepoint: LATIN SMALL LETTER A WITH BREVE AND GRAVE, + * whereas the input method on another operating system might represent the same character as two: LATIN SMALL LETTER A WITH BREVE followed by COMBINING GRAVE ACCENT. + * Unless text is normalized to a standard form, queries and documents from these different systems will not be interpreted as the same character! + *
+ * <p>
+ * There are four modes that text can be normalized to:
+ * <ul>
+ *   <li>NFC: canonical decomposition, followed by canonical composition</li>
+ *   <li>NFD: canonical decomposition</li>
+ *   <li>NFKC: compatibility decomposition, followed by canonical composition</li>
+ *   <li>NFKD: compatibility decomposition</li>
+ * </ul>
+ * For most search tasks, it makes sense to normalize to NFC or NFKC, as the composed form will generally be shorter than the decomposed form.
+ * The decomposed forms can still be useful for some tasks; for example, providing decomposed Korean text to a downstream {@link TokenFilter} would
+ * allow that filter to work with individual Jamo instead of composed Hangul syllables.
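+ * For example (a sketch; the codepoints shown are the NFD decomposition of one Hangul syllable):
+ * <pre>
+ *   // NFD decomposes 한 (U+D55C) into its three Jamo:
+ *   // "\uD55C"  =>  "\u1112\u1161\u11AB"
+ * </pre>
+ * <p>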
+ * <p>
+ * For a typical search application, the way in which text is standardized is the most important consideration, and the two types of standardization are described below.
+ * <p>
+ * Canonical equivalence arises when there are multiple ways to encode the same character or sequence of characters in Unicode.
+ * These differences display the same to a user, but are different to the computer.
+ * <p>
+ * For example, é can be encoded in Unicode in at least two different ways:
+ * <ul>
+ *   <li>U+00E9 (LATIN SMALL LETTER E WITH ACUTE)</li>
+ *   <li>U+0065 U+0301 (LATIN SMALL LETTER E, followed by COMBINING ACUTE ACCENT)</li>
+ * </ul>
+ * Compatibility equivalence is a weaker form of equivalence than canonical equivalence.
+ * Similar to canonical equivalents, compatibility equivalents are different ways to represent the same character.
+ * The difference is that unlike canonical equivalents, compatibility equivalents may have a different visual appearance or format.
+ * <p>
+ * For example, the fullwidth letter Ａ appears different from the letter A because of its width. The below two forms are not canonical equivalents,
+ * but are compatibility equivalents:
+ * <ul>
+ *   <li>U+FF21 (FULLWIDTH LATIN CAPITAL LETTER A)</li>
+ *   <li>U+0041 (LATIN CAPITAL LETTER A)</li>
+ * </ul>
+ * Normalization is computationally expensive and can both reorder characters and change the length of text.
+ * In practice, typically the majority of text is already normalized.
+ * This filter first performs a quick-check, and performs normalization only when this quick-check fails or is uncertain.
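+ * A sketch of that fast path, using the same ICU4J Normalizer calls this filter makes below:
+ * <pre>
+ *   if (Normalizer.quickCheck(buf, 0, len, mode, 0) != Normalizer.YES) {
+ *     // NO or MAYBE: normalization is actually required
+ *     len = Normalizer.normalize(buf, 0, len, out, 0, out.length, mode, 0);
+ *   }
+ * </pre>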
+ *
+ * When designing an analysis pipeline, it is important to minimize the number of times you invoke ICUNormalizationFilter.
+ * At the same time, it is equally important that the analysis process behaves in such a way that all equivalent text is treated the same.
+ * The naïve solution to this problem is to invoke ICUNormalizationFilter both before and after every {@link TokenFilter} in the pipeline.
+ * This ensures that all equivalent text is treated the same and remains normalized, but is very inefficient.
+ *
+ * Instead, the two simple rules below can be followed to minimize the number of invocations:
+ *
+ * If a {@link TokenFilter} does not respect the equivalence defined for the normalization form, ICUNormalizationFilter must be called before that {@link TokenFilter}.
+ *
+ * This way, text is provided to that TokenFilter in a form that it understands, and will be processed correctly.
+ * For example, the {@link ASCIIFoldingFilter} does not respect canonical equivalence: it only folds precomposed character+accent combinations to an accent-free form.
+ * Because of this, the two forms of é listed in the example above will be treated differently; only one will have its accent mark removed!
+ * By invoking ICUNormalizationFilter with NFC first, you can ensure that both are treated the same; both will have their accent marks removed.
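+ * A sketch of that ordering (ASCIIFoldingFilter is the stock Lucene filter discussed above):
+ * <pre>
+ *   TokenStream ts = new ICUNormalizationFilter(input, Normalizer.NFC); // compose accented characters first
+ *   ts = new ASCIIFoldingFilter(ts);                                    // both encodings of é now arrive precomposed
+ * </pre>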
+ *
+ * If a {@link TokenFilter} does not preserve the normalization form, ICUNormalizationFilter must be called at some point after that {@link TokenFilter} before indexing.
+ *
+ * When a {@link TokenFilter} modifies text, it might cause text to become denormalized.
+ * There are a number of ways this can happen; even the concatenation of two normalized chunks of text can produce a denormalized result
+ * (for example, "e" and a combining acute accent are each in NFC on their own, but their concatenation is not, since NFC composes the pair into é).
+ * For example, although the {@link ICUCaseFoldingFilter} respects both canonical and compatibility equivalence, it does not preserve normalization forms.
+ * By the first rule above, because it respects canonical equivalence, the ICUNormalizationFilter need not be invoked before it for normalization form NFC.
+ * But, because it does not preserve normalization form NFC, the ICUNormalizationFilter must be invoked before indexing, or before any downstream TokenFilter that does not
+ * respect canonical equivalence, whichever comes first.
+ *
+ * It is generally more difficult to preserve normalization forms than it is to respect equivalence; respecting equivalence is usually a simple matter of adding additional mappings.
+ * When designing an analysis pipeline, it is recommended that every {@link TokenFilter} respect equivalence, and at the end of the pipeline ICUNormalizationFilter can be invoked a single time.
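+ * A sketch of such a pipeline, built from the components in this patch (the choice of NFKC is illustrative):
+ * <pre>
+ *   TokenStream ts = new ICUTokenizer(reader);
+ *   ts = new ICUFormatFilter(ts);
+ *   ts = new ICUCaseFoldingFilter(ts, Normalizer.NFKC);
+ *   ts = new ICUDigitFoldingFilter(ts);
+ *   ts = new ICUNormalizationFilter(ts, Normalizer.NFKC); // invoked once, at the end
+ * </pre>
+ * <p>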
+ * For more details, see UAX #15: Unicode Normalization Forms + */ + +public class ICUNormalizationFilter extends TokenFilter { + // the mode this normalizer uses + private final Normalizer.Mode mode; + + // normalization output buffer, will be resized if needed. + private char buffer[] = new char[4096]; + + // new api term attribute, will be updated with normalized text if necessary. + private TermAttribute termAtt; + + /** + * Create an ICUNormalizationFilter that normalizes text to the specified mode. + * + * @param input {@link TokenStream} to filter + * @param mode Normalization mode to apply + */ + public ICUNormalizationFilter(TokenStream input, Normalizer.Mode mode) { + super(input); + this.mode = mode; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + } + + public boolean incrementToken() throws IOException { + + /* + * First do a quick-check (this will be the significant majority of text). + * If the text is already normalized, simply return it. + * Otherwise, normalize the text. + */ + + if (input.incrementToken()) { + final char src[] = termAtt.termBuffer(); + final int length = termAtt.termLength(); + + /* + * This quick-check returns three possible values: YES, NO, or MAYBE. + * When it returns YES, the text is already normalized. + * When it returns NO, the text is definitely not normalized. + * When it returns MAYBE, the only way to determine if the text is normalized is to actually normalize it. + * See http://www.unicode.org/unicode/reports/tr15/tr15-23.html#Annex8 + */ + + if (Normalizer.quickCheck(src, 0, length, mode, 0) == Normalizer.YES) + return true; + + /* + * There are known maximum expansions for the different forms that could remove the loop/exception handling below. + * These may change in new versions of the Unicode standard, and are sometimes large. + * The loop is for simplicity and ease of maintenance; with a large default buffer size it should rarely execute more than once. + * + * From http://unicode.org/reports/tr36/tr36-6.html#Buffer_Overflows: + * The very large factors in the case of NFKC/D are due to some extremely rare characters. + * Thus algorithms can use much smaller expansion factors for the typical cases as long as they have + * a fallback process that accounts for the possibility of these characters in data. + * + * For example, under normalization forms NFKC or NFKD, ﷺ (FDFA, ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM) will be expanded to صلى الله عليه وسلم + */ + + do { + try { + + /* + * This method is documented in the public API to throw IndexOutOfBoundsException if there is not enough space. + * Its an unfortunate mechanism, it would be a lot nicer if instead it behaved like the ArabicShaping API, whereas + * instead it would return the necessary length, possibly more than the buffer supplied. + * This would simplify things, instead a call with a 0-length output buffer would return the necessary length. + */ + + final int newLength = Normalizer.normalize(src, 0, length, buffer, 0, buffer.length, mode, 0); + termAtt.setTermBuffer(buffer, 0, newLength); + return true; + } catch (IndexOutOfBoundsException e) { + // technically, ICU encodes the necessary size as a String in the exception, but don't depend on that... + buffer = new char[ArrayUtil.getNextSize(buffer.length << 1)]; + } + } while (true); + } else { + return false; + } + } + + public Token next(final Token reusableToken) throws IOException { + assert reusableToken != null; + + /* + * First do a quick-check (this will be the significant majority of text). 
+ * If the text is already normalized, simply return it. + * Otherwise, normalize the text. + */ + + Token nextToken = input.next(reusableToken); + if (nextToken != null) { + final char src[] = nextToken.termBuffer(); + final int length = nextToken.termLength(); + + /* + * This quick-check returns three possible values: YES, NO, or MAYBE. + * When it returns YES, the text is already normalized. + * When it returns NO, the text is definitely not normalized. + * When it returns MAYBE, the only way to determine if the text is normalized is to actually normalize it. + * See http://www.unicode.org/unicode/reports/tr15/tr15-23.html#Annex8 + */ + + if (Normalizer.quickCheck(src, 0, length, mode, 0) == Normalizer.YES) + return nextToken; + + /* + * There are known maximum expansions for the different forms that could remove the loop/exception handling below. + * These may change in new versions of the Unicode standard, and are sometimes large. + * The loop is for simplicity and ease of maintenance; with a large default buffer size it should rarely execute more than once. + * + * From http://unicode.org/reports/tr36/tr36-6.html#Buffer_Overflows: + * The very large factors in the case of NFKC/D are due to some extremely rare characters. + * Thus algorithms can use much smaller expansion factors for the typical cases as long as they have + * a fallback process that accounts for the possibility of these characters in data. + * + * For example, under normalization forms NFKC or NFKD, ﷺ (FDFA, ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM) will be expanded to صلى الله عليه وسلم + */ + + do { + try { + + /* + * This method is documented in the public API to throw IndexOutOfBoundsException if there is not enough space. + * Its an unfortunate mechanism, it would be a lot nicer if instead it behaved like the ArabicShaping API, whereas + * instead it would return the necessary length, possibly more than the buffer supplied. + * This would simplify things, instead a call with a 0-length output buffer would return the necessary length. + */ + + final int newLength = Normalizer.normalize(src, 0, length, buffer, 0, buffer.length, mode, 0); + nextToken.setTermBuffer(buffer, 0, newLength); + return nextToken; + } catch (IndexOutOfBoundsException e) { + // technically, ICU encodes the necessary size as a String in the exception, but don't depend on that... + buffer = new char[ArrayUtil.getNextSize(buffer.length << 1)]; + } + } while (true); + } else { + return null; + } + } +} Index: contrib/icu/src/java/org/apache/lucene/icu/package.html =================================================================== --- contrib/icu/src/java/org/apache/lucene/icu/package.html (revision 0) +++ contrib/icu/src/java/org/apache/lucene/icu/package.html (revision 0) @@ -0,0 +1,5 @@ + + +An analyzer and set of analysis components for Unicode text, based on ICU. + + \ No newline at end of file Index: contrib/icu/src/java/org/apache/lucene/icu/tokenizer/BreakIteratorWrapper.java =================================================================== --- contrib/icu/src/java/org/apache/lucene/icu/tokenizer/BreakIteratorWrapper.java (revision 0) +++ contrib/icu/src/java/org/apache/lucene/icu/tokenizer/BreakIteratorWrapper.java (revision 0) @@ -0,0 +1,177 @@ +package org.apache.lucene.icu.tokenizer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.CharacterIterator; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.DictionaryBasedBreakIterator; +import com.ibm.icu.text.RuleBasedBreakIterator; +import com.ibm.icu.text.UTF16; + +/** + * Contain all the issues surrounding BreakIterators in ICU in one place. + * Basically this boils down to the fact that they aren't very friendly to any sort of OO design. + * + * http://bugs.icu-project.org/trac/ticket/5901: + * RBBI.getRuleStatus(), hoist to BreakIterator from RuleBasedBreakIterator + * + * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, + * but doesn't actually behave as a subclass: it always returns 0 for getRuleStatus() + * + * http://bugs.icu-project.org/trac/ticket/4730: + * Thai RBBI, no boundary type tags + * + */ +abstract class BreakIteratorWrapper { + protected final CharArrayIterator textIterator = new CharArrayIterator(); + protected char text[]; + protected int start; + protected int length; + + abstract int next(); + abstract int current(); + abstract int getRuleStatus(); + abstract void setText(CharacterIterator text); + + void setText(char text[], int start, int length) { + this.text = text; + this.start = start; + this.length = length; + textIterator.setText(text, start, length); + setText(textIterator); + } + + /* + * If its a DictionaryBasedBreakIterator, it doesn't return rulestatus, so treat it like a generic BreakIterator + * If its any other RuleBasedBreakIterator, the rule status can be used for token type. + * If its any other BreakIterator, the rulestatus method is not available, so treat it like a generic BreakIterator. + */ + static BreakIteratorWrapper wrap(BreakIterator breakIterator) { + if (breakIterator == null) + return null; + else if (breakIterator instanceof RuleBasedBreakIterator && + !(breakIterator instanceof DictionaryBasedBreakIterator)) + return new RBBIWrapper((RuleBasedBreakIterator) breakIterator); + else + return new BIWrapper(breakIterator); + } + + /* + * RuleBasedBreakIterator wrapper: + * RuleBasedBreakIterator (as long as its not a DictionaryBasedBreakIterator) behaves correctly. + */ + static final class RBBIWrapper extends BreakIteratorWrapper { + private final RuleBasedBreakIterator rbbi; + + RBBIWrapper(RuleBasedBreakIterator rbbi) { + this.rbbi = rbbi; + } + + int current() { + return rbbi.current(); + } + + int getRuleStatus() { + return rbbi.getRuleStatus(); + } + + int next() { + return rbbi.next(); + } + + void setText(CharacterIterator text) { + rbbi.setText(text); + } + } + + /* + * Generic BreakIterator wrapper: + * Either the rulestatus method is not available or always returns 0. + * Calculate a rulestatus here so it behaves like RuleBasedBreakIterator. + * + * Note: This is slower than RuleBasedBreakIterator. 
+ */
+  static final class BIWrapper extends BreakIteratorWrapper {
+    private final BreakIterator bi;
+    private int status;
+
+    BIWrapper(BreakIterator bi) {
+      this.bi = bi;
+    }
+
+    int current() {
+      return bi.current();
+    }
+
+    int getRuleStatus() {
+      return status;
+    }
+
+    int next() {
+      int current = bi.current();
+      int next = bi.next();
+      status = calcStatus(current, next);
+      return next;
+    }
+
+    private int calcStatus(int current, int next) {
+      if (current == BreakIterator.DONE || next == BreakIterator.DONE)
+        return RuleBasedBreakIterator.WORD_NONE;
+
+      int begin = start + current;
+      int end = start + next;
+
+      /*
+       * TODO: Consider optimizing BMP with this idiom:
+       * http://www.icu-project.org/docs/papers/supplementaries_iuc21.ppt
+       *
+       * for (int i = 0; i < s.length(); ++i) {
+       *   int c = s.charAt(i);
+       *   if (0xD800 <= c && c <= 0xDBFF) {
+       *     c = UTF16.charAt(s, i);
+       *     i += UTF16.getCharCount(c) - 1;
+       *   }
+       *   if (UCharacter.isLetter(c)) {
+       *     doSomething(c);
+       *   }
+       * }
+       */
+
+      int codepoint;
+      for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
+        // examine the codepoint at the loop index (not at 'begin', which would re-read the first codepoint on every pass)
+        codepoint = UTF16.charAt(text, 0, end, i);
+
+        if (UCharacter.isDigit(codepoint))
+          return RuleBasedBreakIterator.WORD_NUMBER;
+        else if (UCharacter.isLetter(codepoint)) {
+          // TODO try to separately specify ideographic, kana? [currently all bundled as letter for this case]
+          return RuleBasedBreakIterator.WORD_LETTER;
+        }
+      }
+
+      return RuleBasedBreakIterator.WORD_NONE;
+    }
+
+    void setText(CharacterIterator text) {
+      bi.setText(text);
+      status = RuleBasedBreakIterator.WORD_NONE;
+    }
+  }
+}
Index: contrib/icu/src/java/org/apache/lucene/icu/tokenizer/CharArrayIterator.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/tokenizer/CharArrayIterator.java (revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/tokenizer/CharArrayIterator.java (revision 0)
@@ -0,0 +1,104 @@
+package org.apache.lucene.icu.tokenizer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +import java.text.CharacterIterator; + +/** + * Wraps a char[] as CharacterIterator for processing with a BreakIterator + * + */ +final class CharArrayIterator implements CharacterIterator { + private char array[]; + private int start; + private int index; + private int length; + private int limit; + + CharArrayIterator() { + } + + /** + * Set a new region of text to be examined by this iterator + * @param array text buffer to examine + * @param start offset into buffer + * @param length maximum length to examine + */ + void setText(final char array[], int start, int length) { + this.array = array; + this.start = start; + this.index = start; + this.length = length; + this.limit = start + length; + } + + public char current() { + return (index == limit) ? DONE : array[index]; + } + + public char first() { + index = start; + return current(); + } + + public int getBeginIndex() { + return 0; + } + + public int getEndIndex() { + return length; + } + + public int getIndex() { + return index - start; + } + + public char last() { + index = (limit == start) ? limit : limit - 1; + return current(); + } + + public char next() { + if (++index >= limit) { + index = limit; + return DONE; + } else { + return current(); + } + } + + public char previous() { + if (--index < start) + index = start; + return current(); + } + + public char setIndex(int position) { + if (position < getBeginIndex() || position > getEndIndex()) + throw new IllegalArgumentException("Illegal Position: " + position); + index = start + position; + return current(); + } + + public Object clone() { + CharArrayIterator clone = new CharArrayIterator(); + clone.setText(array, start, length); + clone.index = index; + return clone; + } +} \ No newline at end of file Index: contrib/icu/src/java/org/apache/lucene/icu/tokenizer/CompositeBreakIterator.java =================================================================== --- contrib/icu/src/java/org/apache/lucene/icu/tokenizer/CompositeBreakIterator.java (revision 0) +++ contrib/icu/src/java/org/apache/lucene/icu/tokenizer/CompositeBreakIterator.java (revision 0) @@ -0,0 +1,125 @@ +package org.apache.lucene.icu.tokenizer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Iterator; +import java.util.Map; + +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.BreakIterator; + +/** + * An internal BreakIterator for multilingual text, following recommendations from: + * UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/) + * + * This Break Iterator can be customized in up to two ways: + * 1. Providing a custom default BreakIterator. + * 2. Providing a mapping of BreakIterators that work with specific scripts. 
+ * + * See http://unicode.org/reports/tr29/#Tailoring for the motivation of this design + * + * Text is first divided into script boundaries. + * The processing is then delegated to the appropriate break iterator for that specific script. + * If no script-specific break iterator is available, the Unicode default bounds properties are used. + * + * This break iterator also allows you to retrieve the ISO 15924 script code associated with a piece of text. + * + * See also UAX #29, UTR #24 + * + */ +final class CompositeBreakIterator { + private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT]; + private BreakIteratorWrapper rbbi; + private final ScriptIterator scriptIterator = new ScriptIterator(); + + private char text[]; + + CompositeBreakIterator(BreakIterator rbbi, Map scriptHandlers) { + // the default word breaker + BreakIteratorWrapper defaultBreaker = BreakIteratorWrapper.wrap(rbbi); + for (int i = 0; i < wordBreakers.length; i++) + wordBreakers[i] = defaultBreaker; + + // register any custom script-specific word-breakers + for (Iterator iterator = scriptHandlers.entrySet().iterator(); iterator.hasNext();) { + Map.Entry handler = (Map.Entry) iterator.next(); + Integer scriptCode = (Integer) handler.getKey(); + BreakIterator scriptHandler = (BreakIterator) handler.getValue(); + wordBreakers[scriptCode.intValue()] = BreakIteratorWrapper.wrap(scriptHandler); + } + } + + /** + * Retrieve the next break position. + * If the RBBI range is exhausted within the script boundary, examine the next script boundary. + * @return the next break position or BreakIterator.DONE + */ + int next() { + int next = rbbi.next(); + while (next == BreakIterator.DONE && scriptIterator.next()) { + rbbi = wordBreakers[scriptIterator.getScriptCode()]; + rbbi.setText(text, scriptIterator.getScriptStart(), scriptIterator.getScriptLimit() - scriptIterator.getScriptStart()); + next = rbbi.next(); + } + return (next == BreakIterator.DONE) ? BreakIterator.DONE : next + scriptIterator.getScriptStart(); + } + + /** + * Retrieve the current break position. + * @return the current break position or BreakIterator.DONE + */ + int current() { + final int current = rbbi.current(); + return (current == BreakIterator.DONE) ? BreakIterator.DONE : current + scriptIterator.getScriptStart(); + } + + /** + * Retrieve the rule status code (token type) from the underlying break iterator + * @return rule status code (see RuleBasedBreakIterator constants) + */ + int getRuleStatus() { + return rbbi.getRuleStatus(); + } + + /** + * Retrieve the UScript script code for the current token. + * This code can be decoded with UScript into a name or ISO 15924 code. + * @return UScript script code for the current token. 
+ */ + int getScriptCode() { + return scriptIterator.getScriptCode(); + } + + /** + * Set a new region of text to be examined by this iterator + * @param text buffer of text + * @param start offset into buffer + * @param length maximum length to examine + */ + void setText(final char text[], int start, int length) { + this.text = text; + scriptIterator.setText(text, start, length); + if (scriptIterator.next()) { + rbbi = wordBreakers[scriptIterator.getScriptCode()]; + rbbi.setText(text, scriptIterator.getScriptStart(), scriptIterator.getScriptLimit() - scriptIterator.getScriptStart()); + } else { + rbbi = wordBreakers[UScript.COMMON]; + rbbi.setText(text, 0, 0); + } + } +} Index: contrib/icu/src/java/org/apache/lucene/icu/tokenizer/ICUTokenizer.java =================================================================== --- contrib/icu/src/java/org/apache/lucene/icu/tokenizer/ICUTokenizer.java (revision 0) +++ contrib/icu/src/java/org/apache/lucene/icu/tokenizer/ICUTokenizer.java (revision 0) @@ -0,0 +1,293 @@ +package org.apache.lucene.icu.tokenizer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.RuleBasedBreakIterator; +import com.ibm.icu.util.ULocale; + +/** + * Breaks text into words according to UAX #29: Unicode Text Segmentation (http://www.unicode.org/reports/tr29/) + *+ * Words are broken across script boundaries and unicode boundaries based upon their unicode properties. + *
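+ * For example (a sketch; the Thai split relies on the dictionary-based break iterator registered by default):
+ * <pre>
+ *   // "testing ภาษาไทย 1234"  =>  "testing" (WORD), "ภาษา" (WORD), "ไทย" (WORD), "1234" (NUM)
+ * </pre>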
+ * TODO: more docs + */ +public class ICUTokenizer extends Tokenizer { + /** + * Token type for words containing ideographic characters + */ + public static final String WORD_IDEO = "IDEO"; + /** + * Token type for words containing kana characters + */ + public static final String WORD_KANA = "KANA"; + /** + * Token type for words that contain letters, excluding hiragana, katakana, or ideographic characters. + */ + public static final String WORD_LETTER = "WORD"; + /** + * Token type for words that appear to be numbers. + */ + public static final String WORD_NUMBER = "NUM"; + + private static final int IOBUFFER = 4096; + private final char buffer[] = new char[IOBUFFER]; + private int length = 0; /* true length of text in the buffer */ + private int usableLength = 0; /* length of text in the buffer that can be evaluated safely, up to a safe end point */ + private int offset = 0; /* accumulated offset of previous buffers for this Reader, for correct term offsets */ + + private final CompositeBreakIterator breaker; /* tokenizes a char[] of text */ + + private OffsetAttribute offsetAtt; + private TermAttribute termAtt; + private TypeAttribute typeAtt; + private FlagsAttribute flagsAtt; + private PositionIncrementAttribute posIncAtt; + + /* + * the two default breakiterators in use. + * these can be expensive to instantiate, cheap to clone. + */ + private static final BreakIterator rootBreakIterator = BreakIterator.getWordInstance(ULocale.ROOT); + private static final BreakIterator thaiBreakIterator = BreakIterator.getWordInstance(new ULocale("th_TH")); + + /** + * Construct a new ICUTokenizer that breaks text into words from the given Reader. + *+ * Text will be broken into words by the Unicode text segmentation algorithm: BreakIterator.getWordInstance(ULocale.ROOT) + *
+ * <p>
+ * The default script-specific handling is used: specifically, Thai text is broken into words with a Thai DictionaryBasedBreakIterator.
+ *
+ * @param input Reader containing text to tokenize.
+ */
+ public ICUTokenizer(Reader input) {
+ this(input, null, null);
+ }
+
+ /**
+ * Construct a new ICUTokenizer that breaks text into words from the given Reader, using a tailored default BreakIterator.
+ * <p>
+ * The default script-specific handling is used: specifically, Thai text is broken into words with a Thai DictionaryBasedBreakIterator.
+ *
+ * @param input Reader containing text to tokenize. + * @param defaultBreakIterator Tailored BreakIterator that breaks text into words. + */ + public ICUTokenizer(Reader input, BreakIterator defaultBreakIterator) { + this(input, defaultBreakIterator, null); + } + + + /** + * Construct a new ICUTokenizer that breaks text into words from the given Reader, using a tailored default BreakIterator and a custom mapping of script-specific BreakIterators + * + * @param input Reader containing text to tokenize. + * @param defaultBreakIterator Tailored BreakIterator that breaks text into words. + * @param scriptHandlers A {@link Map} mapping UScript codes to BreakIterators for script-specific handling. + */ + public ICUTokenizer(Reader input, BreakIterator defaultBreakIterator, Map scriptHandlers) { + super(input); + + if (defaultBreakIterator == null) + defaultBreakIterator = (BreakIterator) rootBreakIterator.clone(); + + if (scriptHandlers == null) { + /* Add Thai DBBI as handler for the Thai script */ + scriptHandlers = new HashMap(); + scriptHandlers.put(new Integer(UScript.THAI), (BreakIterator) thaiBreakIterator.clone()); + } + + breaker = new CompositeBreakIterator(defaultBreakIterator, scriptHandlers); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); + flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class); + posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + } + + /** + * Return the token's lexical type. + *+ * For a tailored ruleset, you may want to override this method to return additional types. + *
+ * @return String containing the token type + */ + protected String getType() { + switch(breaker.getRuleStatus()) { + case RuleBasedBreakIterator.WORD_IDEO: return ICUTokenizer.WORD_IDEO; + case RuleBasedBreakIterator.WORD_KANA: return ICUTokenizer.WORD_KANA; + case RuleBasedBreakIterator.WORD_LETTER: return ICUTokenizer.WORD_LETTER; + case RuleBasedBreakIterator.WORD_NUMBER: return ICUTokenizer.WORD_NUMBER; + default: /* RuleBasedBreakIterator.WORD_NONE, not possible */ + return "NONE"; + } + } + + public boolean incrementToken() throws IOException { + if (length == 0) + refill(); + while (!incrementTokenBuffer()) { + refill(); + if (length <= 0) // no more bytes to read; + return false; + } + return true; + } + + public Token next(Token reusableToken) throws IOException { + if (length == 0) + refill(); + Token result = null; + while ((result = nextBuffer(reusableToken)) == null) { + refill(); + if (length <= 0) // no more bytes to read + return null; + } + return result; + } + + public void reset(Reader input) throws IOException { + super.reset(input); + breaker.setText(buffer, 0, 0); + length = usableLength = offset = 0; + } + + /* + * The method is as follows: + * + * The ICU RBBI implementation for the ROOT locale is used. + * This tokenizes text based upon the longest matching rule, and because of this, isn't friendly to a Reader. + * + * Text is read from the input stream in 4kB chunks. + * Within a 4kB chunk of text, the last unambiguous break point is found (in this implementation: white space character) + * Any remaining characters represent possible partial words, so are appended to the front of the next chunk. + * + * There is the possibility that there are no unambiguous break points within an entire 4kB chunk of text (binary data) + * Currently, this could cause what should really be an 8kB word to be parsed as two 4kB words instead. + * This keeps the code simple, but other possibilities could be: + * A. increasing the buffer size and reading more data, hoping to find an unambiguous boundary, at the risk of OOM error. + * B. discarding the chunk entirely and instead looking for a set of unambiguous start-end boundaries in future text. + * + * Option B is closer to what other Lucene analyzers do, but would increase code complexity significantly. + * + */ + + /** + * Returns the last unambiguous break position in the text. + * @return position of character, or -1 if one does not exist + */ + private int findSafeEnd() { + for (int i = length - 1; i >= 0; i--) + if (UCharacter.isWhitespace(buffer[i])) + return i + 1; + return -1; + } + + /** + * Refill the buffer, accumulating the offset and setting usableLength to the last unambiguous break position + * @throws IOException + */ + private void refill() throws IOException { + offset += usableLength; + int leftover = length - usableLength; + System.arraycopy(buffer, usableLength, buffer, 0, leftover); + int requested = buffer.length - leftover; + int returned = input.read(buffer, leftover, requested); + length = returned < 0 ? 
leftover : returned + leftover; + if (returned < requested) /* reader has been emptied, process the rest */ + usableLength = length; + else { /* still more data to be read, find a safe-stopping place */ + usableLength = findSafeEnd(); + if (usableLength < 0) + usableLength = length; /* more than IOBUFFER of text without space, gonna possibly truncate tokens */ + } + + if (usableLength > 0) + breaker.setText(buffer, 0, usableLength); + } + + /* + * return true if there is a token from the buffer, or null if it is exhausted. + */ + private boolean incrementTokenBuffer() { + int start = breaker.current(); + if (start == BreakIterator.DONE) + return false; // BreakIterator exhausted + + // find the next set of boundaries, skipping over non-tokens (rule status 0) + int end = breaker.next(); + while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) { + start = end; + end = breaker.next(); + } + + if (start == BreakIterator.DONE) + return false; // BreakIterator exhausted + + int length = end - start; + termAtt.setTermBuffer(buffer, start, length); + offsetAtt.setOffset(offset + start, offset + end); + typeAtt.setType(getType()); + flagsAtt.setFlags(breaker.getScriptCode()); + + return true; + } + + /* + * return the next token from the buffer, or null if it is exhausted. + */ + private Token nextBuffer(final Token reusableToken) { + int start = breaker.current(); + if (start == BreakIterator.DONE) + return null; // BreakIterator exhausted + + // find the next set of boundaries, skipping over non-tokens (rule status 0) + int end = breaker.next(); + while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) { + start = end; + end = breaker.next(); + } + + if (start == BreakIterator.DONE) + return null; // BreakIterator exhausted + + final int length = end - start; + + reusableToken.reinit(buffer, start, length, offset + start, offset + end, getType()); + reusableToken.setFlags(breaker.getScriptCode()); + + return reusableToken; + } + +} Index: contrib/icu/src/java/org/apache/lucene/icu/tokenizer/ScriptIterator.java =================================================================== --- contrib/icu/src/java/org/apache/lucene/icu/tokenizer/ScriptIterator.java (revision 0) +++ contrib/icu/src/java/org/apache/lucene/icu/tokenizer/ScriptIterator.java (revision 0) @@ -0,0 +1,170 @@ +package org.apache.lucene.icu.tokenizer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.UTF16; + +/** + * An iterator that locates ISO 15924 script boundaries in text. + * This is not the same as simply looking at the Unicode block, or even the Script property. 
+ * Some characters are 'common' across multiple scripts, and some 'inherit' the script value of text surrounding them. + * + * This is similar to ICU (internal-only) UScriptRun, with the following differences: + * + * Doesn't attempt to match paired punctuation. For tokenization purposes, this is not necessary. It's also quite expensive. + * Non-spacing marks inherit the script of their base character, following recommendations from UTR #24. + * + * TODO: Runs of Han ideographs and Hiragana/Katakana need to be mapped to Japanese script. + * TODO: Runs of Han ideographs and Hangul need to be mapped to Korean script. + */ +final class ScriptIterator { + private char text[]; + private int start; + private int limit; + private int index; + + private int scriptStart; + private int scriptLimit; + private int scriptCode; + + /** + * Get the start of this script run + * @return start position of script run + */ + int getScriptStart() { + return scriptStart; + } + + /** + * Get the index of the first character after the end of this script run + * @return position of the first character after this script run + */ + int getScriptLimit() { + return scriptLimit; + } + + /** + * Get the UScript script code for this script run + * @return code for the script of the current run + */ + int getScriptCode() { + return scriptCode; + } + + /** + * Iterates to the next script run, returning true if one exists. + * @return true if there is another script run, false otherwise. + */ + boolean next() { + if (scriptLimit >= limit) + return false; + + scriptCode = UScript.COMMON; + scriptStart = scriptLimit; + + /* + * TODO: Consider optimizing BMP with this idiom: + * http://www.icu-project.org/docs/papers/supplementaries_iuc21.ppt + * + * for (int i = 0; i < s.length(); ++i) { + * int c = s.charAt(i); + * if (0xD800 <= c && c <= 0xDBFF) { + * c = UTF16.charAt(s, i); + * i += UTF16.getCharCount(c) - 1; + * } + * if (UCharacter.isLetter(c)) { + * doSomething(c); + * } + * } + */ + + while (index < limit) { + final int ch = UTF16.charAt(text, start, limit, index - start); + final int sc = getScript(ch); + + /* + * From UTR #24: + * Implementations that determine the boundaries between characters of given scripts should never break between a non-spacing mark + * and its base character. Thus for boundary determinations and similar sorts of processing, a non-spacing mark — whatever its script + * value — should inherit the script value of its base character. + */ + + if (isSameScript(scriptCode, sc) || UCharacter.getType(ch) == UCharacter.NON_SPACING_MARK) { + index += UTF16.getCharCount(ch); + + /* + * Inherited or Common becomes the script code of the surrounding text. + */ + + if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) { + scriptCode = sc; + } + + } else { + break; + } + } + + scriptLimit = index; + return true; + } + + /* + * Determine if two scripts are compatible.
+ */ + private static boolean isSameScript(int scriptOne, int scriptTwo) { + return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED || scriptOne == scriptTwo; + } + + /** + * Set a new region of text to be examined by this iterator + * @param text text buffer to examine + * @param start offset into buffer + * @param length maximum length to examine + */ + void setText(char text[], int start, int length) { + this.text = text; + this.start = start; + this.index = start; + this.limit = start + length; + this.scriptStart = start; + this.scriptLimit = start; + this.scriptCode = UScript.INVALID_CODE; + } + + /* linear array access fast-path for basic latin case, greatest 128 ints ever spent */ + private static final int basicLatin[] = new int[128]; + + static { + for (int i = 0; i < basicLatin.length; i++) + basicLatin[i] = UScript.getScript(i); + } + + /* + * An accelerated version of UScript.getScript() + * Basic latin is an array lookup. + */ + private static int getScript(int codepoint) { + if (0 <= codepoint && codepoint < basicLatin.length) + return basicLatin[codepoint]; + else + return UScript.getScript(codepoint); + } +} \ No newline at end of file Index: contrib/icu/src/java/org/apache/lucene/icu/tokenizer/package.html =================================================================== --- contrib/icu/src/java/org/apache/lucene/icu/tokenizer/package.html (revision 0) +++ contrib/icu/src/java/org/apache/lucene/icu/tokenizer/package.html (revision 0) @@ -0,0 +1,5 @@ + + +Tokenizer that breaks text into words with the Unicode Text Segmentation algorithm. + + \ No newline at end of file Index: contrib/icu/src/java/org/apache/lucene/icu/transform/ICUTransformFilter.java =================================================================== --- contrib/icu/src/java/org/apache/lucene/icu/transform/ICUTransformFilter.java (revision 0) +++ contrib/icu/src/java/org/apache/lucene/icu/transform/ICUTransformFilter.java (revision 0) @@ -0,0 +1,174 @@ +package org.apache.lucene.icu.transform; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + +import org.apache.lucene.icu.ICUNormalizationFilter; // only used for javadoc. + +import com.ibm.icu.text.RuleBasedTransliterator; // only used for optimizing the transform, see below +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UnicodeSet; + +/** + * A {@link TokenFilter} that transforms text with ICU. + *+ * ICU provides text-transformation functionality via its Transliteration API. 
+ * Although script conversion is its most common use, a transliterator can actually perform a more general class of tasks. + * In fact, Transliterator defines a very general API which specifies only that a segment of the input text is replaced by new text. + * The particulars of this conversion are determined entirely by subclasses of Transliterator. + *
+ *
+ * Some useful transformations for search are built-in, for example:
+ * Traditional-Simplified (Chinese script conversion),
+ * Katakana-Hiragana,
+ * Fullwidth-Halfwidth, and
+ * Any-Latin (conversion into Latin script).
+ *
+ * Example usage: + *
stream = new ICUTransformFilter(stream, Transliterator.getInstance("Traditional-Simplified"));+ * + *
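+ * A custom rule-based transform can be supplied the same way; a minimal sketch with hypothetical rules (the rule syntax is described in the ICU User Guide): + * + * Transliterator custom = Transliterator.createFromRules("Custom", "a > b; b > c;", Transliterator.FORWARD); + * stream = new ICUTransformFilter(stream, custom); + *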
+ *
+ * Whether this filter respects equivalence or preserves normalization forms depends entirely upon the ruleset being applied. +
+ *
+ * For good performance, it is helpful to declare a filter in any custom transform you build. + * This allows the transform to efficiently skip over unaffected text. + * It is also useful to consider whether there are simpler solutions. + * For example, if you want to standardize Fullwidth and Halfwidth forms, + * using {@link ICUNormalizationFilter} with compatibility decomposition will erase width differences, with better performance. +
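+ * For example, a filter can be declared at the top of a custom ruleset; a minimal sketch, assuming the standard ICU filter-rule syntax, where only 'a' and 'b' are eligible for conversion: + * + * :: [ab] ; + * a > b ; + * b > c ; +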
+ * For more details, see the ICU User Guide. + */ + +public class ICUTransformFilter extends TokenFilter { + // Transliterator to transform the text + private final Transliterator transform; + + // Reusable position object + private final Transliterator.Position position = new Transliterator.Position(); + + // Wraps a token around the replaceable interface. + private final ReplaceableToken replaceableToken = new ReplaceableToken(); + + // Wraps a termAttribute around the replaceable interface. + private final ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute(); + + // new api term attribute, will be updated with transformed text. + private TermAttribute termAtt; + + /** + * Create a new ICUTransformFilter that transforms text on the given stream. + * + * @param input {@link TokenStream} to filter. + * @param transform Transliterator to transform the text. + */ + public ICUTransformFilter(TokenStream input, Transliterator transform) { + super(input); + this.transform = transform; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + + /* + * A good UnicodeFilter is vital for performance. Unfortunately, sometimes people omit filters in their rulesets. + * However, in the special case that the transform is a RuleBasedTransliterator, this situation can be corrected. + * It can only be applied to a pure RuleBasedTransliterator, and it is only applied when there is no supplied filter. + * + * For a great example of a ruleset like this, see the built-in Simplified/Traditional ruleset from CLDR. + * This is a massive performance optimization for that case! + * + * If CompoundTransliterator and its children were exposed (it's package-private and its children are inaccessible), + * then more cases could be optimized. + * + * Regardless of who wrote the rules, you can ALWAYS apply a filter on your own: + * Transliterator.getInstance("[:Arabic:] UnfilteredTransformThatOnlyProcessesTheArabicBlock"); + * + * Just be careful to ensure you don't filter characters that should be converted! + * This can be tricky if, for example, the transliterator internally invokes ::NFKC(). + */ + + if (transform.getFilter() == null && (transform instanceof RuleBasedTransliterator)) { + final UnicodeSet sourceSet = transform.getSourceSet(); + if (sourceSet != null && !sourceSet.isEmpty()) + transform.setFilter(sourceSet); + } + } + + public boolean incrementToken() throws IOException { + + /* + * Wrap the TermAttribute around the replaceable interface, clear the positions, and transliterate. + * Finally, update the TermAttribute with the [potentially different] length. + */ + + if (input.incrementToken()) { + final int length = termAtt.termLength(); + replaceableAttribute.setText(termAtt); + + position.start = 0; + position.limit = length; + position.contextStart = 0; + position.contextLimit = length; + + transform.filteredTransliterate(replaceableAttribute, position, false); + termAtt.setTermLength(replaceableAttribute.length()); + return true; + } else { + return false; + } + } + + public Token next(final Token reusableToken) throws IOException { + assert reusableToken != null; + + /* + * Wrap the Token around the replaceable interface, clear the positions, and transliterate. + * Finally, update the Token with the [potentially different] length.
+ */ + + Token nextToken = input.next(reusableToken); + if (nextToken != null) { + final int length = reusableToken.termLength(); + replaceableToken.setText(reusableToken); + + position.start = 0; + position.limit = length; + position.contextStart = 0; + position.contextLimit = length; + + transform.filteredTransliterate(replaceableToken, position, false); + reusableToken.setTermLength(replaceableToken.length()); + return reusableToken; + } else { + return null; + } + } + +} Index: contrib/icu/src/java/org/apache/lucene/icu/transform/ReplaceableTermAttribute.java =================================================================== --- contrib/icu/src/java/org/apache/lucene/icu/transform/ReplaceableTermAttribute.java (revision 0) +++ contrib/icu/src/java/org/apache/lucene/icu/transform/ReplaceableTermAttribute.java (revision 0) @@ -0,0 +1,95 @@ +package org.apache.lucene.icu.transform; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.util.ArrayUtil; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.Replaceable; +import com.ibm.icu.text.UTF16; + +/** + * Wrap a {@link TermAttribute} with the Replaceable API. + * + * This allows for ICU transforms to run without unnecessary object creation. + * + * This wrapper does not keep the TermAttribute's length up to date at all times; when you are done, you must + * finalize the replacement process by setting the TermAttribute's length.
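+ * + * Typical usage, mirroring ICUTransformFilter's incrementToken(): + * + * replaceableAttribute.setText(termAtt); + * transform.filteredTransliterate(replaceableAttribute, position, false); + * termAtt.setTermLength(replaceableAttribute.length());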
+ */ + +final class ReplaceableTermAttribute implements Replaceable { + private char buffer[]; + private int length; + private TermAttribute token; + + ReplaceableTermAttribute() { + } + + void setText(final TermAttribute reusableToken) { + this.token = reusableToken; + this.buffer = reusableToken.termBuffer(); + this.length = reusableToken.termLength(); + } + + public int char32At(int pos) { + return UTF16.charAt(buffer, 0, length, pos); + } + + public char charAt(int pos) { + return buffer[pos]; + } + + public void copy(int start, int limit, int dest) { + char text[] = new char[limit - start]; + getChars(start, limit, text, 0); + replace(dest, dest, text, 0, limit - start); + } + + public void getChars(int srcStart, int srcLimit, char[] dst, int dstStart) { + System.arraycopy(buffer, srcStart, dst, dstStart, srcLimit - srcStart); + } + + public boolean hasMetaData() { + return false; + } + + public int length() { + return length; + } + + public void replace(int start, int limit, String text) { + replace(start, limit, text.toCharArray(), 0, text.length()); + } + + public void replace(int start, int limit, char[] text, int charsStart, int charsLen) { + final int replacementLength = limit - start; + final int newLength = length - replacementLength + charsLen; + // resize if necessary + if (newLength > length) + buffer = token.resizeTermBuffer(ArrayUtil.getNextSize(newLength)); + // if the substring being replaced is longer or shorter than the replacement, need to shift things around + if (replacementLength != charsLen && limit < length) + System.arraycopy(buffer, limit, buffer, start + charsLen, length - limit); + // insert the replacement text + System.arraycopy(text, charsStart, buffer, start, charsLen); + length = newLength; + } + +} Index: contrib/icu/src/java/org/apache/lucene/icu/transform/ReplaceableToken.java =================================================================== --- contrib/icu/src/java/org/apache/lucene/icu/transform/ReplaceableToken.java (revision 0) +++ contrib/icu/src/java/org/apache/lucene/icu/transform/ReplaceableToken.java (revision 0) @@ -0,0 +1,95 @@ +package org.apache.lucene.icu.transform; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.util.ArrayUtil; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.Replaceable; +import com.ibm.icu.text.UTF16; + +/** + * Wrap a {@link Token} with the Replaceable API. + * + * This allows for ICU transforms to run without unnecessary object creation. + * + * This wrapper does not keep the Token's length up to date at all times; when you are done, you must + * finalize the replacement process by setting the Token's length.
+ */ + +final class ReplaceableToken implements Replaceable { + private char buffer[]; + private int length; + private Token token; + + ReplaceableToken() { + } + + void setText(final Token reusableToken) { + this.token = reusableToken; + this.buffer = reusableToken.termBuffer(); + this.length = reusableToken.termLength(); + } + + public int char32At(int pos) { + return UTF16.charAt(buffer, 0, length, pos); + } + + public char charAt(int pos) { + return buffer[pos]; + } + + public void copy(int start, int limit, int dest) { + char text[] = new char[limit - start]; + getChars(start, limit, text, 0); + replace(dest, dest, text, 0, limit - start); + } + + public void getChars(int srcStart, int srcLimit, char[] dst, int dstStart) { + System.arraycopy(buffer, srcStart, dst, dstStart, srcLimit - srcStart); + } + + public boolean hasMetaData() { + return false; + } + + public int length() { + return length; + } + + public void replace(int start, int limit, String text) { + replace(start, limit, text.toCharArray(), 0, text.length()); + } + + public void replace(int start, int limit, char[] text, int charsStart, int charsLen) { + final int replacementLength = limit - start; + final int newLength = length - replacementLength + charsLen; + // resize if necessary + if (newLength > length) + buffer = token.resizeTermBuffer(ArrayUtil.getNextSize(newLength)); + // if the substring being replaced is longer or shorter than the replacement, need to shift things around + if (replacementLength != charsLen && limit < length) + System.arraycopy(buffer, limit, buffer, start + charsLen, length - limit); + // insert the replacement text + System.arraycopy(text, charsStart, buffer, start, charsLen); + length = newLength; + } + +} Index: contrib/icu/src/java/org/apache/lucene/icu/transform/package.html =================================================================== --- contrib/icu/src/java/org/apache/lucene/icu/transform/package.html (revision 0) +++ contrib/icu/src/java/org/apache/lucene/icu/transform/package.html (revision 0) @@ -0,0 +1,23 @@ + + +Provides the ability to perform general text transformations with ICU. ++Text Transformations are part of the ICU Transliterator API, which is a slightly misleading name. +
+
+Out-of-box, ICU provides some transformations that can be useful for search, such as Traditional-Simplified Chinese conversion, Katakana-Hiragana conversion, and Fullwidth-Halfwidth conversion. +
+
+Many of these conversions are context-sensitive, rule-based conversions and are not simple character-to-character mappings. +
++There are also transliterators that convert between scripts, and a nice syntax to create your own context-sensitive, rule-based text conversions. +This package exposes this capability as a Lucene TokenFilter. +
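+For example, a built-in conversion can be wrapped directly; a short sketch, where tokenizer stands for any upstream TokenStream: +TokenStream stream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Traditional-Simplified")); +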
+ + \ No newline at end of file Index: contrib/icu/src/resources/Hebrew.brk =================================================================== --- contrib/icu/src/resources/Hebrew.brk (revision 0) +++ contrib/icu/src/resources/Hebrew.brk (revision 0) @@ -0,0 +1,61 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# This is an example of rule tailoring for Hebrew. +# In this example the single-quote is added to the Extend category +# The double-quote is added to the MidLetter category. +# +!!chain; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}\u0027]; +$Format = [\p{Word_Break = Format}]; +$ALetter = [\p{Word_Break = ALetter}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidLetter = [\p{Word_Break = MidLetter}\u0022]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$dictionary = [:LineBreak = Complex_Context:]; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; + +$ALetterEx = $ALetterPlus ($Extend | $Format)*; +$MidNumLetEx = $MidNumLet ($Extend | $Format)*; +$MidLetterEx = $MidLetter ($Extend | $Format)*; +$MidNumEx = $MidNum ($Extend | $Format)*; +$NumericEx = $Numeric ($Extend | $Format)*; +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; + +!!forward; + +$CR $LF; +[^$CR $LF $Newline]? ($Extend | $Format)+; +$NumericEx {100}; +$ALetterEx {200}; +$ALetterEx $ALetterEx {200}; +$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; +$NumericEx $NumericEx {100}; +$ALetterEx $NumericEx {200}; +$NumericEx $ALetterEx {200}; +$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100}; +$ALetterEx $ExtendNumLetEx {200}; +$NumericEx $ExtendNumLetEx {100}; +$ExtendNumLetEx $ExtendNumLetEx {200}; +$ExtendNumLetEx $ALetterEx {200}; +$ExtendNumLetEx $NumericEx {100}; Index: contrib/icu/src/resources/Khmer.brk =================================================================== --- contrib/icu/src/resources/Khmer.brk (revision 0) +++ contrib/icu/src/resources/Khmer.brk (revision 0) @@ -0,0 +1,65 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Parses Khmer text, emitting each orthographic syllable as a token, with the intent of applying a ShingleFilter to these syllables downstream. +# This is an example of how one can use a tailored ruleset. +# The definition of Khmer orthographic syllable is taken directly from the Unicode Standard. +# +!!chain; +# +# B = base character (consonant, independent vowel, etc.) +$KhmerBase = [\u1780-\u17B3]; +# R = robat +$KhmerRobat = [\u17CC]; +# C = consonant shifter +$KhmerShifter = [\u17C9\u17CA]; +# S = subscript consonant or independent vowel sign +$KhmerSub = ([\u17D2] $KhmerBase); +# V = dependent vowel sign +$KhmerVowel = [\u17B4-\u17C5]; +# Z = zero-width joiner or non-joiner +$KhmerZWC = [\u200C\u200D]; +# O = any other sign +$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD]; + +$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?; + +# +# default numeric definitions +# +$Extend = [\p{Word_Break = Extend}]; +$Format = [\p{Word_Break = Format}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$MidNumLetEx = $MidNumLet ($Extend | $Format)*; +$MidNumEx = $MidNum ($Extend | $Format)*; +$NumericEx = $Numeric ($Extend | $Format)*; +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; + +!!forward; +$KhmerSyllableEx {200}; + +# +# default numeric rules +# +$NumericEx {100}; +$NumericEx $NumericEx {100}; +$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100}; +$NumericEx $ExtendNumLetEx {100}; +$ExtendNumLetEx $NumericEx {100}; Index: contrib/icu/src/resources/SoutheastAsian.brk =================================================================== --- contrib/icu/src/resources/SoutheastAsian.brk (revision 0) +++ contrib/icu/src/resources/SoutheastAsian.brk (revision 0) @@ -0,0 +1,65 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Tailored ruleset for Southeast Asian scripts (Balinese, Lao, Myanmar, Khmer) +# +# These scripts do not use "spaces" between words, but sometimes use a zero-width space (U+200B) to mark word boundaries. +# This zero-width space is really a format control for line-breaking, but for these scripts it is interpreted as a word break.
+# +# This character used to be a word-breaker; then it was reversed, and now it looks to be reversed again in Unicode 5.2. +# This problem is corrected by default according to the Unicode 5.2 Beta, unless it's reversed yet again! +# +!!chain; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}]; +$Format = [[\p{Word_Break = Format}]-[\u200B]]; +$ALetter = [\p{Word_Break = ALetter}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidLetter = [\p{Word_Break = MidLetter}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$dictionary = [:LineBreak = Complex_Context:]; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; + +$ALetterEx = $ALetterPlus ($Extend | $Format)*; +$MidNumLetEx = $MidNumLet ($Extend | $Format)*; +$MidLetterEx = $MidLetter ($Extend | $Format)*; +$MidNumEx = $MidNum ($Extend | $Format)*; +$NumericEx = $Numeric ($Extend | $Format)*; +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; + +!!forward; + +$CR $LF; +[^$CR $LF $Newline]? ($Extend | $Format)+; +$NumericEx {100}; +$ALetterEx {200}; +$ALetterEx $ALetterEx {200}; +$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; +$NumericEx $NumericEx {100}; +$ALetterEx $NumericEx {200}; +$NumericEx $ALetterEx {200}; +$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100}; +$ALetterEx $ExtendNumLetEx {200}; +$NumericEx $ExtendNumLetEx {100}; +$ExtendNumLetEx $ExtendNumLetEx {200}; +$ExtendNumLetEx $ALetterEx {200}; +$ExtendNumLetEx $NumericEx {100}; Index: contrib/icu/src/test/org/apache/lucene/icu/TestICUAnalyzer.java =================================================================== --- contrib/icu/src/test/org/apache/lucene/icu/TestICUAnalyzer.java (revision 0) +++ contrib/icu/src/test/org/apache/lucene/icu/TestICUAnalyzer.java (revision 0) @@ -0,0 +1,246 @@ +package org.apache.lucene.icu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.icu.tokenizer.ICUTokenizer; +import org.apache.lucene.icu.transform.ICUTransformFilter; +import org.apache.lucene.util.LuceneTestCase; + +import com.ibm.icu.text.Transliterator; + +public class TestICUAnalyzer extends LuceneTestCase { + private Analyzer a = new ICUAnalyzer(); + + /* simple tests from a few sample different writing systems and languages + * TODO: more tests + */ + + public void testArmenian() throws Exception { + assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։", + new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", + "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } ); + } + + public void testAmharic() throws Exception { + assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም", + new String[] { "ዊኪፔድያ", "የባለ", "ብዙ", "ቋንቋ", "የተሟላ", "ትክክለኛና", "ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", "ነው", "ማንኛውም" } ); + } + + public void testArabic() throws Exception { + assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.", + new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا", + "بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } ); + } + + public void testAramaic() throws Exception { + assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀", + new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ", + "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"}); + } + + public void testBengali() throws Exception { + assertAnalyzesTo(a, "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।", + new String[] { "এই", "বিশ্বকোষ", "পরিচালনা", "করে", "উইকিমিডিয়া", "ফাউন্ডেশন", "একটি", "অলাভজনক", "সংস্থা", "উইকিপিডিয়ার", + "শুরু", "15", "জানুয়ারি", "2001", "সালে", "এখন", "পর্যন্ত", "200টিরও", "বেশী", "ভাষায়", "উইকিপিডিয়া", "রয়েছে" }); + } + + public void testFarsi() throws Exception { + assertAnalyzesTo(a, "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.", + new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "25", "دی", "1379", "به", "صورت", "مکملی", + "برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" }); + } + + public void testGreek() throws Exception { + assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.", + new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", 
"που", + "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" }); + } + + public void testTibetan() throws Exception { + assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །", + new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", "མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར", "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" }); + } + + + /* test various jira issues this analyzer is related to */ + + public void testLUCENE1032() throws Exception { + /* + * Some of our Japanese customers are reporting errors when performing searches using half width characters. + * The desired behavior is that a document containing half width characters should be returned when performing + * a search using full width equivalents or when searching by the half width character itself. + * Currently, a search will not return any matches for half width characters. + * + * ICUAnalyzer normalizes to NFKC by default. + */ + byte[] fullWidthKa = new byte[]{(byte) 0xE3, (byte) 0x82, (byte) 0xAB}; + byte[] halfWidthKa = new byte[]{(byte) 0xEF, (byte) 0xBD, (byte) 0xB6}; + + assertAnalyzesTo(a, new String(halfWidthKa, "UTF-8"), new String[] { new String(fullWidthKa, "UTF-8") }); + assertAnalyzesTo(a, new String(fullWidthKa, "UTF-8"), new String[] { new String(fullWidthKa, "UTF-8") }); + } + + public void testLUCENE1215() throws Exception { + /* + * New in java 6, we have java.text.Normalizer that supports Unicode Standard Annex #15 normalization. + * + * FYI: The java6 impl is String-only, and does not provide quickCheck, although isNormalized MIGHT invoke quickCheck. + * + * ICUAnalyzer normalizes to NFKC by default (though this can be changed). + * See tests for ICUNormalizationFilter. + */ + + assertAnalyzesTo(a, "ﴳﴺﰧ", new String[] { "طمطمطم" }); // arabic presentation forms + } + + public void testLUCENE1343() throws Exception { + /* + * A replacement for ISOLatin1AccentFilter that does a more thorough job of removing diacritical marks or non-spacing modifiers. + * This issue also had a normalization impl, but no quickCheck. + * + * ICUAnalyzer does NOT remove any accents by default!!!! + * This is language-specific/usually incorrect behavior, but if you want to do this kind of thing, the components are here. + * + * If you want to do some additional custom mappings, you can create a custom Transliterator. + */ + Analyzer a = new Analyzer() { + public TokenStream tokenStream(String fieldName, Reader reader) { + return new ICUTransformFilter( + new ICUTokenizer(reader), Transliterator.getInstance("NFKD; [:Nonspacing Mark:] Remove; NFC")); + } }; + + assertAnalyzesTo(a, "Sorcie\u0300re sorcière Pe\u0301rez Matilde Pérez A\u0308\uFB03ne Äffine ", + new String[] { "Sorciere","sorciere","Perez", "Matilde", "Perez", "Affine", "Affine" }); + } + + public void testLUCENE1545() throws Exception { + /* + * Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTRE E. + * The word "moͤchte" is incorrectly tokenized into "mo" "chte", the combining character is lost. + * Expected result is only on token "moͤchte". + * + * ICUAnalyzer implements Unicode Text Segmentation, which never separates a combining mark from its base character. + */ + assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" }); + } + + + // public void testLUCENE1161() throws Exception { + /* + * It would be useful, in the StandardTokenizer, to be able to have more control over in-word punctuation is handled. 
+ * For instance, it is not always desirable to split on dashes or other punctuation. + * + * Figure out a nice example of this with a tailored ruleset. + * It's even possible to do something such as providing a DBBI for hyphenation, etc. + */ + // } + + /* Tests from StandardAnalyzer, just to show behavior is similar */ + public void testAlphanumericSA() throws Exception { + // alphanumeric tokens + assertAnalyzesTo(a, "B2B", new String[]{"b2b"}); + assertAnalyzesTo(a, "2B", new String[]{"2b"}); + } + + public void testDelimitersSA() throws Exception { + // other delimiters: "-", "/", "," + assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"}); + assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"}); + assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"}); + } + + public void testApostrophesSA() throws Exception { + // internal apostrophes: O'Reilly, you're, O'Reilly's + assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"}); + assertAnalyzesTo(a, "you're", new String[]{"you're"}); + assertAnalyzesTo(a, "she's", new String[]{"she's"}); + assertAnalyzesTo(a, "Jim's", new String[]{"jim's"}); + assertAnalyzesTo(a, "don't", new String[]{"don't"}); + assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"}); + } + + + public void testNumericSA() throws Exception { + // floating point, serial, model numbers, ip addresses, etc. + // every other segment must have at least one digit + assertAnalyzesTo(a, "21.35", new String[]{"21.35"}); + assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"}); + assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); + assertAnalyzesTo(a, "২০৬৭০৩", new String[]{"206703"}); + } + + public void testTextWithNumbersSA() throws Exception { + // numbers + assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"}); + } + + public void testVariousTextSA() throws Exception { + // various + assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"}); + assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"}); + assertAnalyzesTo(a, "foo bar . 
FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"}); + assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"}); + } + + + public void testKoreanSA() throws Exception { + // Korean words + assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"}); + } + + public void assertAnalyzesTo(Analyzer a, String input, String[] expected) throws Exception { + assertAnalyzesTo(a, input, expected, null); + } + + public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception { + assertAnalyzesTo(a, input, expectedImages, expectedTypes, null); + } + + public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception { + TokenStream ts = a.tokenStream("dummy", new StringReader(input)); + // TODO Java 1.5 + //final TypeAttribute typeAtt = reusableToken.getAttribute(TypeAttribute.class); + //final PositionIncrementAttribute posIncrAtt = reusableToken.getAttribute(PositionIncrementAttribute.class); + + final TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); + final TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class); + final PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class); + + for (int i = 0; i < expectedImages.length; i++) { + assertTrue(ts.incrementToken()); + assertEquals(expectedImages[i], new String(termAtt.termBuffer(), 0, termAtt.termLength())); + if (expectedTypes != null) { + assertEquals(expectedTypes[i], typeAtt.type()); + } + if (expectedPosIncrs != null) { + assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement()); + } + } + assertFalse(ts.incrementToken()); + ts.close(); + } +} Index: contrib/icu/src/test/org/apache/lucene/icu/TestICUCaseFoldingFilter.java =================================================================== --- contrib/icu/src/test/org/apache/lucene/icu/TestICUCaseFoldingFilter.java (revision 0) +++ contrib/icu/src/test/org/apache/lucene/icu/TestICUCaseFoldingFilter.java (revision 0) @@ -0,0 +1,162 @@ +package org.apache.lucene.icu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.analysis.KeywordTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.util.LuceneTestCase; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.text.Normalizer.Mode; + + +public class TestICUCaseFoldingFilter extends LuceneTestCase { + + + /** + * Some basic case-folding, including a multi-codepoint folding and a supplementary codepoint folding. + */ + public void testBasicFunctionality() throws Exception { + checkToken(Normalizer.NONE, "LuCeNE", "lucene"); // latin + checkToken(Normalizer.NONE, "Ruß", "russ"); // german + checkToken(Normalizer.NONE, "𐐖", "𐐾"); // suppl. codepoint + } + + /** + * Validate that the case-folder folds all Unicode codepoints the same way as the UCharacter String-based method. + */ + public void testUnicodeSet() throws Exception { + UnicodeSet set = new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE); + for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) { + String string = it.getString(); + checkToken(Normalizer.NONE, string, icuFold(string)); + } + } + + /** + * Case-fold the entire set of Unicode codepoints as one huge term, and validate it against the UCharacter String-based method. + */ + public void testUnicodeSetOneTerm() throws Exception { + String allUnicode = unicodeSetString(); + checkToken(Normalizer.NONE, allUnicode, icuFold(allUnicode)); + } + + + /** + * Validate closure under normalization mode NFC. Canonical equivalence is the default for the case-folding algorithm, but closure should still hold. + */ + public void testUnicodeClosureNFC() throws Exception { + UnicodeSet set = new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE); + for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) { + String string = it.getString(); + checkTokenClosure(Normalizer.NFC, string, icuNFC(icuFold(icuNFC(icuFold(string))))); + } + } + + /** + * Validate closure under normalization mode NFD. Canonical equivalence is the default for the case-folding algorithm, but closure should still hold. + */ + public void testUnicodeClosureNFD() throws Exception { + UnicodeSet set = new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE); + for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) { + String string = it.getString(); + checkTokenClosure(Normalizer.NFD, string, icuNFD(icuFold(icuNFD(icuFold(string))))); + } + } + + /** + * Validate closure under normalization mode NFKC. + * In this case the filter will apply the NFKC_Closure set to avoid having to normalize, fold, normalize, and fold again. + */ + public void testUnicodeClosureNFKC() throws Exception { + UnicodeSet set = new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE); + for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) { + String string = it.getString(); + checkTokenClosure(Normalizer.NFKC, string, icuNFKC(icuFold(icuNFKC(icuFold(string))))); + } + } + + /** + * Validate closure under normalization mode NFKD. + * In this case the filter will apply the NFKC_Closure set to avoid having to normalize, fold, normalize, and fold again.
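+ * Concretely, closure means a single pass is enough: the filter chain's output must match folding and normalizing the input twice over, which is what checkTokenClosure verifies.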
+ */ + public void testUnicodeClosureNFKD() throws Exception { + UnicodeSet set = new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE); + for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) { + String string = it.getString(); + checkTokenClosure(Normalizer.NFKD, string, icuNFKD(icuFold(icuNFKD(icuFold(string))))); + } + } + + String unicodeSetString() { + StringBuffer sb = new StringBuffer(); + UnicodeSet set = new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE); + for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) + sb.append(it.getString()); + return sb.toString(); + } + + String icuFold(String s) { + return UCharacter.foldCase(s, true); + } + + String icuNFKD(String s) { + return Normalizer.normalize(s, Normalizer.NFKD); + } + + String icuNFD(String s) { + return Normalizer.normalize(s, Normalizer.NFD); + } + + String icuNFC(String s) { + return Normalizer.normalize(s, Normalizer.NFC); + } + + String icuNFKC(String s) { + return Normalizer.normalize(s, Normalizer.NFKC); + } + + void checkToken(Mode mode, String input, String expected) throws IOException { + TokenStream ts = new ICUCaseFoldingFilter(new KeywordTokenizer((new StringReader(input))), mode); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); + assertTrue(ts.incrementToken()); + assertEquals(expected, termAtt.term()); + } + + void checkTokenClosure(Mode mode, String input, String expected) throws IOException { + TokenStream ts = new ICUNormalizationFilter(new ICUCaseFoldingFilter(new KeywordTokenizer((new StringReader(input))), mode), mode); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); + assertTrue(ts.incrementToken()); + assertEquals(expected, termAtt.term()); + } + + void checkTokenKW(String input, String expected) throws IOException { + TokenStream ts = new KeywordTokenizer((new StringReader(input))); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); + assertTrue(ts.incrementToken()); + assertEquals(expected, termAtt.term()); + } + +} Index: contrib/icu/src/test/org/apache/lucene/icu/TestICUNormalizationFilter.java =================================================================== --- contrib/icu/src/test/org/apache/lucene/icu/TestICUNormalizationFilter.java (revision 0) +++ contrib/icu/src/test/org/apache/lucene/icu/TestICUNormalizationFilter.java (revision 0) @@ -0,0 +1,78 @@ +package org.apache.lucene.icu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.StringReader; + +import org.apache.lucene.analysis.KeywordTokenizer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.util.LuceneTestCase; + +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; + + +/** + * TODO: maybe just use the Unicode normalization test suite? + * + */ +public class TestICUNormalizationFilter extends LuceneTestCase { + /** + * Validate that the given String normalizes correctly in the provided mode against the ICU normalizer. + * + */ + void check(String string, Normalizer.Mode mode) throws Exception { + String expected = Normalizer.normalize(string, mode); + TokenStream ts = new ICUNormalizationFilter(new KeywordTokenizer((new StringReader(string))), mode); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); + assertTrue(ts.incrementToken()); + assertEquals(expected, termAtt.term()); // expected value first, per JUnit convention + } + + /** + * Validate all Unicode codepoints against the given mode. + */ + void unicodeCompliance(Normalizer.Mode mode) throws Exception { + UnicodeSet set = new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE); + for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) + check(it.getString(), mode); + } + + public void testComplianceNFC() throws Exception { + unicodeCompliance(Normalizer.NFC); + } + + public void testComplianceNFKC() throws Exception { + unicodeCompliance(Normalizer.NFKC); + } + + public void testComplianceNFD() throws Exception { + unicodeCompliance(Normalizer.NFD); + } + + public void testComplianceNFKD() throws Exception { + unicodeCompliance(Normalizer.NFKD); + } + + public void testComplianceNone() throws Exception { + unicodeCompliance(Normalizer.NONE); + } +} Index: contrib/icu/src/test/org/apache/lucene/icu/TestICUTransformFilter.java =================================================================== --- contrib/icu/src/test/org/apache/lucene/icu/TestICUTransformFilter.java (revision 0) +++ contrib/icu/src/test/org/apache/lucene/icu/TestICUTransformFilter.java (revision 0) @@ -0,0 +1,75 @@ +package org.apache.lucene.icu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.analysis.KeywordTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.icu.transform.ICUTransformFilter; +import org.apache.lucene.util.LuceneTestCase; + +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UnicodeSet; + + +/** + * Test the ICUTransformFilter with some basic examples.
+ */ +public class TestICUTransformFilter extends LuceneTestCase { + + public void testBasicFunctionality() throws Exception { + checkToken(Transliterator.getInstance("Traditional-Simplified"), "簡化字", "简化字"); + checkToken(Transliterator.getInstance("Katakana-Hiragana"), "ヒラガナ", "ひらがな"); + checkToken(Transliterator.getInstance("Fullwidth-Halfwidth"), "アルアノリウ", "アルアノリウ"); + checkToken(Transliterator.getInstance("Any-Latin"), "Αλφαβητικός Κατάλογος", "Alphabētikós Katálogos"); + checkToken(Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove"), "Alphabētikós Katálogos", "Alphabetikos Katalogos"); + } + + public void testCustomFunctionality() throws Exception { + String rules = "a > b; b > c;"; // convert a's to b's and b's to c's + checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "abacadaba", "bcbcbdbcb"); + } + + public void testOptimizer() throws Exception { + String rules = "a > b; b > c;"; // convert a's to b's and b's to c's + Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD); + assertTrue(custom.getFilter() == null); + new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom); + assertTrue(custom.getFilter().equals(new UnicodeSet("[ab]"))); + } + + public void testOptimizerSurrogate() throws Exception { + String rules = "\\U00020087 > x;"; // convert CJK UNIFIED IDEOGRAPH-20087 to an x + Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD); + assertTrue(custom.getFilter() == null); + new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom); + assertTrue(custom.getFilter().equals(new UnicodeSet("[\\U00020087]"))); + } + + private void checkToken(Transliterator transform, String input, String expected) throws IOException { + TokenStream ts = new ICUTransformFilter(new KeywordTokenizer((new StringReader(input))), transform); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); + assertTrue(ts.incrementToken()); + assertEquals(expected, termAtt.term()); + } + + +}
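For reference, the components introduced in this patch compose into a single analysis chain; a minimal sketch, wired in the same order as the closure tests above (tokenize, case-fold, then normalize), with a transform as an optional final step: TokenStream stream = new ICUTokenizer(reader); stream = new ICUCaseFoldingFilter(stream, Normalizer.NFKC); stream = new ICUNormalizationFilter(stream, Normalizer.NFKC); stream = new ICUTransformFilter(stream, Transliterator.getInstance("Traditional-Simplified"));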