Index: lucene/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java (revision 997028) +++ lucene/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java (working copy) @@ -23,12 +23,40 @@ import org.apache.lucene.util.Version; /** - * LowerCaseTokenizer performs the function of LetterTokenizer - * and LowerCaseFilter together. It divides text at non-letters and converts - * them to lower case. While it is functionally equivalent to the combination - * of LetterTokenizer and LowerCaseFilter, there is a performance advantage - * to doing the two tasks at once, hence this (redundant) implementation. - *

+ * The LowerCaseTokenizer design has been corrected to work as advertised, + * while not breaking the implementation that may have been used in prior + * schemas. LowerCaseTokenizer would normalize letters by downcasing them. + * However, the implementation extended LetterTokenizer and, therefore, + * split on all characters for which Character#isLetter was false. + * + * That behavior will remain the default, so as to avoid ruining anyone's day. + * + * The schema can be adjusted to have "LowerCaseTokenizer" behave truer to + * its name. You may do this by setting: + * + * splitTokensByWhitespaceChars="true" + * splitTokensByVisibleNonAlphaOrNumericChars="false" + * splitTokensByVisibleNonAlphaChars="false" + * e.g., the following string would be tokenized at the carets below: + * "123lorem ipsum dolor sit amet, can't 15 dolor sit now. duh." + * ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + * + * For reference, the prior implementation of LowerCaseTokenizer (still the + * default) would have the above flags set to the following: + * + * splitTokensByWhitespaceChars="true" + * splitTokensByVisibleNonAlphaOrNumericChars="false" + * splitTokensByVisibleNonAlphaChars="true" + * As above, the following string would be tokenized at the carets below: + * "123lorem ipsum dolor sit amet, can't 15 dolor sit now. duh." + * ^ ^ ^ ^ ^ | ^ | ^ ^ ^ ^ | ^ | + * The "|" denotes the end of the token. I.e., "amet," is tokenized as "amet" + * and "123lorem" is merely tokenized as "lorem". + * + * Additionally, the splitTokensByVisibleNonAlphaOrNumericChars flag, if "true," + * will take precedence over splitTokensByVisibleNonAlphaChars. + * + *

* Note: this does a decent job for most European languages, but does a terrible * job for some Asian languages, where words are not separated by spaces. *

@@ -43,9 +71,15 @@ * *

*/ -public final class LowerCaseTokenizer extends LetterTokenizer { - +public final class LowerCaseTokenizer extends CharTokenizer { /** + * Token-inclusion flags: despite the tokenizeAt prefix, a true value makes isTokenChar keep that character class inside tokens. + * All three default to false, so by default isTokenChar accepts letters only (the prior LetterTokenizer-based behavior). + */ + private boolean tokenizeAtWhitespaceChar = false; + private boolean tokenizeAtNumericChar = false; + private boolean tokenizeAtSpecialChar = false; + /** * Construct a new LowerCaseTokenizer. * * @param matchVersion @@ -88,6 +122,61 @@ } /** + * Returns the tokenizeAtWhitespaceChar flag; when true, isTokenChar treats whitespace as part of a token. + */ + public boolean getTokenizeAtWhitespaceChar() { + return this.tokenizeAtWhitespaceChar; + } + + /** + * Sets the tokenizeAtWhitespaceChar flag. + * + * @param newFlag + * true if whitespace characters should be part of a token; false (the default) if they should be + * excluded, e.g. so that space-separated words are broken into individual tokens. + * See {@link Character#isWhitespace(int)}. + */ + public void setTokenizeAtWhitespaceChar(boolean newFlag) { + this.tokenizeAtWhitespaceChar = newFlag; + } + + /** + * Returns the tokenizeAtNumericChar flag; when true, isTokenChar treats digits as part of a token. + */ + public boolean getTokenizeAtNumericChar() { + return this.tokenizeAtNumericChar; + } + + /** + * Sets the tokenizeAtNumericChar flag. + * @param newFlag + * true if numeric characters should be part of a token; false (the default) if they should be + * excluded from tokens, as one might do with whitespace. + * See {@link Character#isDigit(int)}. + */ + public void setTokenizeAtNumericChar(boolean newFlag) { + this.tokenizeAtNumericChar = newFlag; + } + + /** + * Returns the tokenizeAtSpecialChar flag; when true, isTokenChar treats characters that are not letters, digits or whitespace as part of a token. + */ + public boolean getTokenizeAtSpecialChar() { + return this.tokenizeAtSpecialChar; + } + + /** + * Sets the tokenizeAtSpecialChar flag. + * @param newFlag + * true if special characters (neither letter, digit nor whitespace) should be part of a token; + * false (the default) if they should be excluded, as one might do with whitespace. 
+ * See {@link Character#isLetterOrDigit(int)}. + */ + public void setTokenizeAtSpecialChar(boolean newFlag) { + this.tokenizeAtSpecialChar = newFlag; + } + + /** * Construct a new LowerCaseTokenizer. * * @deprecated use {@link #LowerCaseTokenizer(Reader)} instead. This will be @@ -119,9 +208,35 @@ super(Version.LUCENE_30, factory, in); } - /** Converts char to lower case - * {@link Character#toLowerCase(int)}.*/ + /** + * Decides whether code point c belongs to a token. Letters always do; digits, then whitespace, then any other character are tested in that order + * and belong to a token only when tokenizeAtNumericChar, tokenizeAtWhitespaceChar or tokenizeAtSpecialChar, respectively, is true. + * {@link Character#isLetter(int)} + * {@link Character#isDigit(int)} + * {@link Character#isWhitespace(int)} + */ @Override + protected boolean isTokenChar(int c) { + if(Character.isLetter(c)) { + return true; + } + + if(Character.isDigit(c)) { + return(tokenizeAtNumericChar == true); + } + + if(Character.isWhitespace(c)) { + return(tokenizeAtWhitespaceChar == true); + } + + return(tokenizeAtSpecialChar == true); + } + + /** + * Converts code point c to lower case via + * {@link Character#toLowerCase(int)}. + */ + @Override protected int normalize(int c) { return Character.toLowerCase(c); }