Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java (working copy) @@ -0,0 +1,62 @@ +package org.apache.lucene.analysis.core; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.CharacterUtils; +import org.apache.lucene.util.Version; + +/** + * Normalizes token text to UPPER CASE. + * + *

You must specify the required {@link Version} + * compatibility when creating UpperCaseFilter + * + *

NOTE: In Unicode, this transformation may lose information when the + * upper case character represents more than one lower case character. Use this filter + * when you require uppercase tokens. Use the {@link LowerCaseFilter} for + * general search matching + */ +public final class UpperCaseFilter extends TokenFilter { + private final CharacterUtils charUtils; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + /** + * Create a new UpperCaseFilter, that normalizes token text to upper case. + * + * @param matchVersion Lucene compatibility version; passed to {@code CharacterUtils.getInstance} + * @param in TokenStream to filter + */ + public UpperCaseFilter(Version matchVersion, TokenStream in) { + super(in); + charUtils = CharacterUtils.getInstance(matchVersion); + } + + @Override + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + charUtils.toUpperCase(termAtt.buffer(), 0, termAtt.length()); + return true; + } else + return false; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilterFactory.java (working copy) @@ -0,0 +1,63 @@ +package org.apache.lucene.analysis.core; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.UpperCaseFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link UpperCaseFilter}. + *

<pre class="prettyprint">
+ * &lt;fieldType name="text_uppercase" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.UpperCaseFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * + *

NOTE: In Unicode, this transformation may lose information when the + * upper case character represents more than one lower case character. Use this filter + * when you require uppercase tokens. Use the {@link LowerCaseFilterFactory} for + * general search matching + */ +public class UpperCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + + /** Creates a new UpperCaseFilterFactory; any entries still left in {@code args} are rejected with an IllegalArgumentException */ + public UpperCaseFilterFactory(Map<String,String> args) { + super(args); + assureMatchVersion(); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public UpperCaseFilter create(TokenStream input) { + return new UpperCaseFilter(luceneMatchVersion,input); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java (revision 1550837) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java (working copy) @@ -132,6 +132,23 @@ } } + /** + * Converts each unicode codepoint to UpperCase, in place, via {@link Character#toUpperCase(int)} starting + * at the given offset. + * @param buffer the char buffer to upper case; must hold at least {@code limit} chars + * @param offset the offset to start at; must be non-negative and at most {@code buffer.length} + * @param limit the exclusive end index in the buffer of the chars to upper case + */ + public final void toUpperCase(final char[] buffer, final int offset, final int limit) { + assert buffer.length >= limit; + assert offset >= 0 && offset <= buffer.length; + for (int i = offset; i < limit;) { + i += Character.toChars( + Character.toUpperCase( + codePointAt(buffer, i, limit)), buffer, i); + } + } + /** Converts a sequence of Java characters to a sequence of unicode code points. 
* @return the number of code points written to the destination buffer */ public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) { Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java (revision 1550837) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java (working copy) @@ -129,6 +129,17 @@ } + private static class UpperCaseWhitespaceAnalyzer extends Analyzer { + + @Override + public TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(tokenizer, new UpperCaseFilter(TEST_VERSION_CURRENT, tokenizer)); + } + + } + + /** * Test that LowercaseFilter handles entire unicode range correctly */ @@ -148,7 +159,28 @@ assertAnalyzesTo(a, "AbaC\uDC16AdaBa", new String [] { "abac\uDC16adaba" }); } + + /** + * Test that UpperCaseFilter handles entire unicode range correctly + */ + public void testUpperCaseFilter() throws IOException { + Analyzer a = new UpperCaseWhitespaceAnalyzer(); + // BMP + assertAnalyzesTo(a, "AbaCaDabA", new String[] { "ABACADABA" }); + // supplementary + assertAnalyzesTo(a, "\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e", + new String[] {"\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16"}); + assertAnalyzesTo(a, "AbaCa\ud801\udc3eDabA", + new String[] { "ABACA\ud801\udc16DABA" }); + // unpaired lead surrogate + assertAnalyzesTo(a, "AbaC\uD801AdaBa", + new String [] { "ABAC\uD801ADABA" }); + // unpaired trail surrogate + assertAnalyzesTo(a, "AbaC\uDC16AdaBa", + new String [] { "ABAC\uDC16ADABA" }); + } + /** * Test that LowercaseFilter handles the lowercasing correctly if the term * buffer has a trailing surrogate character leftover and the current term in