Index: src/test/org/apache/lucene/analysis/TestTurkishLowerCaseFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestTurkishLowerCaseFilter.java (revision 0)
+++ src/test/org/apache/lucene/analysis/TestTurkishLowerCaseFilter.java (revision 0)
@@ -0,0 +1,42 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
+
+import java.io.StringReader;
+
+/**
+ * Tests that {@link TurkishLowerCaseFilter} applies the Turkish dotted/dotless
+ * "i" rules when lower-casing tokens.
+ */
+public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
+
+  /**
+   * U+0130 (capital I with dot above) must lower-case to "i", and a plain
+   * capital "I" must lower-case to U+0131 (dotless i).
+   */
+  public void testTurkishLowerCaseFilter() throws Exception {
+    TokenStream stream = new WhitespaceTokenizer(
+        new StringReader("\u0130STANBUL \u0130ZM\u0130R ISPARTA"));
+    TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(Version.LUCENE_30, stream);
+    assertTokenStreamContents(filter, new String[] {"istanbul", "izmir", "\u0131sparta"});
+  }
+
+}
Index: src/java/org/apache/lucene/analysis/TurkishLowerCaseFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/TurkishLowerCaseFilter.java (revision 0)
+++ src/java/org/apache/lucene/analysis/TurkishLowerCaseFilter.java (revision 0)
@@ -0,0 +1,88 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.CharacterUtils;
+import org.apache.lucene.util.Version;
+
+/**
+ * Normalizes Turkish token text to lower case.
+ * <p>
+ * A Latin capital letter I (U+0049) is mapped to the dotless small letter
+ * (U+0131); every other code point is passed through
+ * {@link Character#toLowerCase(int)}.
+ * </p>
+ * <a name="version"></a>
+ * <p>
+ * You must specify the required {@link Version}
+ * compatibility when creating TurkishLowerCaseFilter:
+ * </p>
+ * <ul>
+ * <li>The version selects the {@link CharacterUtils} implementation used to
+ * read code points from the term buffer.</li>
+ * </ul>
+ *
+ * @since lucene 3.1
+ */
+public final class TurkishLowerCaseFilter extends TokenFilter {
+  private static final int LATIN_CAPITAL_LETTER_I = 0x0049;
+  private static final int LATIN_SMALL_LETTER_DOTLESS_I = 0x0131;
+
+  private final CharacterUtils charUtils;
+  private final TermAttribute termAtt;
+
+  /**
+   * Create a new TurkishLowerCaseFilter, that normalizes Turkish token text to lower case.
+   *
+   * @param matchVersion See <a href="#version">above</a>
+   * @param in TokenStream to filter
+   */
+  public TurkishLowerCaseFilter(Version matchVersion, TokenStream in) {
+    super(in);
+    termAtt = addAttribute(TermAttribute.class);
+    charUtils = CharacterUtils.getInstance(matchVersion);
+  }
+
+  /**
+   * Advances the wrapped stream and lower-cases the current term in place.
+   *
+   * @return true if a token was read from the input, false if the input produced no further token
+   * @throws IOException propagated from the wrapped stream
+   */
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    final char[] buffer = termAtt.termBuffer();
+    final int length = termAtt.termLength();
+    for (int i = 0; i < length;) {
+      // Read the code point once; it is needed for both the test and the mapping.
+      final int codePoint = charUtils.codePointAt(buffer, i);
+      final int lowered = codePoint == LATIN_CAPITAL_LETTER_I
+          ? LATIN_SMALL_LETTER_DOTLESS_I
+          : Character.toLowerCase(codePoint);
+      // toChars writes the result back at i and returns how many chars it used.
+      i += Character.toChars(lowered, buffer, i);
+    }
+    return true;
+  }
+}