Index: analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.java
===================================================================
--- analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.java (revision 0)
+++ analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.java (arbetskopia)
@@ -0,0 +1,134 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.StemmerUtil;
+
+import java.io.IOException;
+
+
+/**
+ * This filter folds the Scandinavian characters åäæ-&gt;a, ÅÄÆ-&gt;A, öø-&gt;o and ÖØ-&gt;O.
+ * It also discriminates against the use of the double vowels aa, ae, ao, oe and oo,
+ * leaving just the first one.
+ * <p>
+ * blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej == blabarsyltetoj<br>
+ * räksmörgås == ræksmørgås == ræksmörgaos == raeksmoergaas == raksmorgas
+ * <p>
+ * Background:
+ * Swedish åäö are in fact the same letters as Norwegian and Danish åæø and thus interchangeable
+ * when used between these languages. They are however folded differently when people type
+ * them on a keyboard lacking these characters.
+ * <p>
+ * When a Swedish person is lacking umlauted characters on the keyboard
+ * they consistently type a, a, o instead of å, ä, ö. Foreigners also tend to use a, a, o.
+ * <p>
+ * In Norway people tend to type aa, ae and oe instead of å, æ and ø.
+ * Some use a, a, o. I've also seen oo, ao, etc. And permutations.
+ * Not sure about Denmark but the pattern is probably the same.
+ * <p>
+ * This filter solves that problem, but might also cause new ones.
+ */
+public class ScandinavianFoldingFilter extends TokenFilter {
+
+  private static final char AA = '\u00C5'; // Å
+  private static final char aa = '\u00E5'; // å
+  private static final char AE = '\u00C6'; // Æ
+  private static final char ae = '\u00E6'; // æ
+  private static final char AE_se = '\u00C4'; // Ä
+  private static final char ae_se = '\u00E4'; // ä
+  private static final char OE = '\u00D8'; // Ø
+  private static final char oe = '\u00F8'; // ø
+  private static final char OE_se = '\u00D6'; // Ö
+  private static final char oe_se = '\u00F6'; // ö
+
+  private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
+
+  /**
+   * Wraps the given stream so that its terms are folded.
+   *
+   * @param input the token stream whose terms are folded
+   */
+  public ScandinavianFoldingFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Advances the wrapped stream and folds the current term in place.
+   * <p>
+   * Declared final because TokenStream requires non-abstract subclasses to be final
+   * or to have a final incrementToken(); this is checked when assertions are enabled.
+   *
+   * @return false when the wrapped stream is exhausted, true otherwise
+   * @throws IOException if the wrapped stream fails to advance
+   */
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+
+    char[] buffer = charTermAttribute.buffer();
+    int length = charTermAttribute.length();
+
+    for (int i = 0; i < length; i++) {
+      char c = buffer[i];
+      if (c == aa || c == ae_se || c == ae) {
+        buffer[i] = 'a';
+      } else if (c == AA || c == AE_se || c == AE) {
+        buffer[i] = 'A';
+      } else if (c == oe || c == oe_se) {
+        buffer[i] = 'o';
+      } else if (c == OE || c == OE_se) {
+        buffer[i] = 'O';
+      } else if (i + 1 < length && isDoubleVowel(c, buffer[i + 1])) {
+        // Only look ahead inside the term: the buffer may be longer than the term and
+        // hold stale characters, which must never trigger a deletion.
+        length = StemmerUtil.delete(buffer, i + 1, length);
+      }
+    }
+
+    charTermAttribute.setLength(length);
+    return true;
+  }
+
+  /**
+   * Tells whether second, directly following first, is the tail of a double vowel
+   * (aa, ae, ao, oe or oo in any mix of upper and lower case) and is to be dropped.
+   */
+  private static boolean isDoubleVowel(char first, char second) {
+    switch (first) {
+      case 'a':
+      case 'A':
+        return second == 'a' || second == 'A'
+            || second == 'e' || second == 'E'
+            || second == 'o' || second == 'O';
+      case 'o':
+      case 'O':
+        return second == 'e' || second == 'E'
+            || second == 'o' || second == 'O';
+      default:
+        return false;
+    }
+  }
+
+}
+
Index: 
analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianFoldingFilter.java
===================================================================
--- analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianFoldingFilter.java (revision 0)
+++ analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianFoldingFilter.java (arbetskopia)
@@ -0,0 +1,79 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests {@link ScandinavianFoldingFilter} by folding single keyword terms.
+ */
+public class TestScandinavianFoldingFilter extends TestCase {
+
+  /** Checks the folded form of individual terms and that spelling variants fold to one term. */
+  public void test() throws Exception {
+    assertEquals("aaaeea", fold("aeäaeeeae"));
+    assertEquals("aaeea", fold("aeaeeeae"));
+
+    assertEquals("boen", fold("bøen"));
+    assertEquals("aene", fold("åene"));
+
+    assertEquals("blabarsyltetoj", fold("blåbærsyltetøj"));
+    assertEquals("raksmorgas", fold("räksmörgås"));
+    assertEquals("raksmorgas", fold("raeksmörgaos"));
+    assertEquals("raksmorgas", fold("raeksmoergås"));
+
+    // Every row lists spelling variants of one word; all of them must fold to the same term.
+    String[][] equalTermsMatrix = new String[][]{
+        new String[]{"blåbærsyltetøj", "blaabaarsyltetoej", "blåbärsyltetöj"},
+        new String[]{"räksmörgås", "raksmorgas", "ræksmørgås", "raeksmoergaas", "ræksmörgaos"},
+    };
+
+    for (String[] equalTerms : equalTermsMatrix) {
+      Set<String> folded = new HashSet<String>();
+      for (String equalTerm : equalTerms) {
+        folded.add(fold(equalTerm));
+      }
+      assertEquals("variants did not fold to a single term: " + folded, 1, folded.size());
+    }
+  }
+
+  /**
+   * Runs value as a single keyword token through the filter and returns the folded term.
+   */
+  private String fold(String value) throws Exception {
+    TokenStream ts = new ScandinavianFoldingFilter(new KeywordTokenizer(new StringReader(value)));
+    try {
+      CharTermAttribute attribute = ts.getAttribute(CharTermAttribute.class);
+      ts.reset();
+      assertTrue("no token produced for: " + value, ts.incrementToken());
+      String folded = new String(attribute.buffer(), 0, attribute.length());
+      ts.end();
+      return folded;
+    } finally {
+      ts.close();
+    }
+  }
+}