Index: analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianInterintelligableASCIIFoldingFilter.java =================================================================== --- analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianInterintelligableASCIIFoldingFilter.java (revision 0) +++ analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianInterintelligableASCIIFoldingFilter.java (arbetskopia) @@ -0,0 +1,116 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +import java.io.IOException; + +/** + * This filter augments the output of ASCIIFoldingFilter: + * it discriminates against the double vowels aa, ae, ao, oe and oo, leaving just the first one. + *

+ * blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej == blabarsyltetoj + * räksmörgås == ræksmørgås == ræksmörgaos == raeksmoergaas == raksmorgas + *

+ * Caveats: + * Since this filter runs on top of ASCIIFoldingFilter, äöåøæ have already been folded + * down to aoaoae by the time this filter handles them, which causes effects such as: + * bøen -> boen -> bon + * åene -> aene -> ane + * I find this to be a trivial problem compared to not finding anything at all. + *

+ * Background: + * Swedish åäö are in fact the same letters as Norwegian and Danish åæø and thus interchangeable + * when used between these languages. They are however folded differently when people type + * them on a keyboard lacking these characters, and {@link ASCIIFoldingFilter} handles ä and æ differently. + *

+ * When a Swedish person is lacking umlauted characters on the keyboard + * they consistently type a, a, o instead of å, ä, ö. Foreigners also tend to use a, a, o. + *

+ * In Norway people tend to type aa, ae and oe instead of å, æ and ø. + * Some use a, a, o. I've also seen oo, ao, etc. And permutations. + * Not sure about Denmark but the pattern is probably the same. + *

+ * This filter solves that problem, but might also cause new ones.
+ */
+public class ScandinavianInterintelligableASCIIFoldingFilter extends TokenFilter {
+
+  private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
+
+  /** Creates a filter that ASCII-folds {@code input} and then collapses its double vowels. */
+  public ScandinavianInterintelligableASCIIFoldingFilter(TokenStream input) {
+    super(new ASCIIFoldingFilter(input));
+  }
+
+  /**
+   * Advances to the next token and collapses each aa, ae, ao, oe and oo pair
+   * (in any letter case) to its first letter.
+   * <p>
+   * The term is rewritten in place: the write position never overtakes the read
+   * position, so no scratch buffer is needed and terms of any length are handled.
+   *
+   * @return {@code false} when the wrapped stream is exhausted, {@code true} otherwise
+   * @throws IOException if the wrapped stream fails
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+
+    final char[] buffer = charTermAttribute.buffer();
+    final int length = charTermAttribute.length();
+
+    int writePosition = 0;
+    for (int readPosition = 0; readPosition < length; readPosition++) {
+      final char current = buffer[readPosition];
+      buffer[writePosition++] = current;
+      // Drop the second vowel of a folded pair by skipping over it.
+      if (readPosition + 1 < length && isFoldedPair(current, buffer[readPosition + 1])) {
+        readPosition++;
+      }
+    }
+    charTermAttribute.setLength(writePosition);
+
+    return true;
+  }
+
+  /** Returns true if {@code first} followed by {@code second} is one of aa, ae, ao, oe, oo. */
+  private static boolean isFoldedPair(char first, char second) {
+    switch (first) {
+      case 'a':
+      case 'A':
+        return second == 'a' || second == 'A' || isEOrO(second);
+      case 'o':
+      case 'O':
+        return isEOrO(second);
+      default:
+        return false;
+    }
+  }
+
+  /** Returns true if {@code c} is the letter e or o, in either case. */
+  private static boolean isEOrO(char c) {
+    return c == 'e' || c == 'E' || c == 'o' || c == 'O';
+  }
+
+}
+
Index: analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianInterintelligableASCIIFoldingFilter.java =================================================================== --- analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianInterintelligableASCIIFoldingFilter.java (revision 0) +++ 
analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianInterintelligableASCIIFoldingFilter.java (arbetskopia) @@ -0,0 +1,74 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+/** Tests the folding performed by {@link ScandinavianInterintelligableASCIIFoldingFilter}. */
+public class TestScandinavianInterintelligableASCIIFoldingFilter extends TestCase {
+
+  /** Checks exact folding results and that interchangeable spellings fold to one term. */
+  public void test() throws Exception {
+    // ae -> a, ä -> a (in input ascii folding) -> aa -> a, eee -> eee, ae -> a
+    assertEquals("aaeeea", fold("aeäaeeeae"));
+
+    // known caveats of folding after ASCII folding: bøen -> boen -> bon, åene -> aene -> ane
+    assertEquals("bon", fold("bøen"));
+    assertEquals("ane", fold("åene"));
+
+    assertEquals("aaeea", fold("aeaeeeae"));
+    assertEquals("blabarsyltetoj", fold("blåbærsyltetøj"));
+    assertEquals("raksmorgas", fold("räksmörgås"));
+    assertEquals("raksmorgas", fold("raeksmörgaos"));
+    assertEquals("raksmorgas", fold("raeksmoergås"));
+
+    // every spelling within a row must fold to the same term
+    String[][] equalTermsMatrix = new String[][]{
+        new String[]{"blåbærsyltetøj", "blaabaarsyltetoej", "blåbärsyltetöj"},
+        new String[]{"räksmörgås", "raksmorgas", "ræksmørgås", "raeksmoergaas", "ræksmörgaos"},
+    };
+
+    for (String[] equalTerms : equalTermsMatrix) {
+      Set<String> folded = new HashSet<>();
+      for (String equalTerm : equalTerms) {
+        folded.add(fold(equalTerm));
+      }
+      assertEquals("expected a single folded form but got " + folded, 1, folded.size());
+    }
+  }
+
+  /** Runs {@code value} as a single keyword token through the filter and returns the folded term. */
+  private String fold(String value) throws Exception {
+    try (TokenStream ts = new ScandinavianInterintelligableASCIIFoldingFilter(new KeywordTokenizer(new StringReader(value)))) {
+      CharTermAttribute attribute = ts.getAttribute(CharTermAttribute.class);
+      ts.reset(); // the TokenStream workflow requires reset() before the first incrementToken()
+      assertTrue(ts.incrementToken());
+      String folded = new String(attribute.buffer(), 0, attribute.length());
+      ts.end();
+      return folded;
+    }
+  }
+}