Index: analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.java
===================================================================
--- analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.java (revision 0)
+++ analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.java (arbetskopia)
@@ -0,0 +1,138 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.StemmerUtil;
+
+import java.io.IOException;
+
+/**
+ * This filter folds Scandinavian characters åÅäæÄÆ->a and öÖøØ->o.
+ * It also discriminate against use of double vowels aa, ae, ao, oe and oo, leaving just the first one.
+ *
+ * It's is a semantically more destructive solution than {@link ScandinavianNormalizationFilter} but
+ * can in addition help with matching raksmorgas as räksmörgås.
+ *
+ * blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej == blabarsyltetoj
+ * räksmörgås == ræksmørgås == ræksmörgaos == raeksmoergaas == raksmorgas
+ *
+ * Background:
+ * Swedish åäö are in fact the same letters as Norwegian and Danish åæø and thus interchangeable
+ * when used between these languages. They are however folded differently when people type
+ * them on a keyboard lacking these characters.
+ *
+ * In that situation almost all Swedish people use a, a, o instead of å, ä, ö.
+ *
+ * Norwegians and Danes on the other hand usually type aa, ae and oe instead of å, æ and ø.
+ * Some do however use a, a, o, oo, ao and sometimes permutations of everything above.
+ *
+ * This filter solves that mismatch problem, but might also cause new.
+ *
+ * @see ScandinavianNormalizationFilter
+ */
+public final class ScandinavianFoldingFilter extends TokenFilter {
+
+ public ScandinavianFoldingFilter(TokenStream input) {
+ super(input);
+ }
+
+ private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
+
+ private static final char AA = '\u00C5'; // Å
+ private static final char aa = '\u00E5'; // å
+ private static final char AE = '\u00C6'; // Æ
+ private static final char ae = '\u00E6'; // æ
+ private static final char AE_se = '\u00C4'; // Ä
+ private static final char ae_se = '\u00E4'; // ä
+ private static final char OE = '\u00D8'; // Ø
+ private static final char oe = '\u00F8'; // ø
+ private static final char OE_se = '\u00D6'; // Ö
+ private static final char oe_se = '\u00F6'; //ö
+
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (!input.incrementToken()) {
+ return false;
+ }
+
+ char[] buffer = charTermAttribute.buffer();
+ int length = charTermAttribute.length();
+
+
+ int i;
+ for (i = 0; i < length; i++) {
+
+ if (buffer[i] == aa
+ || buffer[i] == ae_se
+ || buffer[i] == ae) {
+
+ buffer[i] = 'a';
+
+ } else if (buffer[i] == AA
+ || buffer[i] == AE_se
+ || buffer[i] == AE) {
+
+ buffer[i] = 'A';
+
+ } else if (buffer[i] == oe
+ || buffer[i] == oe_se) {
+
+ buffer[i] = 'o';
+
+ } else if (buffer[i] == OE
+ || buffer[i] == OE_se) {
+
+ buffer[i] = 'O';
+
+ } else if (length - 1 > i) {
+
+ if ((buffer[i] == 'a' || buffer[i] == 'A')
+ && (buffer[i + 1] == 'a'
+ || buffer[i + 1] == 'A'
+ || buffer[i + 1] == 'e'
+ || buffer[i + 1] == 'E'
+ || buffer[i + 1] == 'o'
+ || buffer[i + 1] == 'O')
+ ) {
+
+ length = StemmerUtil.delete(buffer, i + 1, length);
+
+ } else if ((buffer[i] == 'o' || buffer[i] == 'O')
+ && (buffer[i + 1] == 'e'
+ || buffer[i + 1] == 'E'
+ || buffer[i + 1] == 'o'
+ || buffer[i + 1] == 'O')
+ ) {
+
+ length = StemmerUtil.delete(buffer, i + 1, length);
+
+ }
+ }
+ }
+
+ charTermAttribute.setLength(length);
+
+
+ return true;
+ }
+
+}
Index: analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilterFactory.java
===================================================================
--- analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilterFactory.java (revision 0)
+++ analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilterFactory.java (arbetskopia)
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+import java.util.Map;
+
+/**
+ * Factory for {@link ScandinavianFoldingFilter}.
+ *
+ * <fieldType name="text_scandfold" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ * <filter class="solr.ScandinavianFoldingFilterFactory"/>
+ * </analyzer>
+ * </fieldType>
+ */
+public class ScandinavianFoldingFilterFactory extends TokenFilterFactory {
+
+ public ScandinavianFoldingFilterFactory(Map args) {
+ super(args);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ @Override
+ public ScandinavianFoldingFilter create(TokenStream input) {
+ return new ScandinavianFoldingFilter(input);
+ }
+}
Index: analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.java
===================================================================
--- analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.java (revision 0)
+++ analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.java (arbetskopia)
@@ -0,0 +1,122 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.StemmerUtil;
+
+import java.io.IOException;
+
+/**
+ * This filter normalize use of the interchangeable Scandinavian characters æÆäÄöÖøØ
+ * and folded variants (aa, ao, ae, oe and oo) by transforming them to åÅæÆøØ.
+ *
+ * It's a semantically less destructive solution than {@link ScandinavianFoldingFilter},
+ * most useful when a person with a Norwegian or Danish keyboard queries a Swedish index
+ * and vice versa. This filter does not the common Swedish folds of å and ä to a nor ö to o.
+ *
+ * blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej but not blabarsyltetoj
+ * räksmörgås == ræksmørgås == ræksmörgaos == raeksmoergaas but not raksmorgas
+ *
+ * @see ScandinavianFoldingFilter
+ */
+public final class ScandinavianNormalizationFilter extends TokenFilter {
+
+ public ScandinavianNormalizationFilter(TokenStream input) {
+ super(input);
+ }
+
+ private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
+
+ private static final char AA = '\u00C5'; // Å
+ private static final char aa = '\u00E5'; // å
+ private static final char AE = '\u00C6'; // Æ
+ private static final char ae = '\u00E6'; // æ
+ private static final char AE_se = '\u00C4'; // Ä
+ private static final char ae_se = '\u00E4'; // ä
+ private static final char OE = '\u00D8'; // Ø
+ private static final char oe = '\u00F8'; // ø
+ private static final char OE_se = '\u00D6'; // Ö
+ private static final char oe_se = '\u00F6'; //ö
+
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (!input.incrementToken()) {
+ return false;
+ }
+
+ char[] buffer = charTermAttribute.buffer();
+ int length = charTermAttribute.length();
+
+
+ int i;
+ for (i = 0; i < length; i++) {
+
+ if (buffer[i] == ae_se) {
+ buffer[i] = ae;
+
+ } else if (buffer[i] == AE_se) {
+ buffer[i] = AE;
+
+ } else if (buffer[i] == oe_se) {
+ buffer[i] = oe;
+
+ } else if (buffer[i] == OE_se) {
+ buffer[i] = OE;
+
+ } else if (length - 1 > i) {
+
+ if (buffer[i] == 'a' && (buffer[i + 1] == 'a' || buffer[i + 1] == 'o' || buffer[i + 1] == 'A' || buffer[i + 1] == 'O')) {
+ length = StemmerUtil.delete(buffer, i + 1, length);
+ buffer[i] = aa;
+
+ } else if (buffer[i] == 'A' && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A' || buffer[i + 1] == 'o' || buffer[i + 1] == 'O')) {
+ length = StemmerUtil.delete(buffer, i + 1, length);
+ buffer[i] = AA;
+
+ } else if (buffer[i] == 'a' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
+ length = StemmerUtil.delete(buffer, i + 1, length);
+ buffer[i] = ae;
+
+ } else if (buffer[i] == 'A' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
+ length = StemmerUtil.delete(buffer, i + 1, length);
+ buffer[i] = AE;
+
+ } else if (buffer[i] == 'o' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E' || buffer[i + 1] == 'o' || buffer[i + 1] == 'O')) {
+ length = StemmerUtil.delete(buffer, i + 1, length);
+ buffer[i] = oe;
+
+ } else if (buffer[i] == 'O' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E' || buffer[i + 1] == 'o' || buffer[i + 1] == 'O')) {
+ length = StemmerUtil.delete(buffer, i + 1, length);
+ buffer[i] = OE;
+
+ }
+
+ }
+ }
+
+ charTermAttribute.setLength(length);
+
+
+ return true;
+ }
+
+}
Index: analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilterFactory.java
===================================================================
--- analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilterFactory.java (revision 0)
+++ analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilterFactory.java (arbetskopia)
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+import java.util.Map;
+
+/**
+ * Factory for {@link org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter}.
+ *
+ * <fieldType name="text_scandnorm" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ * <filter class="solr.ScandinavianNormalizationFilterFactory"/>
+ * </analyzer>
+ * </fieldType>
+ */
+public class ScandinavianNormalizationFilterFactory extends TokenFilterFactory {
+
+ public ScandinavianNormalizationFilterFactory(Map args) {
+ super(args);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ @Override
+ public ScandinavianNormalizationFilter create(TokenStream input) {
+ return new ScandinavianNormalizationFilter(input);
+ }
+}
Index: analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
===================================================================
--- analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 1486586)
+++ analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (arbetskopia)
@@ -66,6 +66,8 @@
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
+org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilterFactory
+org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilterFactory
org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory
org.apache.lucene.analysis.ngram.NGramFilterFactory
org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
Index: analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianFoldingFilter.java
===================================================================
--- analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianFoldingFilter.java (revision 0)
+++ analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianFoldingFilter.java (arbetskopia)
@@ -0,0 +1,106 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.*;
+
+import java.io.Reader;
+
+public class TestScandinavianFoldingFilter extends BaseTokenStreamTestCase {
+
+
+ private Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String field, Reader reader) {
+ final Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ final TokenStream stream = new ScandinavianFoldingFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
+
+ public void test() throws Exception {
+
+ checkOneTerm(analyzer, "aeäaeeea", "aaaeea"); // should not cause ArrayOutOfBoundsException
+
+ checkOneTerm(analyzer, "aeäaeeeae", "aaaeea");
+ checkOneTerm(analyzer, "aeaeeeae", "aaeea");
+
+ checkOneTerm(analyzer, "bøen", "boen");
+ checkOneTerm(analyzer, "åene", "aene");
+
+
+ checkOneTerm(analyzer, "blåbærsyltetøj", "blabarsyltetoj");
+ checkOneTerm(analyzer, "blaabaarsyltetoej", "blabarsyltetoj");
+ checkOneTerm(analyzer, "blåbärsyltetöj", "blabarsyltetoj");
+
+ checkOneTerm(analyzer, "raksmorgas", "raksmorgas");
+ checkOneTerm(analyzer, "räksmörgås", "raksmorgas");
+ checkOneTerm(analyzer, "ræksmørgås", "raksmorgas");
+ checkOneTerm(analyzer, "raeksmoergaas", "raksmorgas");
+ checkOneTerm(analyzer, "ræksmörgaos", "raksmorgas");
+
+
+ checkOneTerm(analyzer, "ab", "ab");
+ checkOneTerm(analyzer, "ob", "ob");
+ checkOneTerm(analyzer, "Ab", "Ab");
+ checkOneTerm(analyzer, "Ob", "Ob");
+
+ checkOneTerm(analyzer, "å", "a");
+
+ checkOneTerm(analyzer, "aa", "a");
+ checkOneTerm(analyzer, "aA", "a");
+ checkOneTerm(analyzer, "ao", "a");
+ checkOneTerm(analyzer, "aO", "a");
+
+ checkOneTerm(analyzer, "AA", "A");
+ checkOneTerm(analyzer, "Aa", "A");
+ checkOneTerm(analyzer, "Ao", "A");
+ checkOneTerm(analyzer, "AO", "A");
+
+ checkOneTerm(analyzer, "æ", "a");
+ checkOneTerm(analyzer, "ä", "a");
+
+ checkOneTerm(analyzer, "Æ", "A");
+ checkOneTerm(analyzer, "Ä", "A");
+
+ checkOneTerm(analyzer, "ae", "a");
+ checkOneTerm(analyzer, "aE", "a");
+
+ checkOneTerm(analyzer, "Ae", "A");
+ checkOneTerm(analyzer, "AE", "A");
+
+
+ checkOneTerm(analyzer, "ö", "o");
+ checkOneTerm(analyzer, "ø", "o");
+ checkOneTerm(analyzer, "Ö", "O");
+ checkOneTerm(analyzer, "Ø", "O");
+
+
+ checkOneTerm(analyzer, "oo", "o");
+ checkOneTerm(analyzer, "oe", "o");
+ checkOneTerm(analyzer, "oO", "o");
+ checkOneTerm(analyzer, "oE", "o");
+
+ checkOneTerm(analyzer, "Oo", "O");
+ checkOneTerm(analyzer, "Oe", "O");
+ checkOneTerm(analyzer, "OO", "O");
+ checkOneTerm(analyzer, "OE", "O");
+
+
+ }
+}
Index: analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianFoldingFilterFactory.java
===================================================================
--- analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianFoldingFilterFactory.java (revision 0)
+++ analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianFoldingFilterFactory.java (arbetskopia)
@@ -0,0 +1,45 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+import java.io.Reader;
+import java.io.StringReader;
+
+public class TestScandinavianFoldingFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+ public void testStemming() throws Exception {
+ Reader reader = new StringReader("räksmörgås");
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = tokenFilterFactory("ScandinavianFolding").create(stream);
+ assertTokenStreamContents(stream, new String[] { "raksmorgas" });
+ }
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ tokenFilterFactory("ScandinavianFolding",
+ "bogusArg", "bogusValue");
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
+}
\ No newline at end of file
Index: analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilter.java
===================================================================
--- analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilter.java (revision 0)
+++ analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilter.java (arbetskopia)
@@ -0,0 +1,106 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.*;
+
+import java.io.Reader;
+
+
+public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase {
+
+
+ private Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String field, Reader reader) {
+ final Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
+
+ public void test() throws Exception {
+
+ checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException
+
+ checkOneTerm(analyzer, "aeäaeeeae", "æææeeæ");
+ checkOneTerm(analyzer, "aeaeeeae", "ææeeæ");
+
+ checkOneTerm(analyzer, "bøen", "bøen");
+ checkOneTerm(analyzer, "bOEen", "bØen");
+ checkOneTerm(analyzer, "åene", "åene");
+
+
+ checkOneTerm(analyzer, "blåbærsyltetøj", "blåbærsyltetøj");
+ checkOneTerm(analyzer, "blaabaersyltetöj", "blåbærsyltetøj");
+ checkOneTerm(analyzer, "räksmörgås", "ræksmørgås");
+ checkOneTerm(analyzer, "raeksmörgaos", "ræksmørgås");
+ checkOneTerm(analyzer, "raeksmörgaas", "ræksmørgås");
+ checkOneTerm(analyzer, "raeksmoergås", "ræksmørgås");
+
+
+ checkOneTerm(analyzer, "ab", "ab");
+ checkOneTerm(analyzer, "ob", "ob");
+ checkOneTerm(analyzer, "Ab", "Ab");
+ checkOneTerm(analyzer, "Ob", "Ob");
+
+ checkOneTerm(analyzer, "å", "å");
+
+ checkOneTerm(analyzer, "aa", "å");
+ checkOneTerm(analyzer, "aA", "å");
+ checkOneTerm(analyzer, "ao", "å");
+ checkOneTerm(analyzer, "aO", "å");
+
+ checkOneTerm(analyzer, "AA", "Å");
+ checkOneTerm(analyzer, "Aa", "Å");
+ checkOneTerm(analyzer, "Ao", "Å");
+ checkOneTerm(analyzer, "AO", "Å");
+
+ checkOneTerm(analyzer, "æ", "æ");
+ checkOneTerm(analyzer, "ä", "æ");
+
+ checkOneTerm(analyzer, "Æ", "Æ");
+ checkOneTerm(analyzer, "Ä", "Æ");
+
+ checkOneTerm(analyzer, "ae", "æ");
+ checkOneTerm(analyzer, "aE", "æ");
+
+ checkOneTerm(analyzer, "Ae", "Æ");
+ checkOneTerm(analyzer, "AE", "Æ");
+
+
+ checkOneTerm(analyzer, "ö", "ø");
+ checkOneTerm(analyzer, "ø", "ø");
+ checkOneTerm(analyzer, "Ö", "Ø");
+ checkOneTerm(analyzer, "Ø", "Ø");
+
+
+ checkOneTerm(analyzer, "oo", "ø");
+ checkOneTerm(analyzer, "oe", "ø");
+ checkOneTerm(analyzer, "oO", "ø");
+ checkOneTerm(analyzer, "oE", "ø");
+
+ checkOneTerm(analyzer, "Oo", "Ø");
+ checkOneTerm(analyzer, "Oe", "Ø");
+ checkOneTerm(analyzer, "OO", "Ø");
+ checkOneTerm(analyzer, "OE", "Ø");
+
+
+ }
+
+}
Index: analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilterFactory.java
===================================================================
--- analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilterFactory.java (revision 0)
+++ analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizationFilterFactory.java (arbetskopia)
@@ -0,0 +1,45 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+import java.io.Reader;
+import java.io.StringReader;
+
+public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+ public void testStemming() throws Exception {
+ Reader reader = new StringReader("räksmörgås");
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = tokenFilterFactory("ScandinavianNormalization").create(stream);
+ assertTokenStreamContents(stream, new String[] { "ræksmørgås" });
+ }
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ tokenFilterFactory("ScandinavianNormalization",
+ "bogusArg", "bogusValue");
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
+}
\ No newline at end of file