Index: contrib/phonetics/pom.xml.template
===================================================================
--- contrib/phonetics/pom.xml.template (revision 0)
+++ contrib/phonetics/pom.xml.template (revision 0)
@@ -0,0 +1,42 @@
+
+
+
+ 4.0.0
+
+ org.apache.lucene
+ lucene-contrib
+ @version@
+
+ org.apache.lucene
+ lucene-phonetics
+ Lucene phonetics
+ @version@
+ Phonetic algorithm filters module
+ jar
+
+
+ commons-codec
+ commons-codec
+ ${commons-codec-version}
+
+
+
Index: contrib/phonetics/lib/commons-codec-1.3.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: contrib/phonetics/lib/commons-codec-1.3.jar
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestDoubleMetaphoneFilter.java
===================================================================
--- contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestDoubleMetaphoneFilter.java (revision 0)
+++ contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestDoubleMetaphoneFilter.java (revision 0)
@@ -0,0 +1,196 @@
+package org.apache.lucene.analysis.phonetics;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.commons.codec.language.DoubleMetaphone;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+public class TestDoubleMetaphoneFilter extends TestCase {
+
+ public void testNumeric() throws Exception {
+
+ TokenStream ts;
+
+ DoubleMetaphone codec = new DoubleMetaphone();
+ codec.setMaxCodeLen(10);
+
+ ts = tokenize("matt1as fahlström");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "MTS", 1);
+ assertNext(ts, "FLSTRM", 1);
+ assertNextNull(ts);
+
+
+ ts = tokenize("12345");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.standard, codec);
+ assertNextNull(ts);
+
+ ts = tokenize("12345");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.alternative, codec);
+ assertNextNull(ts);
+
+ ts = tokenize("12345");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNextNull(ts);
+
+ }
+
+ public void testSwedish() throws Exception {
+
+ TokenStream ts;
+
+ DoubleMetaphone codec = new DoubleMetaphone();
+ codec.setMaxCodeLen(10);
+
+
+ ts = tokenize("mathias");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.standard, codec);
+ assertNext(ts, "M0S", 1, "Standard double metaphone expression");
+ assertNextNull(ts);
+
+ ts = tokenize("mathias");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.alternative, codec);
+ assertNext(ts, "MTS", 1, "Alternative double metaphone expression");
+ assertNextNull(ts);
+
+
+ ts = tokenize("mathias");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "M0S", 1, "Standard double metaphone expression");
+ assertNext(ts, "MTS", 0, "Alternative double metaphone expression");
+ assertNextNull(ts);
+
+
+
+
+ ts = tokenize("mattias fahlström");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "MTS", 1);
+ assertNext(ts, "FLSTRM", 1);
+ assertNextNull(ts);
+
+ ts = tokenize("mathias valströhm");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "M0S", 1);
+ assertNext(ts, "MTS", 0);
+ assertNext(ts, "FLSTRM", 1);
+ assertNextNull(ts);
+
+
+ ts = tokenize("christian nygård");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "KRSXN", 1);
+ assertNext(ts, "NKRT", 1);
+ assertNextNull(ts);
+
+ ts = tokenize("kristian nygaard");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "KRSXN", 1);
+ assertNext(ts, "NKRT", 1);
+ assertNextNull(ts);
+
+ ts = tokenize("kristina nygaard");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "KRSTN", 1);
+ assertNext(ts, "NKRT", 1);
+ assertNextNull(ts);
+
+
+ ts = tokenize("anders nilsson");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "ANTRS", 1);
+ assertNext(ts, "NLSN", 1);
+ assertNextNull(ts);
+
+ ts = tokenize("andreas skiöld");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "ANTRS", 1);
+ assertNext(ts, "SKLT", 1);
+ assertNextNull(ts);
+
+ ts = tokenize("andreas skjöld");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "ANTRS", 1);
+ assertNext(ts, "SKLT", 1);
+ assertNextNull(ts);
+
+ ts = tokenize("andreas sköld");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "ANTRS", 1);
+ assertNext(ts, "SKLT", 1);
+ assertNextNull(ts);
+
+
+ ts = tokenize("lars ericksson");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "LRS", 1);
+ assertNext(ts, "ARKSN", 1);
+ assertNextNull(ts);
+
+ ts = tokenize("lars erixon");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "LRS", 1);
+ assertNext(ts, "ARKSN", 1);
+ assertNextNull(ts);
+
+ ts = tokenize("lars eriksen");
+ ts = new DoubleMetaphoneFilter(ts, DoubleMetaphoneFilter.Setting.all, codec);
+ assertNext(ts, "LRS", 1);
+ assertNext(ts, "ARKSN", 1);
+ assertNextNull(ts);
+
+ }
+
+ private TokenStream tokenize(String text) throws IOException {
+ return new WhitespaceAnalyzer().tokenStream(null, new StringReader(text));
+ }
+
+ public void assertNextNull(TokenStream ts) throws IOException {
+ Token token = ts.next(new Token());
+ assertNull(token);
+ }
+
+ public void assertNext(TokenStream ts, String termValue, int positionIncrement) throws IOException {
+ Token token = ts.next(new Token());
+ assertEquals(termValue, token.term());
+ assertEquals(positionIncrement, token.getPositionIncrement());
+ }
+
+ public void assertNext(TokenStream ts, String termValue, int positionIncrement, String type) throws IOException {
+ Token token = ts.next(new Token());
+ assertEquals(termValue, token.term());
+ assertEquals(positionIncrement, token.getPositionIncrement());
+ assertEquals(type, token.type());
+ }
+
+ private void factory(TokenStream ts) throws IOException {
+ Token token;
+ while ((token = ts.next(new Token())) != null) {
+ System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ");");
+ }
+ System.out.println("assertNextNull(ts);");
+ }
+
+
+}
Index: contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestMetaphoneFilter.java
===================================================================
--- contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestMetaphoneFilter.java (revision 0)
+++ contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestMetaphoneFilter.java (revision 0)
@@ -0,0 +1,162 @@
+package org.apache.lucene.analysis.phonetics;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.commons.codec.language.DoubleMetaphone;
+import org.apache.commons.codec.language.Metaphone;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Tests for {@link MetaphoneFilter}: digit handling and the codes produced
+ * for a set of Swedish names.
+ */
+public class TestMetaphoneFilter extends TestCase {
+
+  /** Digits inside a word do not affect its code; the all-digit token "12345" is dropped. */
+  public void testNumeric() throws Exception {
+    Metaphone codec = newCodec();
+
+    assertCodes(filter("matt1as fahlström", codec), "MTS", "FLSTRM");
+    assertNextNull(filter("12345", codec));
+  }
+
+  /** Spelling variants of Swedish names; unlike Double Metaphone, some variants get different codes. */
+  public void testSwedish() throws Exception {
+    Metaphone codec = newCodec();
+
+    assertCodes(filter("mattias fahlström", codec), "MTS", "FLSTRM");
+    assertCodes(filter("mathias valströhm", codec), "M0S", "FLSTRM");
+
+    assertCodes(filter("christian nygård", codec), "XRSXN", "NKRT");
+    assertCodes(filter("kristian nygaard", codec), "KRSXN", "NKRT");
+    assertCodes(filter("kristina nygaard", codec), "KRSTN", "NKRT");
+
+    assertCodes(filter("anders nilsson", codec), "ANTRS", "NLSN");
+    assertCodes(filter("andreas skiöld", codec), "ANTRS", "SKLT");
+    assertCodes(filter("andreas skjöld", codec), "ANTRS", "SKJLT");
+    assertCodes(filter("andreas sköld", codec), "ANTRS", "SKLT");
+
+    assertCodes(filter("lars ericksson", codec), "LRS", "ERKSN");
+    assertCodes(filter("lars erixon", codec), "LRS", "ERKSN");
+    assertCodes(filter("lars eriksen", codec), "LRS", "ERKSN");
+  }
+
+  /** Creates the codec shared by the streams of one test method, with the code length raised to 10. */
+  private Metaphone newCodec() {
+    Metaphone codec = new Metaphone();
+    codec.setMaxCodeLen(10);
+    return codec;
+  }
+
+  /** Splits {@code text} on whitespace. */
+  private TokenStream tokenize(String text) throws IOException {
+    return new WhitespaceAnalyzer().tokenStream(null, new StringReader(text));
+  }
+
+  /** Whitespace-tokenizes {@code text} and wraps the result in the filter under test. */
+  private TokenStream filter(String text, Metaphone codec) throws IOException {
+    return new MetaphoneFilter(tokenize(text), codec);
+  }
+
+  /** Asserts that {@code ts} yields exactly {@code codes}, each with position increment 1, and then ends. */
+  private void assertCodes(TokenStream ts, String... codes) throws IOException {
+    for (int i = 0; i < codes.length; i++) {
+      assertNext(ts, codes[i], 1);
+    }
+    assertNextNull(ts);
+  }
+
+  /** Asserts that {@code ts} has no more tokens. */
+  public void assertNextNull(TokenStream ts) throws IOException {
+    Token token = ts.next(new Token());
+    assertNull(token);
+  }
+
+  /** Asserts the term and position increment of the next token of {@code ts}. */
+  public void assertNext(TokenStream ts, String termValue, int positionIncrement) throws IOException {
+    Token token = ts.next(new Token());
+    // fail with a readable message rather than a NullPointerException if the stream ended early
+    assertNotNull("stream ended, expected term \"" + termValue + "\"", token);
+    assertEquals(termValue, token.term());
+    assertEquals(positionIncrement, token.getPositionIncrement());
+  }
+
+  /**
+   * Development aid, not called by any test: prints the remaining tokens of
+   * {@code ts} as ready-to-paste {@code assertNext(...)} statements.
+   */
+  private void factory(TokenStream ts) throws IOException {
+    Token token;
+    while ((token = ts.next(new Token())) != null) {
+      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ");");
+    }
+    System.out.println("assertNextNull(ts);");
+  }
+
+}
\ No newline at end of file
Index: contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestRefinedSoundexFilter.java
===================================================================
--- contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestRefinedSoundexFilter.java (revision 0)
+++ contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestRefinedSoundexFilter.java (revision 0)
@@ -0,0 +1,149 @@
+package org.apache.lucene.analysis.phonetics;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.commons.codec.language.RefinedSoundex;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+public class TestRefinedSoundexFilter extends TestCase {
+
+ public void testNumeric() throws Exception {
+
+ TokenStream ts;
+
+ RefinedSoundex codec = new RefinedSoundex();
+
+ ts = tokenize("m1atti1as");
+ ts = new RefinedSoundexFilter(ts, codec);
+ assertNext(ts, "M80603", 1);
+ assertNextNull(ts);
+
+ ts = tokenize("12345");
+ ts = new RefinedSoundexFilter(ts, codec);
+ assertNextNull(ts);
+
+ }
+
+
+
+ public void testSwedish() throws Exception {
+
+ TokenStream ts;
+
+ RefinedSoundex codec = new RefinedSoundex();
+
+ ts = tokenize("mattias fahlström");
+ ts = new RefinedSoundexFilter(ts, codec);
+ assertNext(ts, "M80603", 1);
+ try {
+ ts.next(new Token());
+ fail("Not supposed to handle ö!");
+ } catch (ArrayIndexOutOfBoundsException e) {
+ // all good
+ }
+
+ ts = tokenize("mathias valströhm");
+ ts = new RefinedSoundexFilter(ts, codec);
+ assertNext(ts, "M80603", 1);
+ try {
+ ts.next(new Token());
+ fail("Not supposed to handle ö!");
+ } catch (ArrayIndexOutOfBoundsException e) {
+ // all good
+ }
+
+ ts = tokenize("christian nygård");
+ ts = new RefinedSoundexFilter(ts, codec);
+ assertNext(ts, "C30903608", 1);
+ try {
+ ts.next(new Token());
+ fail("Not supposed to handle å!");
+ } catch (ArrayIndexOutOfBoundsException e) {
+ // all good
+ }
+
+ ts = tokenize("kristian nygaard");
+ ts = new RefinedSoundexFilter(ts, codec);
+ assertNext(ts, "K3903608", 1);
+ assertNext(ts, "N804096", 1);
+ assertNextNull(ts);
+
+ ts = tokenize("kristina nygaard");
+ ts = new RefinedSoundexFilter(ts, codec);
+ assertNext(ts, "K39036080", 1);
+ assertNext(ts, "N804096", 1);
+ assertNextNull(ts);
+
+
+ ts = tokenize("anders nilsson");
+ ts = new RefinedSoundexFilter(ts, codec);
+ assertNext(ts, "A086093", 1);
+ assertNext(ts, "N807308", 1);
+ assertNextNull(ts);
+
+
+ ts = tokenize("lars ericksson");
+ ts = new RefinedSoundexFilter(ts, codec);
+ assertNext(ts, "L7093", 1);
+ assertNext(ts, "E090308", 1);
+ assertNextNull(ts);
+
+ ts = tokenize("lars erixon");
+ ts = new RefinedSoundexFilter(ts, codec);
+ assertNext(ts, "L7093", 1);
+ assertNext(ts, "E090508", 1);
+ assertNextNull(ts);
+
+ ts = tokenize("lars eriksen");
+ ts = new RefinedSoundexFilter(ts, codec);
+ assertNext(ts, "L7093", 1);
+ assertNext(ts, "E090308", 1);
+ assertNextNull(ts);
+ }
+
+ private TokenStream tokenize(String text) throws IOException {
+ return new WhitespaceAnalyzer().tokenStream(null, new StringReader(text));
+ }
+
+ public void assertNextNull(TokenStream ts) throws IOException {
+ Token token = ts.next(new Token());
+ assertNull(token);
+ }
+
+ public void assertNext(TokenStream ts, String termValue, int positionIncrement) throws IOException {
+ Token token = ts.next(new Token());
+ assertEquals(termValue, token.term());
+ assertEquals(positionIncrement, token.getPositionIncrement());
+ }
+
+ private void factory(TokenStream ts) throws IOException {
+ Token token;
+ while ((token = ts.next(new Token())) != null) {
+ System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ");");
+ }
+ System.out.println("assertNextNull(ts);");
+ }
+
+
+}
\ No newline at end of file
Index: contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestSoundexFilter.java
===================================================================
--- contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestSoundexFilter.java (revision 0)
+++ contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestSoundexFilter.java (revision 0)
@@ -0,0 +1,175 @@
+package org.apache.lucene.analysis.phonetics;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.commons.codec.language.Soundex;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Tests for {@link SoundexFilter}: digit handling, the codes produced for a
+ * set of Swedish names, and the exception that escapes the filter for some
+ * names containing ö or å.
+ */
+public class TestSoundexFilter extends TestCase {
+
+  /** Digits inside a word do not affect its code; an all-digit token is dropped. */
+  public void testNumeric() throws Exception {
+    Soundex codec = new Soundex();
+
+    assertCodes(filter("m1atti1as", codec), "M320");
+    assertNextNull(filter("12345", codec));
+  }
+
+  /**
+   * Swedish names. For several names containing ö or å the codec is expected
+   * to throw, and that exception propagates out of the filter unchanged.
+   */
+  public void testSwedish() throws Exception {
+    Soundex codec = new Soundex();
+    TokenStream ts;
+
+    // These two encode without error although they contain ö; presumably the
+    // four-character code is already complete before the ö is reached.
+    assertCodes(filter("mattias fahlström", codec), "M320", "F423");
+    assertCodes(filter("mathias valströhm", codec), "M320", "V423");
+
+    ts = filter("christian nygård", codec);
+    assertNext(ts, "C623", 1);
+    assertNextThrowsIllegalArgument(ts, "Not supposed to handle å!");
+
+    assertCodes(filter("kristian nygaard", codec), "K623", "N263");
+    assertCodes(filter("kristina nygaard", codec), "K623", "N263");
+
+    assertCodes(filter("anders nilsson", codec), "A536", "N425");
+
+    ts = filter("andreas skiöld", codec);
+    assertNext(ts, "A536", 1);
+    assertNextThrowsIllegalArgument(ts, "Not supposed to handle ö!");
+    assertNextNull(ts);
+
+    ts = filter("andreas skjöld", codec);
+    assertNext(ts, "A536", 1);
+    assertNextThrowsIllegalArgument(ts, "Not supposed to handle ö!");
+    assertNextNull(ts);
+
+    ts = filter("andreas sköld", codec);
+    assertNext(ts, "A536", 1);
+    assertNextThrowsIllegalArgument(ts, "Not supposed to handle ö!");
+    assertNextNull(ts);
+
+    assertCodes(filter("lars ericksson", codec), "L620", "E625");
+    assertCodes(filter("lars erixon", codec), "L620", "E625");
+    assertCodes(filter("lars eriksen", codec), "L620", "E625");
+  }
+
+  /** Splits {@code text} on whitespace. */
+  private TokenStream tokenize(String text) throws IOException {
+    return new WhitespaceAnalyzer().tokenStream(null, new StringReader(text));
+  }
+
+  /** Whitespace-tokenizes {@code text} and wraps the result in the filter under test. */
+  private TokenStream filter(String text, Soundex codec) throws IOException {
+    return new SoundexFilter(tokenize(text), codec);
+  }
+
+  /** Asserts that {@code ts} yields exactly {@code codes}, each with position increment 1, and then ends. */
+  private void assertCodes(TokenStream ts, String... codes) throws IOException {
+    for (int i = 0; i < codes.length; i++) {
+      assertNext(ts, codes[i], 1);
+    }
+    assertNextNull(ts);
+  }
+
+  /**
+   * Asserts that reading the next token throws IllegalArgumentException;
+   * fails with {@code failMessage} if a token (or end of stream) is returned instead.
+   */
+  private void assertNextThrowsIllegalArgument(TokenStream ts, String failMessage) throws IOException {
+    try {
+      ts.next(new Token());
+    } catch (IllegalArgumentException expected) {
+      return; // all good
+    }
+    fail(failMessage);
+  }
+
+  /** Asserts that {@code ts} has no more tokens. */
+  public void assertNextNull(TokenStream ts) throws IOException {
+    Token token = ts.next(new Token());
+    assertNull(token);
+  }
+
+  /** Asserts the term and position increment of the next token of {@code ts}. */
+  public void assertNext(TokenStream ts, String termValue, int positionIncrement) throws IOException {
+    Token token = ts.next(new Token());
+    // fail with a readable message rather than a NullPointerException if the stream ended early
+    assertNotNull("stream ended, expected term \"" + termValue + "\"", token);
+    assertEquals(termValue, token.term());
+    assertEquals(positionIncrement, token.getPositionIncrement());
+  }
+
+  /**
+   * Development aid, not called by any test: prints the remaining tokens of
+   * {@code ts} as ready-to-paste {@code assertNext(...)} statements.
+   */
+  private void factory(TokenStream ts) throws IOException {
+    Token token;
+    while ((token = ts.next(new Token())) != null) {
+      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ");");
+    }
+    System.out.println("assertNextNull(ts);");
+  }
+
+}
\ No newline at end of file
Index: contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/DoubleMetaphoneFilter.java
===================================================================
--- contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/DoubleMetaphoneFilter.java (revision 0)
+++ contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/DoubleMetaphoneFilter.java (revision 0)
@@ -0,0 +1,133 @@
+package org.apache.lucene.analysis.phonetics;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.commons.codec.language.DoubleMetaphone;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.IOException;
+
+public class DoubleMetaphoneFilter extends TokenFilter {
+
+ private DoubleMetaphone codec;
+
+ public enum Setting {
+ standard,
+ alternative,
+ all
+ }
+
+ private Setting setting;
+
+ public DoubleMetaphoneFilter(TokenStream input) {
+ this(input, Setting.standard);
+ }
+
+ public DoubleMetaphoneFilter(TokenStream input, Setting setting) {
+ this(input, setting, new DoubleMetaphone());
+ }
+
+ public DoubleMetaphoneFilter(TokenStream input, Setting setting, DoubleMetaphone codec) {
+ super(input);
+ this.codec = codec;
+ this.setting = setting;
+ }
+
+ private int bufStartOffset;
+ private int bufEndOffset;
+ private String bufAlternativeTermValue;
+ private int bufFlags;
+
+
+ public Token next(Token token) throws IOException {
+
+ if (bufAlternativeTermValue != null) {
+ token.setTermBuffer(bufAlternativeTermValue);
+ token.setStartOffset(bufStartOffset);
+ token.setEndOffset(bufEndOffset);
+ token.setFlags(bufFlags);
+ token.setPositionIncrement(0);
+ token.setType("Alternative double metaphone expression");
+ bufAlternativeTermValue = null;
+ return token;
+ }
+
+ token = input.next(token);
+ if (token == null) {
+ return null;
+ }
+
+ if (setting == Setting.standard || setting == Setting.all) {
+
+ String standard = getCodec().doubleMetaphone(token.term(), false);
+
+ if (setting == Setting.all) {
+ String alternative = getCodec().doubleMetaphone(token.term(), true);
+ if (!alternative.equals(standard) && !"".equals(alternative)) {
+ bufStartOffset = token.startOffset();
+ bufEndOffset = token.endOffset();
+ bufFlags = token.getFlags();
+ bufAlternativeTermValue = alternative;
+ }
+ }
+
+ if ("".equals(standard)) {
+ return next(token);
+ }
+
+ token.setTermBuffer(standard);
+ token.setType("Standard double metaphone expression");
+
+
+ } else if (setting == Setting.alternative) {
+
+ String alternative = getCodec().doubleMetaphone(token.term(), true);
+
+ if ("".equals(alternative)) {
+ return next(token);
+ }
+
+ token.setTermBuffer(alternative);
+ token.setType("Alternative double metaphone expression");
+
+ } else {
+ throw new IllegalArgumentException("Unknown setting: " + getSetting());
+ }
+
+ return token;
+
+ }
+
+ public DoubleMetaphone getCodec() {
+ return codec;
+ }
+
+ public void setCodec(DoubleMetaphone codec) {
+ this.codec = codec;
+ }
+
+ public Setting getSetting() {
+ return setting;
+ }
+
+ public void setSetting(Setting setting) {
+ this.setting = setting;
+ }
+}
\ No newline at end of file
Index: contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/MetaphoneFilter.java
===================================================================
--- contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/MetaphoneFilter.java (revision 0)
+++ contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/MetaphoneFilter.java (revision 0)
@@ -0,0 +1,63 @@
+package org.apache.lucene.analysis.phonetics;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.commons.codec.language.Metaphone;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.IOException;
+
+/**
+ * Token filter that replaces each term with the code returned by
+ * {@link Metaphone#metaphone(String)}.
+ * <p>
+ * Tokens whose code is empty are dropped. The position increment of a
+ * dropped token is not added to the following token.
+ */
+public class MetaphoneFilter extends TokenFilter {
+
+  private Metaphone codec;
+
+  /** Creates a filter that uses a default {@link Metaphone} codec. */
+  public MetaphoneFilter(TokenStream input) {
+    this(input, new Metaphone());
+  }
+
+  /**
+   * @param input     the stream whose terms are encoded
+   * @param metaphone the encoder to use, e.g. one with a custom maximum code length
+   */
+  public MetaphoneFilter(TokenStream input, Metaphone metaphone) {
+    super(input);
+    this.codec = metaphone;
+  }
+
+  /** Returns the next token that has a non-empty code, or null when the input is exhausted. */
+  public Token next(Token token) throws IOException {
+    // Loop instead of recursing so that a long run of tokens without a code
+    // cannot exhaust the stack.
+    for (Token current = input.next(token); current != null; current = input.next(current)) {
+      String encoded = getCodec().metaphone(current.term());
+      if (encoded != null && encoded.length() > 0) {
+        current.setTermBuffer(encoded);
+        return current;
+      }
+    }
+    return null;
+  }
+
+  public Metaphone getCodec() {
+    return codec;
+  }
+
+  public void setCodec(Metaphone codec) {
+    this.codec = codec;
+  }
+}
Index: contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/RefinedSoundexFilter.java
===================================================================
--- contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/RefinedSoundexFilter.java (revision 0)
+++ contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/RefinedSoundexFilter.java (revision 0)
@@ -0,0 +1,66 @@
+package org.apache.lucene.analysis.phonetics;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.commons.codec.language.Metaphone;
+import org.apache.commons.codec.language.Soundex;
+import org.apache.commons.codec.language.RefinedSoundex;
+
+import java.io.IOException;
+
+/**
+ * Token filter that replaces each term with the code returned by
+ * {@link RefinedSoundex#soundex(String)}.
+ * <p>
+ * Tokens whose code is empty are dropped. The position increment of a
+ * dropped token is not added to the following token.
+ * <p>
+ * Exceptions thrown by the codec are not caught here. The tests that
+ * accompany this filter expect an ArrayIndexOutOfBoundsException for terms
+ * containing letters such as ö or å, so input should be folded to plain
+ * A-Z before it reaches this filter.
+ */
+public class RefinedSoundexFilter extends TokenFilter {
+
+  private RefinedSoundex codec;
+
+  /** Creates a filter that uses a default {@link RefinedSoundex} codec. */
+  public RefinedSoundexFilter(TokenStream input) {
+    this(input, new RefinedSoundex());
+  }
+
+  /**
+   * @param input the stream whose terms are encoded
+   * @param codec the encoder to use
+   */
+  public RefinedSoundexFilter(TokenStream input, RefinedSoundex codec) {
+    super(input);
+    this.codec = codec;
+  }
+
+  /** Returns the next token that has a non-empty code, or null when the input is exhausted. */
+  public Token next(Token token) throws IOException {
+    // Loop instead of recursing so that a long run of tokens without a code
+    // cannot exhaust the stack.
+    for (Token current = input.next(token); current != null; current = input.next(current)) {
+      String encoded = getCodec().soundex(current.term());
+      if (encoded != null && encoded.length() > 0) {
+        current.setTermBuffer(encoded);
+        return current;
+      }
+    }
+    return null;
+  }
+
+  public RefinedSoundex getCodec() {
+    return codec;
+  }
+
+  public void setCodec(RefinedSoundex codec) {
+    this.codec = codec;
+  }
+}
\ No newline at end of file
Index: contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/SoundexFilter.java
===================================================================
--- contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/SoundexFilter.java (revision 0)
+++ contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/SoundexFilter.java (revision 0)
@@ -0,0 +1,65 @@
+package org.apache.lucene.analysis.phonetics;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.commons.codec.language.Metaphone;
+import org.apache.commons.codec.language.Soundex;
+
+import java.io.IOException;
+
+/**
+ * Token filter that replaces each term with the code returned by
+ * {@link Soundex#soundex(String)}.
+ * <p>
+ * Tokens whose code is empty are dropped. The position increment of a
+ * dropped token is not added to the following token.
+ * <p>
+ * Exceptions thrown by the codec are not caught here. The tests that
+ * accompany this filter expect an IllegalArgumentException for some terms
+ * containing letters such as ö or å, so input should be folded to plain
+ * A-Z before it reaches this filter.
+ */
+public class SoundexFilter extends TokenFilter {
+
+  private Soundex codec;
+
+  /** Creates a filter that uses a default {@link Soundex} codec. */
+  public SoundexFilter(TokenStream input) {
+    this(input, new Soundex());
+  }
+
+  /**
+   * @param input     the stream whose terms are encoded
+   * @param metaphone the Soundex encoder to use (the parameter name is
+   *                  historical; this is not a Metaphone codec)
+   */
+  public SoundexFilter(TokenStream input, Soundex metaphone) {
+    super(input);
+    this.codec = metaphone;
+  }
+
+  /** Returns the next token that has a non-empty code, or null when the input is exhausted. */
+  public Token next(Token token) throws IOException {
+    // Loop instead of recursing so that a long run of tokens without a code
+    // cannot exhaust the stack.
+    for (Token current = input.next(token); current != null; current = input.next(current)) {
+      String encoded = getCodec().soundex(current.term());
+      if (encoded != null && encoded.length() > 0) {
+        current.setTermBuffer(encoded);
+        return current;
+      }
+    }
+    return null;
+  }
+
+  public Soundex getCodec() {
+    return codec;
+  }
+
+  public void setCodec(Soundex codec) {
+    this.codec = codec;
+  }
+}
\ No newline at end of file
Index: contrib/phonetics/build.xml
===================================================================
--- contrib/phonetics/build.xml (revision 0)
+++ contrib/phonetics/build.xml (revision 0)
@@ -0,0 +1,36 @@
+
+
+
+
+
+
+
+ Phonetic algorithm filters module
+
+
+
+
+
+
+
+
+
+
Index: lucene-contrib-pom.xml.template
===================================================================
--- lucene-contrib-pom.xml.template (revision 729818)
+++ lucene-contrib-pom.xml.template (working copy)
@@ -44,5 +44,6 @@
3.1
1.7.0
1.4
+ 1.3