Index: contrib/phonetics/pom.xml.template =================================================================== --- contrib/phonetics/pom.xml.template (revision 0) +++ contrib/phonetics/pom.xml.template (revision 0) @@ -0,0 +1,42 @@ + + + + 4.0.0 + + org.apache.lucene + lucene-contrib + @version@ + + org.apache.lucene + lucene-phonetics + Lucene phonetics + @version@ + Phonetic algorithm filters module + jar + + + commons-codec + commons-codec + ${commons-codec-version} + + + Index: contrib/phonetics/lib/commons-codec-1.3.jar =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Property changes on: contrib/phonetics/lib/commons-codec-1.3.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Index: contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestDoubleMetaphoneFilter.java =================================================================== --- contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestDoubleMetaphoneFilter.java (revision 0) +++ contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestDoubleMetaphoneFilter.java (revision 0) @@ -0,0 +1,196 @@ +package org.apache.lucene.analysis.phonetics; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; +import org.apache.commons.codec.language.DoubleMetaphone; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; + +import java.io.IOException; +import java.io.StringReader; +
/**
 * Tests {@link DoubleMetaphoneFilter} in its three modes (standard, alternative, all)
 * against whitespace-tokenized input, checking emitted terms, position increments and,
 * where relevant, token types.
 *
 * Non-ASCII input characters are written as Unicode escapes so the test does not depend
 * on the encoding the compiler assumes for this source file.
 */
public class TestDoubleMetaphoneFilter extends TestCase {

  /** Token type expected on tokens that carry the primary code. */
  private static final String STANDARD_TYPE = "Standard double metaphone expression";

  /** Token type expected on tokens that carry the secondary code. */
  private static final String ALTERNATIVE_TYPE = "Alternative double metaphone expression";

  /** Digits must not produce codes of their own, whatever the mode. */
  public void testNumeric() throws Exception {

    DoubleMetaphone codec = new DoubleMetaphone();
    codec.setMaxCodeLen(10);

    // The digit embedded in the first word is expected to leave its code untouched.
    assertAllCodes(codec, "matt1as fahlstr\u00f6m", new String[] {"MTS", "FLSTRM"}, new int[] {1, 1});

    // A token consisting only of digits is expected to be dropped in every mode.
    assertNextNull(new DoubleMetaphoneFilter(tokenize("12345"), DoubleMetaphoneFilter.Setting.standard, codec));
    assertNextNull(new DoubleMetaphoneFilter(tokenize("12345"), DoubleMetaphoneFilter.Setting.alternative, codec));
    assertNextNull(new DoubleMetaphoneFilter(tokenize("12345"), DoubleMetaphoneFilter.Setting.all, codec));
  }

  /** Swedish names, including spelling variants that should share codes. */
  public void testSwedish() throws Exception {

    TokenStream ts;

    DoubleMetaphone codec = new DoubleMetaphone();
    codec.setMaxCodeLen(10);

    // standard mode: only the primary code, typed as such
    ts = new DoubleMetaphoneFilter(tokenize("mathias"), DoubleMetaphoneFilter.Setting.standard, codec);
    assertNext(ts, "M0S", 1, STANDARD_TYPE);
    assertNextNull(ts);

    // alternative mode: only the secondary code, typed as such
    ts = new DoubleMetaphoneFilter(tokenize("mathias"), DoubleMetaphoneFilter.Setting.alternative, codec);
    assertNext(ts, "MTS", 1, ALTERNATIVE_TYPE);
    assertNextNull(ts);

    // all mode: primary first, then the secondary stacked on the same position
    ts = new DoubleMetaphoneFilter(tokenize("mathias"), DoubleMetaphoneFilter.Setting.all, codec);
    assertNext(ts, "M0S", 1, STANDARD_TYPE);
    assertNext(ts, "MTS", 0, ALTERNATIVE_TYPE);
    assertNextNull(ts);

    assertAllCodes(codec, "mattias fahlstr\u00f6m", new String[] {"MTS", "FLSTRM"}, new int[] {1, 1});
    assertAllCodes(codec, "mathias valstr\u00f6hm", new String[] {"M0S", "MTS", "FLSTRM"}, new int[] {1, 0, 1});

    assertAllCodes(codec, "christian nyg\u00e5rd", new String[] {"KRSXN", "NKRT"}, new int[] {1, 1});
    assertAllCodes(codec, "kristian nygaard", new String[] {"KRSXN", "NKRT"}, new int[] {1, 1});
    assertAllCodes(codec, "kristina nygaard", new String[] {"KRSTN", "NKRT"}, new int[] {1, 1});

    assertAllCodes(codec, "anders nilsson", new String[] {"ANTRS", "NLSN"}, new int[] {1, 1});
    assertAllCodes(codec, "andreas ski\u00f6ld", new String[] {"ANTRS", "SKLT"}, new int[] {1, 1});
    assertAllCodes(codec, "andreas skj\u00f6ld", new String[] {"ANTRS", "SKLT"}, new int[] {1, 1});
    assertAllCodes(codec, "andreas sk\u00f6ld", new String[] {"ANTRS", "SKLT"}, new int[] {1, 1});

    assertAllCodes(codec, "lars ericksson", new String[] {"LRS", "ARKSN"}, new int[] {1, 1});
    assertAllCodes(codec, "lars erixon", new String[] {"LRS", "ARKSN"}, new int[] {1, 1});
    assertAllCodes(codec, "lars eriksen", new String[] {"LRS", "ARKSN"}, new int[] {1, 1});
  }

  /**
   * Runs <code>text</code> through the filter in <code>all</code> mode and asserts that exactly
   * the given terms come out, each with the position increment at the same index.
   */
  private void assertAllCodes(DoubleMetaphone codec, String text, String[] terms, int[] increments)
      throws IOException {
    TokenStream ts = new DoubleMetaphoneFilter(tokenize(text), DoubleMetaphoneFilter.Setting.all, codec);
    for (int i = 0; i < terms.length; i++) {
      assertNext(ts, terms[i], increments[i]);
    }
    assertNextNull(ts);
  }

  /** Splits <code>text</code> on whitespace. */
  private TokenStream tokenize(String text) throws IOException {
    return new WhitespaceAnalyzer().tokenStream(null, new StringReader(text));
  }

  /** Asserts that the stream has no further tokens. */
  public void assertNextNull(TokenStream ts) throws IOException {
    Token token = ts.next(new Token());
    assertNull(token);
  }

  /** Asserts the term and position increment of the next token. */
  public void assertNext(TokenStream ts, String termValue, int positionIncrement) throws IOException {
    Token token = ts.next(new Token());
    // Fail with a readable message instead of a NullPointerException when the stream is short.
    assertNotNull("stream ended before expected term " + termValue, token);
    assertEquals(termValue, token.term());
    assertEquals(positionIncrement, token.getPositionIncrement());
  }

  /** Asserts the term, position increment and type of the next token. */
  public void assertNext(TokenStream ts, String termValue, int positionIncrement, String type) throws IOException {
    Token token = ts.next(new Token());
    assertNotNull("stream ended before expected term " + termValue, token);
    assertEquals(termValue, token.term());
    assertEquals(positionIncrement, token.getPositionIncrement());
    assertEquals(type, token.type());
  }

  /**
   * Development aid, not called by any test: drains the stream and prints the
   * assertion statements that would match its current output.
   */
  private void factory(TokenStream ts) throws IOException {
    Token token;
    while ((token = ts.next(new Token())) != null) {
      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ");");
    }
    System.out.println("assertNextNull(ts);");
  }

}
Index:
contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestMetaphoneFilter.java =================================================================== --- contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestMetaphoneFilter.java (revision 0) +++ contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestMetaphoneFilter.java (revision 0) @@ -0,0 +1,162 @@ +package org.apache.lucene.analysis.phonetics; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import junit.framework.TestCase; +import org.apache.commons.codec.language.DoubleMetaphone; +import org.apache.commons.codec.language.Metaphone; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; + +import java.io.IOException; +import java.io.StringReader; + +public class TestMetaphoneFilter extends TestCase { + + public void testNumeric() throws Exception { + + TokenStream ts; + + Metaphone codec = new Metaphone(); + codec.setMaxCodeLen(10); + + + ts = tokenize("matt1as fahlström"); + ts = new MetaphoneFilter(ts, codec); + assertNext(ts, "MTS", 1); + assertNext(ts, "FLSTRM", 1); + assertNextNull(ts); + + + ts = tokenize("12345"); + ts = new MetaphoneFilter(ts, codec); + assertNextNull(ts); + + } + + public void testSwedish() throws Exception { + + TokenStream ts; + + Metaphone codec = new Metaphone(); + codec.setMaxCodeLen(10); + + + ts = tokenize("mattias fahlström"); + ts = new MetaphoneFilter(ts, codec); + assertNext(ts, "MTS", 1); + assertNext(ts, "FLSTRM", 1); + assertNextNull(ts); + + ts = tokenize("mathias valströhm"); + ts = new MetaphoneFilter(ts, codec); + assertNext(ts, "M0S", 1); + assertNext(ts, "FLSTRM", 1); + assertNextNull(ts); + + + ts = tokenize("christian nygård"); + ts = new MetaphoneFilter(ts, codec); + assertNext(ts, "XRSXN", 1); + assertNext(ts, "NKRT", 1); + assertNextNull(ts); + + ts = tokenize("kristian nygaard"); + ts = new MetaphoneFilter(ts, codec); + assertNext(ts, "KRSXN", 1); + assertNext(ts, "NKRT", 1); + assertNextNull(ts); + + ts = tokenize("kristina nygaard"); + ts = new MetaphoneFilter(ts, codec); + assertNext(ts, "KRSTN", 1); + assertNext(ts, "NKRT", 1); + assertNextNull(ts); + + + ts = tokenize("anders nilsson"); + ts = new MetaphoneFilter(ts, codec); + assertNext(ts, "ANTRS", 1); + assertNext(ts, "NLSN", 1); + assertNextNull(ts); + + ts = tokenize("andreas skiöld"); + ts = new MetaphoneFilter(ts, codec); + assertNext(ts, 
"ANTRS", 1); + assertNext(ts, "SKLT", 1); + assertNextNull(ts); + + ts = tokenize("andreas skjöld"); + ts = new MetaphoneFilter(ts, codec); + assertNext(ts, "ANTRS", 1); + assertNext(ts, "SKJLT", 1); + assertNextNull(ts); + + ts = tokenize("andreas sköld"); + ts = new MetaphoneFilter(ts, codec); + assertNext(ts, "ANTRS", 1); + assertNext(ts, "SKLT", 1); + assertNextNull(ts); + + + ts = tokenize("lars ericksson"); + ts = new MetaphoneFilter(ts, codec); + assertNext(ts, "LRS", 1); + assertNext(ts, "ERKSN", 1); + assertNextNull(ts); + + ts = tokenize("lars erixon"); + ts = new MetaphoneFilter(ts, codec); + assertNext(ts, "LRS", 1); + assertNext(ts, "ERKSN", 1); + assertNextNull(ts); + + ts = tokenize("lars eriksen"); + ts = new MetaphoneFilter(ts, codec); + assertNext(ts, "LRS", 1); + assertNext(ts, "ERKSN", 1); + assertNextNull(ts); + + } + + private TokenStream tokenize(String text) throws IOException { + return new WhitespaceAnalyzer().tokenStream(null, new StringReader(text)); + } + + public void assertNextNull(TokenStream ts) throws IOException { + Token token = ts.next(new Token()); + assertNull(token); + } + + public void assertNext(TokenStream ts, String termValue, int positionIncrement) throws IOException { + Token token = ts.next(new Token()); + assertEquals(termValue, token.term()); + assertEquals(positionIncrement, token.getPositionIncrement()); + } + + private void factory(TokenStream ts) throws IOException { + Token token; + while ((token = ts.next(new Token())) != null) { + System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ");"); + } + System.out.println("assertNextNull(ts);"); + } + + +} \ No newline at end of file Index: contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestRefinedSoundexFilter.java =================================================================== --- contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestRefinedSoundexFilter.java (revision 0) +++ 
contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestRefinedSoundexFilter.java (revision 0) @@ -0,0 +1,149 @@ +package org.apache.lucene.analysis.phonetics; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; +import org.apache.commons.codec.language.RefinedSoundex; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; + +import java.io.IOException; +import java.io.StringReader; +
/**
 * Tests {@link RefinedSoundexFilter} against whitespace-tokenized input. Besides the
 * emitted codes, it pins down the current behaviour for words containing the Swedish
 * letters o-umlaut and a-ring: the test expects an ArrayIndexOutOfBoundsException to
 * surface from the filter for those tokens.
 *
 * Non-ASCII characters are written as Unicode escapes so the test does not depend
 * on the encoding the compiler assumes for this source file.
 */
public class TestRefinedSoundexFilter extends TestCase {

  /** Digits must not produce codes of their own. */
  public void testNumeric() throws Exception {

    RefinedSoundex codec = new RefinedSoundex();

    // Digits embedded in a word are expected to leave its code untouched.
    assertCodes(codec, "m1atti1as", new String[] {"M80603"});

    // A token consisting only of digits is expected to be dropped.
    assertCodes(codec, "12345", new String[0]);
  }

  /** Swedish names; the ones with non-ASCII letters are expected to blow up. */
  public void testSwedish() throws Exception {

    TokenStream ts;

    RefinedSoundex codec = new RefinedSoundex();

    ts = new RefinedSoundexFilter(tokenize("mattias fahlstr\u00f6m"), codec);
    assertNext(ts, "M80603", 1);
    assertNextOutOfBounds(ts, "Not supposed to handle \u00f6!");

    ts = new RefinedSoundexFilter(tokenize("mathias valstr\u00f6hm"), codec);
    assertNext(ts, "M80603", 1);
    assertNextOutOfBounds(ts, "Not supposed to handle \u00f6!");

    ts = new RefinedSoundexFilter(tokenize("christian nyg\u00e5rd"), codec);
    assertNext(ts, "C30903608", 1);
    assertNextOutOfBounds(ts, "Not supposed to handle \u00e5!");

    assertCodes(codec, "kristian nygaard", new String[] {"K3903608", "N804096"});
    assertCodes(codec, "kristina nygaard", new String[] {"K39036080", "N804096"});

    assertCodes(codec, "anders nilsson", new String[] {"A086093", "N807308"});

    assertCodes(codec, "lars ericksson", new String[] {"L7093", "E090308"});
    assertCodes(codec, "lars erixon", new String[] {"L7093", "E090508"});
    assertCodes(codec, "lars eriksen", new String[] {"L7093", "E090308"});
  }

  /**
   * Runs <code>text</code> through the filter and asserts that exactly the given codes
   * come out, each with position increment 1.
   */
  private void assertCodes(RefinedSoundex codec, String text, String[] expected) throws IOException {
    TokenStream ts = new RefinedSoundexFilter(tokenize(text), codec);
    for (int i = 0; i < expected.length; i++) {
      assertNext(ts, expected[i], 1);
    }
    assertNextNull(ts);
  }

  /**
   * Asserts that pulling the next token throws ArrayIndexOutOfBoundsException;
   * fails with <code>failureMessage</code> if a token (or null) comes back instead.
   */
  private void assertNextOutOfBounds(TokenStream ts, String failureMessage) throws IOException {
    try {
      ts.next(new Token());
      fail(failureMessage);
    } catch (ArrayIndexOutOfBoundsException expected) {
      // all good
    }
  }

  /** Splits <code>text</code> on whitespace. */
  private TokenStream tokenize(String text) throws IOException {
    return new WhitespaceAnalyzer().tokenStream(null, new StringReader(text));
  }

  /** Asserts that the stream has no further tokens. */
  public void assertNextNull(TokenStream ts) throws IOException {
    Token token = ts.next(new Token());
    assertNull(token);
  }

  /** Asserts the term and position increment of the next token. */
  public void assertNext(TokenStream ts, String termValue, int positionIncrement) throws IOException {
    Token token = ts.next(new Token());
    // Fail with a readable message instead of a NullPointerException when the stream is short.
    assertNotNull("stream ended before expected term " + termValue, token);
    assertEquals(termValue, token.term());
    assertEquals(positionIncrement, token.getPositionIncrement());
  }

  /**
   * Development aid, not called by any test: drains the stream and prints the
   * assertion statements that would match its current output.
   */
  private void factory(TokenStream ts) throws IOException {
    Token token;
    while ((token = ts.next(new Token())) != null) {
      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ");");
    }
    System.out.println("assertNextNull(ts);");
  }

}
\ No newline at end of file Index: contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestSoundexFilter.java =================================================================== --- contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestSoundexFilter.java (revision 0) +++ contrib/phonetics/src/test/org/apache/lucene/analysis/phonetics/TestSoundexFilter.java (revision 0) @@ -0,0 +1,175 @@ +package org.apache.lucene.analysis.phonetics; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +import junit.framework.TestCase; +import org.apache.commons.codec.language.Soundex; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; + +import java.io.IOException; +import java.io.StringReader; + +public class TestSoundexFilter extends TestCase { + + public void testNumeric() throws Exception { + + TokenStream ts; + + Soundex codec = new Soundex(); + + ts = tokenize("m1atti1as"); + ts = new SoundexFilter(ts, codec); + assertNext(ts, "M320", 1); + assertNextNull(ts); + + ts = tokenize("12345"); + ts = new SoundexFilter(ts, codec); + assertNextNull(ts); + + } + + + + public void testSwedish() throws Exception { + + TokenStream ts; + + Soundex codec = new Soundex(); + + // the last name is long enough to skip ö? + ts = tokenize("mattias fahlström"); + ts = new SoundexFilter(ts, codec); + assertNext(ts, "M320", 1); + assertNext(ts, "F423", 1); + assertNextNull(ts); + + ts = tokenize("mathias valströhm"); + ts = new SoundexFilter(ts, codec); + assertNext(ts, "M320", 1); + assertNext(ts, "V423", 1); + assertNextNull(ts); + + ts = tokenize("christian nygård"); + ts = new SoundexFilter(ts, codec); + assertNext(ts, "C623", 1); + try { + ts.next(new Token()); + fail("Not supposed to handle å!"); + } catch (IllegalArgumentException e) { + // all good + } + + ts = tokenize("kristian nygaard"); + ts = new SoundexFilter(ts, codec); + assertNext(ts, "K623", 1); + assertNext(ts, "N263", 1); + assertNextNull(ts); + + ts = tokenize("kristina nygaard"); + ts = new SoundexFilter(ts, codec); + assertNext(ts, "K623", 1); + assertNext(ts, "N263", 1); + assertNextNull(ts); + + + ts = tokenize("anders nilsson"); + ts = new SoundexFilter(ts, codec); + assertNext(ts, "A536", 1); + assertNext(ts, "N425", 1); + assertNextNull(ts); + + ts = tokenize("andreas skiöld"); + ts = new SoundexFilter(ts, codec); + assertNext(ts, "A536", 1); + try { + ts.next(new Token()); + fail("Not supposed to 
handle ö!"); + } catch (IllegalArgumentException e) { + // all good + } + assertNextNull(ts); + + ts = tokenize("andreas skjöld"); + ts = new SoundexFilter(ts, codec); + assertNext(ts, "A536", 1); + try { + ts.next(new Token()); + fail("Not supposed to handle ö!"); + } catch (IllegalArgumentException e) { + // all good + } + assertNextNull(ts); + + ts = tokenize("andreas sköld"); + ts = new SoundexFilter(ts, codec); + assertNext(ts, "A536", 1); + try { + ts.next(new Token()); + fail("Not supposed to handle ö!"); + } catch (IllegalArgumentException e) { + // all good + } + assertNextNull(ts); + + + ts = tokenize("lars ericksson"); + ts = new SoundexFilter(ts, codec); + assertNext(ts, "L620", 1); + assertNext(ts, "E625", 1); + assertNextNull(ts); + + ts = tokenize("lars erixon"); + ts = new SoundexFilter(ts, codec); + assertNext(ts, "L620", 1); + assertNext(ts, "E625", 1); + assertNextNull(ts); + + ts = tokenize("lars eriksen"); + ts = new SoundexFilter(ts, codec); + assertNext(ts, "L620", 1); + assertNext(ts, "E625", 1); + assertNextNull(ts); + } + + private TokenStream tokenize(String text) throws IOException { + return new WhitespaceAnalyzer().tokenStream(null, new StringReader(text)); + } + + public void assertNextNull(TokenStream ts) throws IOException { + Token token = ts.next(new Token()); + assertNull(token); + } + + public void assertNext(TokenStream ts, String termValue, int positionIncrement) throws IOException { + Token token = ts.next(new Token()); + assertEquals(termValue, token.term()); + assertEquals(positionIncrement, token.getPositionIncrement()); + } + + private void factory(TokenStream ts) throws IOException { + Token token; + while ((token = ts.next(new Token())) != null) { + System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ");"); + } + System.out.println("assertNextNull(ts);"); + } + + +} \ No newline at end of file Index: 
contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/DoubleMetaphoneFilter.java =================================================================== --- contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/DoubleMetaphoneFilter.java (revision 0) +++ contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/DoubleMetaphoneFilter.java (revision 0) @@ -0,0 +1,133 @@ +package org.apache.lucene.analysis.phonetics; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.commons.codec.language.DoubleMetaphone; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + +import java.io.IOException; + +public class DoubleMetaphoneFilter extends TokenFilter { + + private DoubleMetaphone codec; + + public enum Setting { + standard, + alternative, + all + } + + private Setting setting; + + public DoubleMetaphoneFilter(TokenStream input) { + this(input, Setting.standard); + } + + public DoubleMetaphoneFilter(TokenStream input, Setting setting) { + this(input, setting, new DoubleMetaphone()); + } + + public DoubleMetaphoneFilter(TokenStream input, Setting setting, DoubleMetaphone codec) { + super(input); + this.codec = codec; + this.setting = setting; + } + + private int bufStartOffset; + private int bufEndOffset; + private String bufAlternativeTermValue; + private int bufFlags; + + + public Token next(Token token) throws IOException { + + if (bufAlternativeTermValue != null) { + token.setTermBuffer(bufAlternativeTermValue); + token.setStartOffset(bufStartOffset); + token.setEndOffset(bufEndOffset); + token.setFlags(bufFlags); + token.setPositionIncrement(0); + token.setType("Alternative double metaphone expression"); + bufAlternativeTermValue = null; + return token; + } + + token = input.next(token); + if (token == null) { + return null; + } + + if (setting == Setting.standard || setting == Setting.all) { + + String standard = getCodec().doubleMetaphone(token.term(), false); + + if (setting == Setting.all) { + String alternative = getCodec().doubleMetaphone(token.term(), true); + if (!alternative.equals(standard) && !"".equals(alternative)) { + bufStartOffset = token.startOffset(); + bufEndOffset = token.endOffset(); + bufFlags = token.getFlags(); + bufAlternativeTermValue = alternative; + } + } + + if ("".equals(standard)) { + return next(token); + } + + token.setTermBuffer(standard); + token.setType("Standard double metaphone 
expression"); + + + } else if (setting == Setting.alternative) { + + String alternative = getCodec().doubleMetaphone(token.term(), true); + + if ("".equals(alternative)) { + return next(token); + } + + token.setTermBuffer(alternative); + token.setType("Alternative double metaphone expression"); + + } else { + throw new IllegalArgumentException("Unknown setting: " + getSetting()); + } + + return token; + + } + + public DoubleMetaphone getCodec() { + return codec; + } + + public void setCodec(DoubleMetaphone codec) { + this.codec = codec; + } + + public Setting getSetting() { + return setting; + } + + public void setSetting(Setting setting) { + this.setting = setting; + } +} \ No newline at end of file Index: contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/MetaphoneFilter.java =================================================================== --- contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/MetaphoneFilter.java (revision 0) +++ contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/MetaphoneFilter.java (revision 0) @@ -0,0 +1,63 @@ +package org.apache.lucene.analysis.phonetics; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.commons.codec.language.Metaphone; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + +import java.io.IOException; + +public class MetaphoneFilter extends TokenFilter { + + private Metaphone codec; + + public MetaphoneFilter(TokenStream input) { + this(input, new Metaphone()); + } + + public MetaphoneFilter(TokenStream input, Metaphone metaphone) { + super(input); + this.codec = metaphone; + } + + public Token next(Token token) throws IOException { + token = input.next(token); + if (token == null) { + return null; + } + + String encoded = getCodec().metaphone(token.term()); + if ("".equals(encoded)) { + return next(token); + } + + token.setTermBuffer(encoded); + + return token; + } + + public Metaphone getCodec() { + return codec; + } + + public void setCodec(Metaphone codec) { + this.codec = codec; + } +} Index: contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/RefinedSoundexFilter.java =================================================================== --- contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/RefinedSoundexFilter.java (revision 0) +++ contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/RefinedSoundexFilter.java (revision 0) @@ -0,0 +1,66 @@ +package org.apache.lucene.analysis.phonetics; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.commons.codec.language.Metaphone; +import org.apache.commons.codec.language.Soundex; +import org.apache.commons.codec.language.RefinedSoundex; + +import java.io.IOException; + +public class RefinedSoundexFilter extends TokenFilter { + + private RefinedSoundex codec; + + public RefinedSoundexFilter(TokenStream input) { + this(input, new RefinedSoundex()); + } + + public RefinedSoundexFilter(TokenStream input, RefinedSoundex codec) { + super(input); + this.codec = codec; + } + + public Token next(Token token) throws IOException { + token = input.next(token); + if (token == null) { + return null; + } + + String encoded = getCodec().soundex(token.term()); + + if("".equals(encoded)) { + return next(token); + } + + token.setTermBuffer(encoded); + + return token; + } + + public RefinedSoundex getCodec() { + return codec; + } + + public void setCodec(RefinedSoundex codec) { + this.codec = codec; + } +} \ No newline at end of file Index: contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/SoundexFilter.java =================================================================== --- contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/SoundexFilter.java (revision 0) +++ contrib/phonetics/src/java/org/apache/lucene/analysis/phonetics/SoundexFilter.java (revision 0) @@ -0,0 +1,65 @@ +package org.apache.lucene.analysis.phonetics; + +/** + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.commons.codec.language.Metaphone; +import org.apache.commons.codec.language.Soundex; + +import java.io.IOException; + +public class SoundexFilter extends TokenFilter { + + private Soundex codec; + + public SoundexFilter(TokenStream input) { + this(input, new Soundex()); + } + + public SoundexFilter(TokenStream input, Soundex metaphone) { + super(input); + this.codec = metaphone; + } + + public Token next(Token token) throws IOException { + token = input.next(token); + if (token == null) { + return null; + } + + String encoded = getCodec().soundex(token.term()); + + if("".equals(encoded)) { + return next(token); + } + + token.setTermBuffer(encoded); + + return token; + } + + public Soundex getCodec() { + return codec; + } + + public void setCodec(Soundex codec) { + this.codec = codec; + } +} \ No newline at end of file Index: contrib/phonetics/build.xml =================================================================== --- contrib/phonetics/build.xml (revision 0) +++ contrib/phonetics/build.xml (revision 0) @@ -0,0 +1,36 @@ + + + + + + + + Phonetic algorithm 
filters module + + + + + + + + + + Index: lucene-contrib-pom.xml.template =================================================================== --- lucene-contrib-pom.xml.template (revision 729818) +++ lucene-contrib-pom.xml.template (working copy) @@ -44,5 +44,6 @@ 3.1 1.7.0 1.4 + 1.3