Index: solr/common-build.xml =================================================================== --- solr/common-build.xml (revision 957093) +++ solr/common-build.xml (working copy) @@ -147,6 +147,7 @@ + @@ -162,6 +163,7 @@ + @@ -181,6 +183,9 @@ + + + @@ -206,6 +211,7 @@ + Index: solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java =================================================================== --- solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java (revision 957093) +++ solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase { Index: solr/src/test/org/apache/solr/analysis/TestCapitalizationFilterFactory.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestCapitalizationFilterFactory.java (revision 957093) +++ solr/src/test/org/apache/solr/analysis/TestCapitalizationFilterFactory.java (working copy) @@ -30,7 +30,7 @@ /** * */ -public class TestCapitalizationFilter extends BaseTokenTestCase { +public class TestCapitalizationFilterFactory extends BaseTokenTestCase { public void testCapitalization() throws Exception { @@ -40,74 +40,78 @@ CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); factory.init( args ); - char[] termBuffer; - termBuffer = "kiTTEN".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length)); - + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kiTTEN"))), + new String[] { "Kitten" }); + factory.forceFirstLetter = true; - termBuffer = "and".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("and"))), + new String[] { "And" }); - termBuffer = "AnD".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced, but it's not a keep word, either + //first is forced, but it's not a keep word, either + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))), + new String[] { "And" }); factory.forceFirstLetter = false; - termBuffer = "AnD".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "And", new String(termBuffer, 0, termBuffer.length)); //first is not forced, but it's not a keep word, either + //first is not forced, but it's not a keep word, either + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))), + new String[] { "And" }); + factory.forceFirstLetter = true; - termBuffer = "big".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "Big", new String(termBuffer, 0, termBuffer.length)); - termBuffer = "BIG".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "BIG", new String(termBuffer, 0, 
termBuffer.length)); - Tokenizer tokenizer = new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan")); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Hello there my name is ryan" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("big"))), + new String[] { "Big" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("BIG"))), + new String[] { "BIG" }); + + assertTokenStreamContents(factory.create( + new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"))), + new String[] { "Hello there my name is ryan" }); + // now each token factory.onlyFirstWord = false; - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan")); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))), + new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" }); // now only the long words factory.minWordLength = 3; - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))), + new String[] { "Hello", "There", "my", "Name", "is", "Ryan" }); // without prefix - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Mckinley" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))), + new String[] { "Mckinley" }); // Now try some prefixes factory = new CapitalizationFilterFactory(); args.put( "okPrefix", "McK" ); // all words factory.init( args ); - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "McKinley" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))), + new String[] { "McKinley" }); // now try some stuff with numbers factory.forceFirstLetter = false; factory.onlyFirstWord = false; - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third"))), + new String[] { "1st", "2nd", "Third" }); - factory.forceFirstLetter = true; - tokenizer = new KeywordTokenizer(new StringReader("the The the" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "The The the" }); + factory.forceFirstLetter = true; + assertTokenStreamContents(factory.create( + new KeywordTokenizer(new StringReader("the The the"))), + new String[] { "The The the" }); } public void testKeepIgnoreCase() throws Exception { @@ -118,21 +122,20 @@ CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); 
factory.init( args ); - char[] termBuffer; - termBuffer = "kiTTEN".toCharArray(); factory.forceFirstLetter = true; - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "KiTTEN", new String(termBuffer, 0, termBuffer.length)); + assertTokenStreamContents(factory.create( + new KeywordTokenizer(new StringReader("kiTTEN"))), + new String[] { "KiTTEN" }); factory.forceFirstLetter = false; - termBuffer = "kiTTEN".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "kiTTEN", new String(termBuffer, 0, termBuffer.length)); + assertTokenStreamContents(factory.create( + new KeywordTokenizer(new StringReader("kiTTEN"))), + new String[] { "kiTTEN" }); factory.keep = null; - termBuffer = "kiTTEN".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length)); + assertTokenStreamContents(factory.create( + new KeywordTokenizer(new StringReader("kiTTEN"))), + new String[] { "Kitten" }); } /** Index: solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java =================================================================== --- solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java (revision 957093) +++ solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java (working copy) @@ -1,67 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.solr.analysis; - -import java.io.StringReader; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; - -public class DoubleMetaphoneFilterTest extends BaseTokenTestCase { - - public void testSize4FalseInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international")); - TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false); - assertTokenStreamContents(filter, new String[] { "ANTR" }); - } - - public void testSize4TrueInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international")); - TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true); - assertTokenStreamContents(filter, new String[] { "international", "ANTR" }); - } - - public void testAlternateInjectFalse() throws Exception { - TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Kuczewski")); - TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false); - assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" }); - } - - public void testSize8FalseInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international")); - TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false); - assertTokenStreamContents(filter, new String[] { "ANTRNXNL" }); - } - - public void testNonConvertableStringsWithInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&")); - TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true); - assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" }); - } - - public void testNonConvertableStringsWithoutInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&")); - TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false); - assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" }); - - // should have something after the stream - stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%& hello")); - filter = new DoubleMetaphoneFilter(stream, 8, false); - assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" }); - } - -} Index: solr/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java (revision 957093) +++ solr/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java (working copy) @@ -1,213 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.solr.analysis; - -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; - - -/** - * - */ -public class TestCapitalizationFilter extends BaseTokenTestCase { - - public void testCapitalization() throws Exception - { - Map args = new HashMap(DEFAULT_VERSION_PARAM); - args.put( CapitalizationFilterFactory.KEEP, "and the it BIG" ); - args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" ); - - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.init( args ); - char[] termBuffer; - termBuffer = "kiTTEN".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length)); - - factory.forceFirstLetter = true; - - termBuffer = "and".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced - - termBuffer = "AnD".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced, but it's not a keep word, either - - factory.forceFirstLetter = false; - termBuffer = "AnD".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "And", new String(termBuffer, 0, termBuffer.length)); //first is not forced, but it's not a keep word, either - - factory.forceFirstLetter = true; - termBuffer = "big".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "Big", new String(termBuffer, 0, termBuffer.length)); - termBuffer = "BIG".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "BIG", new String(termBuffer, 0, termBuffer.length)); - - Tokenizer tokenizer = new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan")); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Hello there my name is ryan" }); - - // now each token - factory.onlyFirstWord = false; - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan")); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" }); - - // now only the long words - factory.minWordLength = 3; - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" }); - - // without prefix - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Mckinley" }); - - // Now try some prefixes - factory = new CapitalizationFilterFactory(); - args.put( "okPrefix", "McK" ); // all words - factory.init( args ); - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "McKinley" }); - - // now try some stuff with numbers - factory.forceFirstLetter = false; - factory.onlyFirstWord = false; - tokenizer = new 
WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" }); - - factory.forceFirstLetter = true; - tokenizer = new KeywordTokenizer(new StringReader("the The the" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "The The the" }); - } - - public void testKeepIgnoreCase() throws Exception { - Map args = new HashMap(DEFAULT_VERSION_PARAM); - args.put( CapitalizationFilterFactory.KEEP, "kitten" ); - args.put( CapitalizationFilterFactory.KEEP_IGNORE_CASE, "true" ); - args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" ); - - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.init( args ); - char[] termBuffer; - termBuffer = "kiTTEN".toCharArray(); - factory.forceFirstLetter = true; - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "KiTTEN", new String(termBuffer, 0, termBuffer.length)); - - factory.forceFirstLetter = false; - termBuffer = "kiTTEN".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "kiTTEN", new String(termBuffer, 0, termBuffer.length)); - - factory.keep = null; - termBuffer = "kiTTEN".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length)); - } - - /** - * Test CapitalizationFilterFactory's minWordLength option. - * - * This is very weird when combined with ONLY_FIRST_WORD!!! - */ - public void testMinWordLength() throws Exception { - Map args = new HashMap(DEFAULT_VERSION_PARAM); - args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true"); - args.put(CapitalizationFilterFactory.MIN_WORD_LENGTH, "5"); - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.init(args); - Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader( - "helo testing")); - TokenStream ts = factory.create(tokenizer); - assertTokenStreamContents(ts, new String[] {"helo", "Testing"}); - } - - /** - * Test CapitalizationFilterFactory's maxWordCount option with only words of 1 - * in each token (it should do nothing) - */ - public void testMaxWordCount() throws Exception { - Map args = new HashMap(DEFAULT_VERSION_PARAM); - args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2"); - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.init(args); - Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader( - "one two three four")); - TokenStream ts = factory.create(tokenizer); - assertTokenStreamContents(ts, new String[] {"One", "Two", "Three", "Four"}); - } - - /** - * Test CapitalizationFilterFactory's maxWordCount option when exceeded - */ - public void testMaxWordCount2() throws Exception { - Map args = new HashMap(DEFAULT_VERSION_PARAM); - args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2"); - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.init(args); - Tokenizer tokenizer = new KeywordTokenizer(new StringReader( - "one two three four")); - TokenStream ts = factory.create(tokenizer); - assertTokenStreamContents(ts, new String[] {"one two three four"}); - } - - /** - * Test CapitalizationFilterFactory's maxTokenLength option when exceeded - * - * This is weird, it is not really a max, but inclusive (look at 'is') - */ - public void testMaxTokenLength() throws Exception { - Map args = new 
HashMap(DEFAULT_VERSION_PARAM); - args.put(CapitalizationFilterFactory.MAX_TOKEN_LENGTH, "2"); - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.init(args); - Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader( - "this is a test")); - TokenStream ts = factory.create(tokenizer); - assertTokenStreamContents(ts, new String[] {"this", "is", "A", "test"}); - } - - /** - * Test CapitalizationFilterFactory's forceFirstLetter option - */ - public void testForceFirstLetter() throws Exception { - Map args = new HashMap(DEFAULT_VERSION_PARAM); - args.put(CapitalizationFilterFactory.KEEP, "kitten"); - args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true"); - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.init(args); - Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kitten")); - TokenStream ts = factory.create(tokenizer); - assertTokenStreamContents(ts, new String[] {"Kitten"}); - } -} Index: solr/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java (revision 957093) +++ solr/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java (working copy) @@ -30,7 +30,7 @@ /** * @version $Id$ */ -public class TestPhoneticFilter extends BaseTokenTestCase { +public class TestPhoneticFilterFactory extends BaseTokenTestCase { public void testFactory() { Index: solr/src/test/org/apache/solr/analysis/TestPhoneticFilter.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestPhoneticFilter.java (revision 957093) +++ solr/src/test/org/apache/solr/analysis/TestPhoneticFilter.java (working copy) @@ -1,102 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.solr.analysis; - -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.commons.codec.language.Metaphone; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; - - -/** - * @version $Id$ - */ -public class TestPhoneticFilter extends BaseTokenTestCase { - - public void testFactory() - { - Map args = new HashMap(); - - PhoneticFilterFactory ff = new PhoneticFilterFactory(); - try { - ff.init( args ); - fail( "missing encoder parameter" ); - } - catch( Exception ex ) {} - args.put( PhoneticFilterFactory.ENCODER, "XXX" ); - try { - ff.init( args ); - fail( "unknown encoder parameter" ); - } - catch( Exception ex ) {} - - args.put( PhoneticFilterFactory.ENCODER, "Metaphone" ); - ff.init( args ); - assertTrue( ff.encoder instanceof Metaphone ); - assertTrue( ff.inject ); // default - - args.put( PhoneticFilterFactory.INJECT, "false" ); - ff.init( args ); - assertFalse( ff.inject ); - } - - public void testAlgorithms() throws Exception { - assertAlgorithm("Metaphone", "true", "aaa bbb ccc easgasg", - new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" }); - assertAlgorithm("Metaphone", "false", "aaa bbb ccc easgasg", - new String[] { "A", "B", "KKK", "ESKS" }); - - assertAlgorithm("DoubleMetaphone", "true", "aaa bbb ccc easgasg", - new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" }); - assertAlgorithm("DoubleMetaphone", "false", "aaa bbb ccc easgasg", - new String[] { "A", "PP", "KK", "ASKS" }); - - assertAlgorithm("Soundex", "true", "aaa bbb ccc easgasg", - new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" }); - assertAlgorithm("Soundex", "false", "aaa bbb ccc easgasg", - new String[] { "A000", "B000", "C000", "E220" }); - - assertAlgorithm("RefinedSoundex", "true", "aaa bbb ccc easgasg", - new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" }); - assertAlgorithm("RefinedSoundex", "false", "aaa bbb ccc easgasg", - new String[] { "A0", "B1", "C3", "E034034" }); - - assertAlgorithm("Caverphone", "true", "Darda Karleen Datha Carlene", - new String[] { "TTA1111111", "Darda", "KLN1111111", "Karleen", - "TTA1111111", "Datha", "KLN1111111", "Carlene" }); - assertAlgorithm("Caverphone", "false", "Darda Karleen Datha Carlene", - new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" }); - } - - static void assertAlgorithm(String algName, String inject, String input, - String[] expected) throws Exception { - Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, - new StringReader(input)); - Map args = new HashMap(); - args.put("encoder", algName); - args.put("inject", inject); - PhoneticFilterFactory factory = new PhoneticFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, expected); - } -} Index: solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java (revision 957093) +++ solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java (working copy) @@ -19,6 +19,7 @@ import java.util.Map; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter; public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory { 
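(The import added above points DoubleMetaphoneFilterFactory at the filter's new home in the analyzers-phonetic module, and the patch also makes the filter's constructor public. A minimal usage sketch of the relocated filter -- not part of this patch; the wrapper class name is illustrative and Version.LUCENE_CURRENT stands in for whatever match version the caller targets. The expected tokens come from DoubleMetaphoneFilterTest later in this patch.)

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class DoubleMetaphoneSketch {
  public static void main(String[] args) throws Exception {
    // maxCodeLength=4, inject=true: the original token is kept and the
    // phonetic code is emitted as an additional token.
    TokenStream ts = new DoubleMetaphoneFilter(
        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("international")),
        4, true);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // "international", then "ANTR"
    }
    ts.close();
  }
}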
Index: solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java (revision 957093) +++ solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java (working copy) @@ -17,11 +17,10 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.*; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter; import org.apache.lucene.analysis.util.CharArraySet; -import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -29,12 +28,8 @@ import java.util.StringTokenizer; /** - * A filter to apply normal capitalization rules to Tokens. It will make the first letter - * capital and the rest lower case. + * Factory for {@link CapitalizationFilter}. *
<p/>
- * This filter is particularly useful to build nice looking facet parameters. This filter
- * is not appropriate if you intend to use a prefix query.
- * <p/>
* The factory takes parameters:<br/>
* "onlyFirstWord" - should each word be capitalized or all of the words?<br/>
* "keep" - a keep word list. Each word that should be kept separated by whitespace.<br/>
@@ -52,7 +47,6 @@ * @since solr 1.3 */ public class CapitalizationFilterFactory extends BaseTokenFilterFactory { - public static final int DEFAULT_MAX_WORD_COUNT = Integer.MAX_VALUE; public static final String KEEP = "keep"; public static final String KEEP_IGNORE_CASE = "keepIgnoreCase"; public static final String OK_PREFIX = "okPrefix"; @@ -68,8 +62,8 @@ Collection okPrefix = Collections.emptyList(); // for Example: McK int minWordLength = 0; // don't modify capitalization for words shorter then this - int maxWordCount = DEFAULT_MAX_WORD_COUNT; - int maxTokenLength = DEFAULT_MAX_WORD_COUNT; + int maxWordCount = CapitalizationFilter.DEFAULT_MAX_WORD_COUNT; + int maxTokenLength = CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH; boolean onlyFirstWord = true; boolean forceFirstLetter = true; // make sure the first letter is capitol even if it is in the keep list @@ -128,116 +122,8 @@ } } - - public void processWord(char[] buffer, int offset, int length, int wordCount) { - if (length < 1) { - return; - } - if (onlyFirstWord && wordCount > 0) { - for (int i = 0; i < length; i++) { - buffer[offset + i] = Character.toLowerCase(buffer[offset + i]); - - } - return; - } - - if (keep != null && keep.contains(buffer, offset, length)) { - if (wordCount == 0 && forceFirstLetter) { - buffer[offset] = Character.toUpperCase(buffer[offset]); - } - return; - } - if (length < minWordLength) { - return; - } - for (char[] prefix : okPrefix) { - if (length >= prefix.length) { //don't bother checking if the buffer length is less than the prefix - boolean match = true; - for (int i = 0; i < prefix.length; i++) { - if (prefix[i] != buffer[offset + i]) { - match = false; - break; - } - } - if (match == true) { - return; - } - } - } - - // We know it has at least one character - /*char[] chars = w.toCharArray(); - StringBuilder word = new StringBuilder( w.length() ); - word.append( Character.toUpperCase( chars[0] ) );*/ - buffer[offset] = Character.toUpperCase(buffer[offset]); - - for (int i = 1; i < length; i++) { - buffer[offset + i] = Character.toLowerCase(buffer[offset + i]); - } - //return word.toString(); - } - public CapitalizationFilter create(TokenStream input) { - return new CapitalizationFilter(input, this); + return new CapitalizationFilter(input, onlyFirstWord, keep, + forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength); } } - - -/** - * This relies on the Factory so that the difficult stuff does not need to be - * re-initialized each time the filter runs. - *
<p/>
- * This is package protected since it is not useful without the Factory - */ -final class CapitalizationFilter extends TokenFilter { - private final CapitalizationFilterFactory factory; - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - - public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) { - super(in); - this.factory = factory; - } - - @Override - public boolean incrementToken() throws IOException { - if (!input.incrementToken()) return false; - - char[] termBuffer = termAtt.buffer(); - int termBufferLength = termAtt.length(); - char[] backup = null; - if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) { - //make a backup in case we exceed the word count - backup = new char[termBufferLength]; - System.arraycopy(termBuffer, 0, backup, 0, termBufferLength); - } - if (termBufferLength < factory.maxTokenLength) { - int wordCount = 0; - - int lastWordStart = 0; - for (int i = 0; i < termBufferLength; i++) { - char c = termBuffer[i]; - if (c <= ' ' || c == '.') { - int len = i - lastWordStart; - if (len > 0) { - factory.processWord(termBuffer, lastWordStart, len, wordCount++); - lastWordStart = i + 1; - i++; - } - } - } - - // process the last word - if (lastWordStart < termBufferLength) { - factory.processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++); - } - - if (wordCount > factory.maxWordCount) { - termAtt.copyBuffer(backup, 0, termBufferLength); - } - } - - return true; - } - -} - Index: solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java =================================================================== --- solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java (revision 957093) +++ solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java (working copy) @@ -1,108 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.solr.analysis; - -import java.io.IOException; -import java.util.LinkedList; - -import org.apache.commons.codec.language.DoubleMetaphone; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; - -public final class DoubleMetaphoneFilter extends TokenFilter { - - private static final String TOKEN_TYPE = "DoubleMetaphone"; - - private final LinkedList remainingTokens = new LinkedList(); - private final DoubleMetaphone encoder = new DoubleMetaphone(); - private final boolean inject; - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); - - protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) { - super(input); - this.encoder.setMaxCodeLen(maxCodeLength); - this.inject = inject; - } - - @Override - public boolean incrementToken() throws IOException { - for(;;) { - - if (!remainingTokens.isEmpty()) { - // clearAttributes(); // not currently necessary - restoreState(remainingTokens.removeFirst()); - return true; - } - - if (!input.incrementToken()) return false; - - int len = termAtt.length(); - if (len==0) return true; // pass through zero length terms - - int firstAlternativeIncrement = inject ? 0 : posAtt.getPositionIncrement(); - - String v = termAtt.toString(); - String primaryPhoneticValue = encoder.doubleMetaphone(v); - String alternatePhoneticValue = encoder.doubleMetaphone(v, true); - - // a flag to lazily save state if needed... this avoids a save/restore when only - // one token will be generated. - boolean saveState=inject; - - if (primaryPhoneticValue!=null && primaryPhoneticValue.length() > 0 && !primaryPhoneticValue.equals(v)) { - if (saveState) { - remainingTokens.addLast(captureState()); - } - posAtt.setPositionIncrement( firstAlternativeIncrement ); - firstAlternativeIncrement = 0; - termAtt.setEmpty().append(primaryPhoneticValue); - saveState = true; - } - - if (alternatePhoneticValue!=null && alternatePhoneticValue.length() > 0 - && !alternatePhoneticValue.equals(primaryPhoneticValue) - && !primaryPhoneticValue.equals(v)) { - if (saveState) { - remainingTokens.addLast(captureState()); - saveState = false; - } - posAtt.setPositionIncrement( firstAlternativeIncrement ); - termAtt.setEmpty().append(alternatePhoneticValue); - saveState = true; - } - - // Just one token to return, so no need to capture/restore - // any state, simply return it. 
- if (remainingTokens.isEmpty()) { - return true; - } - - if (saveState) { - remainingTokens.addLast(captureState()); - } - } - } - - @Override - public void reset() throws IOException { - input.reset(); - remainingTokens.clear(); - } -} Index: solr/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java (revision 957093) +++ solr/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java (working copy) @@ -29,6 +29,7 @@ import org.apache.commons.codec.language.RefinedSoundex; import org.apache.commons.codec.language.Soundex; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.phonetic.PhoneticFilter; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.StrUtils; @@ -96,6 +97,6 @@ } public PhoneticFilter create(TokenStream input) { - return new PhoneticFilter(input,encoder,name,inject); + return new PhoneticFilter(input,encoder,inject); } } Index: solr/src/java/org/apache/solr/analysis/PhoneticFilter.java =================================================================== --- solr/src/java/org/apache/solr/analysis/PhoneticFilter.java (revision 957093) +++ solr/src/java/org/apache/solr/analysis/PhoneticFilter.java (working copy) @@ -1,98 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.commons.codec.Encoder; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; - -import java.io.IOException; - -/** - * Create tokens for phonetic matches. 
See: - * http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html - * - * @version $Id$ - */ -public final class PhoneticFilter extends TokenFilter -{ - protected boolean inject = true; - protected Encoder encoder = null; - protected String name = null; - - protected State save = null; - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); - - public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) { - super(in); - this.encoder = encoder; - this.name = name; - this.inject = inject; - } - - @Override - public boolean incrementToken() throws IOException { - if( save != null ) { - // clearAttributes(); // not currently necessary - restoreState(save); - save = null; - return true; - } - - if (!input.incrementToken()) return false; - - // pass through zero-length terms - if (termAtt.length() == 0) return true; - - String value = termAtt.toString(); - String phonetic = null; - try { - String v = encoder.encode(value).toString(); - if (v.length() > 0 && !value.equals(v)) phonetic = v; - } catch (Exception ignored) {} // just use the direct text - - if (phonetic == null) return true; - - if (!inject) { - // just modify this token - termAtt.setEmpty().append(phonetic); - return true; - } - - // We need to return both the original and the phonetic tokens. - // to avoid a orig=captureState() change_to_phonetic() saved=captureState() restoreState(orig) - // we return the phonetic alternative first - - int origOffset = posAtt.getPositionIncrement(); - posAtt.setPositionIncrement(0); - save = captureState(); - - posAtt.setPositionIncrement(origOffset); - termAtt.setEmpty().append(phonetic); - return true; - } - - @Override - public void reset() throws IOException { - input.reset(); - save = null; - } -} Index: modules/analysis/NOTICE.txt =================================================================== --- modules/analysis/NOTICE.txt (revision 957093) +++ modules/analysis/NOTICE.txt (working copy) @@ -4,6 +4,10 @@ This product includes software developed by The Apache Software Foundation (http://www.apache.org/). +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Commons + The snowball stemmers in common/src/java/net/sf/snowball were developed by Martin Porter and Richard Boulton. Index: modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java (revision 0) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java (revision 0) @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.CharArraySet; + +import static org.apache.lucene.analysis.miscellaneous.CapitalizationFilter.*; + +/** Tests {@link CapitalizationFilter} */ +public class TestCapitalizationFilter extends BaseTokenStreamTestCase { + public void testCapitalization() throws Exception { + CharArraySet keep = new CharArraySet(TEST_VERSION_CURRENT, + Arrays.asList("and", "the", "it", "BIG"), false); + + assertCapitalizesTo("kiTTEN", new String[] { "Kitten" }, + true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + assertCapitalizesTo("and", new String[] { "And" }, + true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + assertCapitalizesTo("AnD", new String[] { "And" }, + true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + //first is not forced, but it's not a keep word, either + assertCapitalizesTo("AnD", new String[] { "And" }, + true, keep, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + assertCapitalizesTo("big", new String[] { "Big" }, + true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + assertCapitalizesTo("BIG", new String[] { "BIG" }, + true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + assertCapitalizesToKeyword("Hello thEre my Name is Ryan", "Hello there my name is ryan", + true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + // now each token + assertCapitalizesTo("Hello thEre my Name is Ryan", + new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" }, + false, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + // now only the long words + assertCapitalizesTo("Hello thEre my Name is Ryan", + new String[] { "Hello", "There", "my", "Name", "is", "Ryan" }, + false, keep, true, null, 3, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + // without prefix + assertCapitalizesTo("McKinley", + new String[] { "Mckinley" }, + true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + // Now try some prefixes + List okPrefix = new ArrayList(); + okPrefix.add("McK".toCharArray()); + + assertCapitalizesTo("McKinley", + new String[] { "McKinley" }, + true, keep, true, okPrefix, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + // now try some stuff with numbers + assertCapitalizesTo("1st 2nd third", + new String[] { "1st", "2nd", "Third" }, + false, keep, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + assertCapitalizesToKeyword("the The the", "The The the", + false, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, 
DEFAULT_MAX_TOKEN_LENGTH); + } + + static void assertCapitalizesTo(Tokenizer tokenizer, String expected[], + boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter, + Collection okPrefix, int minWordLength, int maxWordCount, + int maxTokenLength) throws IOException { + CapitalizationFilter filter = new CapitalizationFilter(tokenizer, onlyFirstWord, keep, + forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength); + assertTokenStreamContents(filter, expected); + } + + static void assertCapitalizesTo(String input, String expected[], + boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter, + Collection okPrefix, int minWordLength, int maxWordCount, + int maxTokenLength) throws IOException { + assertCapitalizesTo(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), + expected, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength, + maxWordCount, maxTokenLength); + } + + static void assertCapitalizesToKeyword(String input, String expected, + boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter, + Collection okPrefix, int minWordLength, int maxWordCount, + int maxTokenLength) throws IOException { + assertCapitalizesTo(new KeywordTokenizer(new StringReader(input)), + new String[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix, + minWordLength, maxWordCount, maxTokenLength); + } +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilter.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilter.java (revision 0) @@ -0,0 +1,181 @@ +package org.apache.lucene.analysis.miscellaneous; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.CharArraySet; + +/** + * A filter to apply normal capitalization rules to Tokens. It will make the first letter + * capital and the rest lower case. + *
<p/>
+ * This filter is particularly useful to build nice looking facet parameters. This filter
+ * is not appropriate if you intend to use a prefix query.
+ */
+public final class CapitalizationFilter extends TokenFilter {
+  public static final int DEFAULT_MAX_WORD_COUNT = Integer.MAX_VALUE;
+  public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;
+
+  private final boolean onlyFirstWord;
+  private final CharArraySet keep;
+  private final boolean forceFirstLetter;
+  private final Collection<char[]> okPrefix;
+
+  private final int minWordLength;
+  private final int maxWordCount;
+  private final int maxTokenLength;
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  /**
+   * Creates a CapitalizationFilter with the default parameters.
+   * <p/>
+   * Calls {@link #CapitalizationFilter(TokenStream, boolean, CharArraySet, boolean, Collection, int, int, int)
+   *   CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH)}
+   */
+  public CapitalizationFilter(TokenStream in) {
+    this(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
+  }
+
+  /**
+   * Creates a CapitalizationFilter with the specified parameters.
+   * @param in input tokenstream
+   * @param onlyFirstWord should each word be capitalized or all of the words?
+   * @param keep a keep word list. Each word that should be kept separated by whitespace.
+   * @param forceFirstLetter Force the first letter to be capitalized even if it is in the keep list.
+   * @param okPrefix do not change word capitalization if a word begins with something in this list.
+   * @param minWordLength how long the word needs to be to get capitalization applied. If the
+   *   minWordLength is 3, "and" > "And" but "or" stays "or".
+   * @param maxWordCount if the token contains more than maxWordCount words, the capitalization is
+   *   assumed to be correct.
+   * @param maxTokenLength ???
+   */
+  public CapitalizationFilter(TokenStream in, boolean onlyFirstWord, CharArraySet keep,
+      boolean forceFirstLetter, Collection<char[]> okPrefix, int minWordLength,
+      int maxWordCount, int maxTokenLength) {
+    super(in);
+    this.onlyFirstWord = onlyFirstWord;
+    this.keep = keep;
+    this.forceFirstLetter = forceFirstLetter;
+    this.okPrefix = okPrefix;
+    this.minWordLength = minWordLength;
+    this.maxWordCount = maxWordCount;
+    this.maxTokenLength = maxTokenLength;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) return false;
+
+    char[] termBuffer = termAtt.buffer();
+    int termBufferLength = termAtt.length();
+    char[] backup = null;
+
+    if (maxWordCount < DEFAULT_MAX_WORD_COUNT) {
+      // make a backup in case we exceed the word count
+      backup = new char[termBufferLength];
+      System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
+    }
+
+    if (termBufferLength < maxTokenLength) {
+      int wordCount = 0;
+
+      int lastWordStart = 0;
+      for (int i = 0; i < termBufferLength; i++) {
+        char c = termBuffer[i];
+        if (c <= ' ' || c == '.') {
+          int len = i - lastWordStart;
+          if (len > 0) {
+            processWord(termBuffer, lastWordStart, len, wordCount++);
+            lastWordStart = i + 1;
+            i++;
+          }
+        }
+      }
+
+      // process the last word
+      if (lastWordStart < termBufferLength) {
+        processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
+      }
+
+      if (wordCount > maxWordCount) {
+        termAtt.copyBuffer(backup, 0, termBufferLength);
+      }
+    }
+
+    return true;
+  }
+
+  private void processWord(char[] buffer, int offset, int length, int wordCount) {
+    if (length < 1) {
+      return;
+    }
+
+    if (onlyFirstWord && wordCount > 0) {
+      for (int i = 0; i < length; i++) {
+        buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
+      }
+      return;
+    }
+
+    if (keep != null && keep.contains(buffer, offset, length)) {
+      if (wordCount == 0 && forceFirstLetter) {
+        buffer[offset] = Character.toUpperCase(buffer[offset]);
+      }
+      return;
+    }
+
+    if (length < minWordLength) {
+      return;
+    }
+
+    if (okPrefix != null) {
+      for (char[] prefix : okPrefix) {
+        if (length >= prefix.length) { // don't bother checking if the buffer length is less than the prefix
+          boolean match = true;
+          for (int i = 0; i < prefix.length; i++) {
+            if (prefix[i] != buffer[offset + i]) {
+              match = false;
+              break;
+            }
+          }
+          if (match == true) {
+            return;
+          }
+        }
+      }
+    }
+
+    // We
know it has at least one character + /*char[] chars = w.toCharArray(); + StringBuilder word = new StringBuilder( w.length() ); + word.append( Character.toUpperCase( chars[0] ) );*/ + buffer[offset] = Character.toUpperCase(buffer[offset]); + + for (int i = 1; i < length; i++) { + buffer[offset + i] = Character.toLowerCase(buffer[offset + i]); + } + //return word.toString(); + } +} Index: modules/analysis/phonetic/pom.xml.template =================================================================== --- modules/analysis/phonetic/pom.xml.template (revision 0) +++ modules/analysis/phonetic/pom.xml.template (revision 0) @@ -0,0 +1,46 @@ + + + + 4.0.0 + + org.apache.lucene + lucene-contrib + @version@ + + org.apache.lucene + lucene-phonetic + + Lucene Phonetic Filters + + @version@ + + Provides phonetic encoding via Commons Codec. + + jar + + + org.apache.commons + codec + ${codec-version} + + + Index: modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java =================================================================== --- modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java (revision 0) +++ modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java (working copy) @@ -15,88 +15,59 @@ * limitations under the License. */ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.phonetic; import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; +import org.apache.commons.codec.Encoder; +import org.apache.commons.codec.language.Caverphone; +import org.apache.commons.codec.language.DoubleMetaphone; import org.apache.commons.codec.language.Metaphone; -import org.apache.lucene.analysis.TokenStream; +import org.apache.commons.codec.language.RefinedSoundex; +import org.apache.commons.codec.language.Soundex; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; - /** - * @version $Id$ + * Tests {@link PhoneticFilter} */ -public class TestPhoneticFilter extends BaseTokenTestCase { - - public void testFactory() - { - Map args = new HashMap(); - - PhoneticFilterFactory ff = new PhoneticFilterFactory(); - try { - ff.init( args ); - fail( "missing encoder parameter" ); - } - catch( Exception ex ) {} - args.put( PhoneticFilterFactory.ENCODER, "XXX" ); - try { - ff.init( args ); - fail( "unknown encoder parameter" ); - } - catch( Exception ex ) {} - - args.put( PhoneticFilterFactory.ENCODER, "Metaphone" ); - ff.init( args ); - assertTrue( ff.encoder instanceof Metaphone ); - assertTrue( ff.inject ); // default - - args.put( PhoneticFilterFactory.INJECT, "false" ); - ff.init( args ); - assertFalse( ff.inject ); - } - +public class TestPhoneticFilter extends BaseTokenStreamTestCase { + public void testAlgorithms() throws Exception { - assertAlgorithm("Metaphone", "true", "aaa bbb ccc easgasg", + assertAlgorithm(new Metaphone(), true, "aaa bbb ccc easgasg", new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" }); - assertAlgorithm("Metaphone", "false", "aaa bbb ccc easgasg", + assertAlgorithm(new Metaphone(), false, "aaa bbb ccc easgasg", new String[] { "A", "B", "KKK", "ESKS" }); - assertAlgorithm("DoubleMetaphone", "true", "aaa bbb ccc easgasg", + assertAlgorithm(new DoubleMetaphone(), true, "aaa bbb ccc easgasg", new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" }); - assertAlgorithm("DoubleMetaphone", "false", 
"aaa bbb ccc easgasg", + assertAlgorithm(new DoubleMetaphone(), false, "aaa bbb ccc easgasg", new String[] { "A", "PP", "KK", "ASKS" }); - assertAlgorithm("Soundex", "true", "aaa bbb ccc easgasg", + assertAlgorithm(new Soundex(), true, "aaa bbb ccc easgasg", new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" }); - assertAlgorithm("Soundex", "false", "aaa bbb ccc easgasg", + assertAlgorithm(new Soundex(), false, "aaa bbb ccc easgasg", new String[] { "A000", "B000", "C000", "E220" }); - assertAlgorithm("RefinedSoundex", "true", "aaa bbb ccc easgasg", + assertAlgorithm(new RefinedSoundex(), true, "aaa bbb ccc easgasg", new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" }); - assertAlgorithm("RefinedSoundex", "false", "aaa bbb ccc easgasg", + assertAlgorithm(new RefinedSoundex(), false, "aaa bbb ccc easgasg", new String[] { "A0", "B1", "C3", "E034034" }); - assertAlgorithm("Caverphone", "true", "Darda Karleen Datha Carlene", + assertAlgorithm(new Caverphone(), true, "Darda Karleen Datha Carlene", new String[] { "TTA1111111", "Darda", "KLN1111111", "Karleen", "TTA1111111", "Datha", "KLN1111111", "Carlene" }); - assertAlgorithm("Caverphone", "false", "Darda Karleen Datha Carlene", + assertAlgorithm(new Caverphone(), false, "Darda Karleen Datha Carlene", new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" }); } + - static void assertAlgorithm(String algName, String inject, String input, + static void assertAlgorithm(Encoder encoder, boolean inject, String input, String[] expected) throws Exception { - Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, + Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)); - Map args = new HashMap(); - args.put("encoder", algName); - args.put("inject", inject); - PhoneticFilterFactory factory = new PhoneticFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, expected); + PhoneticFilter filter = new PhoneticFilter(tokenizer, encoder, inject); + assertTokenStreamContents(filter, expected); } } Index: modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java =================================================================== --- modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java (revision 0) +++ modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java (working copy) @@ -14,52 +14,53 @@ * See the License for the specific language governing permissions and * limitations under the License. 
Index: modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java
===================================================================
--- modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java	(revision 0)
+++ modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java	(working copy)
@@ -14,52 +14,53 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.phonetic;
 
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 
-public class DoubleMetaphoneFilterTest extends BaseTokenTestCase {
+public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {
 
   public void testSize4FalseInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
     assertTokenStreamContents(filter, new String[] { "ANTR" });
   }
 
   public void testSize4TrueInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
     assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
   }
 
   public void testAlternateInjectFalse() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Kuczewski"));
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Kuczewski"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
     assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
   }
 
   public void testSize8FalseInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
     assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
   }
 
   public void testNonConvertableStringsWithInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
   }
 
   public void testNonConvertableStringsWithoutInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
     assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
 
     // should have something after the stream
-    stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%& hello"));
+    stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%& hello"));
     filter = new DoubleMetaphoneFilter(stream, 8, false);
     assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
   }
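These tests construct DoubleMetaphoneFilter directly, which is only possible because a later hunk widens its constructor from protected to public. The same wiring outside a test might look like this (illustrative demo class, not part of the patch; the expected tokens are taken from the testSize4TrueInject case above):

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class DoubleMetaphoneDemo {
      public static void main(String[] args) throws Exception {
        // maxCodeLength=4 caps each code; inject=true keeps the source token.
        TokenStream filter = new DoubleMetaphoneFilter(
            new WhitespaceTokenizer(Version.LUCENE_CURRENT,
                new StringReader("international")),
            4, true);
        CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
        filter.reset();
        while (filter.incrementToken()) {
          System.out.println(term.toString()); // international, then ANTR
        }
        filter.end();
        filter.close();
      }
    }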
Index: modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java
===================================================================
--- modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java	(revision 0)
+++ modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java	(working copy)
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.phonetic;
 
 import org.apache.commons.codec.Encoder;
 import org.apache.lucene.analysis.TokenFilter;
@@ -28,23 +28,19 @@
 /**
  * Create tokens for phonetic matches. See:
  * http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
- *
- * @version $Id$
  */
 public final class PhoneticFilter extends TokenFilter 
 {
   protected boolean inject = true;
   protected Encoder encoder = null;
-  protected String name = null;
   protected State save = null;
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
 
-  public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
+  public PhoneticFilter(TokenStream in, Encoder encoder, boolean inject) {
     super(in);
     this.encoder = encoder;
-    this.name = name;
     this.inject = inject;
   }
Index: modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java
===================================================================
--- modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java	(revision 0)
+++ modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java	(working copy)
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.phonetic;
 
 import java.io.IOException;
 import java.util.LinkedList;
@@ -35,7 +35,7 @@
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
 
-  protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
+  public DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
     super(input);
     this.encoder.setMaxCodeLen(maxCodeLength);
     this.inject = inject;
   }
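Neither hunk shows incrementToken(), but the retained fields (a saved State plus a PositionIncrementAttribute) imply the usual token-injection pattern: emit one form of the token, capture the other's state, then replay it with a position increment of 0 so both stack at the same position. A hedged, generic sketch of that pattern under those assumptions -- not the patch's actual implementation:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

    // Generic sketch: emits a variant of each token, then (if one was made)
    // replays the original at the same position on the next call.
    public abstract class StackedTokenFilterSketch extends TokenFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
      private State save; // deferred original token, if any

      protected StackedTokenFilterSketch(TokenStream input) {
        super(input);
      }

      /** Produce the variant term, or null to emit nothing extra. */
      protected abstract String variant(String term);

      @Override
      public final boolean incrementToken() throws IOException {
        if (save != null) {
          restoreState(save);             // replay the deferred original token
          save = null;
          posAtt.setPositionIncrement(0); // stack it on the variant's position
          return true;
        }
        if (!input.incrementToken()) return false;
        String v = variant(termAtt.toString());
        if (v != null) {
          save = captureState();          // remember the original for next call
          termAtt.setEmpty().append(v);   // overwrite the term with the variant
        }
        return true;
      }

      @Override
      public void reset() throws IOException {
        super.reset();
        save = null;
      }
    }

This ordering (variant first, original second) matches the inject=true expectations in TestPhoneticFilter above; DoubleMetaphoneFilter's tests show the opposite order, so its real implementation presumably defers the code rather than the original.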
Index: modules/analysis/phonetic/build.xml
===================================================================
--- modules/analysis/phonetic/build.xml	(revision 0)
+++ modules/analysis/phonetic/build.xml	(revision 0)
@@ -0,0 +1,63 @@
+ [new 63-line Ant build file; the XML markup did not survive extraction. The
+  recoverable text is the project description, "Provides phonetic encoding
+  support via Apache Commons Codec.", and the dependency message "phonetic
+  building dependency ${analyzers-common.jar}".]
Index: modules/analysis/README.txt
===================================================================
--- modules/analysis/README.txt	(revision 957093)
+++ modules/analysis/README.txt	(working copy)
@@ -20,7 +20,12 @@
 lucene-analyzers-icu-XX.jar
   An add-on analysis library that provides improved Unicode support via
   International Components for Unicode (ICU). Note: this module depends on
-  the ICU4j jar file (version > 4.4.0)
+  the ICU4j jar file (version >= 4.4.0)
+
+lucene-analyzers-phonetic-XX.jar
+  An add-on analysis library that provides phonetic encoders via Apache
+  Commons-Codec. Note: this module depends on the commons-codec jar
+  file (version >= 1.4)
 
 lucene-analyzers-smartcn-XX.jar
   An add-on analysis library that provides word segmentation for Simplified
@@ -32,12 +37,14 @@
 
 common/src/java
 icu/src/java
+phonetic/src/java
 smartcn/src/java
 stempel/src/java
-  The source code for the four libraries.
+  The source code for the five libraries.
 
 common/src/test
 icu/src/test
+phonetic/src/test
 smartcn/src/test
 stempel/src/test
-  Unit tests for the four libraries.
+  Unit tests for the five libraries.
Index: modules/analysis/build.xml
===================================================================
--- modules/analysis/build.xml	(revision 957093)
+++ modules/analysis/build.xml	(working copy)
@@ -35,6 +35,10 @@
@@ -44,29 +48,33 @@
@@ -76,6 +84,7 @@
@@ -83,6 +92,7 @@
@@ -90,6 +100,7 @@
 [hunks wiring the new phonetic module into the aggregate targets; the XML
  element bodies did not survive extraction]