Index: solr/src/test/org/apache/solr/analysis/TestTrimFilterFactory.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestTrimFilterFactory.java (revision 940912) +++ solr/src/test/org/apache/solr/analysis/TestTrimFilterFactory.java (working copy) @@ -18,10 +18,12 @@ package org.apache.solr.analysis; import java.io.IOException; +import java.io.StringReader; import java.util.Collection; import java.util.HashMap; import java.util.Map; +import org.apache.lucene.analysis.KeywordTokenizer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; @@ -32,85 +34,15 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** - * @version $Id:$ + * Simple tests to ensure this factory is working */ -public class TestTrimFilter extends BaseTokenTestCase { - - public void testTrim() throws Exception { - char[] a = " a ".toCharArray(); - char[] b = "b ".toCharArray(); - char[] ccc = "cCc".toCharArray(); - char[] whitespace = " ".toCharArray(); - char[] empty = "".toCharArray(); +public class TestTrimFilterFactory extends BaseTokenTestCase { + public void testTrimming() throws Exception { TrimFilterFactory factory = new TrimFilterFactory(); Map args = new HashMap(); args.put("updateOffsets", "false"); factory.init(args); - TokenStream ts = factory.create(new IterTokenStream(new Token(a, 0, a.length, 1, 5), - new Token(b, 0, b.length, 6, 10), - new Token(ccc, 0, ccc.length, 11, 15), - new Token(whitespace, 0, whitespace.length, 16, 20), - new Token(empty, 0, empty.length, 21, 21))); - - assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""}); - - a = " a".toCharArray(); - b = "b ".toCharArray(); - ccc = " c ".toCharArray(); - whitespace = " ".toCharArray(); - factory = new TrimFilterFactory(); - args = new HashMap(); - args.put("updateOffsets", "true"); - factory.init(args); - ts = factory.create(new IterTokenStream( - new Token(a, 0, a.length, 0, 2), - new Token(b, 0, b.length, 0, 2), - new Token(ccc, 0, ccc.length, 0, 3), - new Token(whitespace, 0, whitespace.length, 0, 3))); - - assertTokenStreamContents(ts, - new String[] { "a", "b", "c", "" }, - new int[] { 1, 0, 1, 3 }, - new int[] { 2, 1, 2, 3 }, - new int[] { 1, 1, 1, 1 }); + TokenStream ts = factory.create(new KeywordTokenizer(new StringReader("trim me "))); + assertTokenStreamContents(ts, new String[] { "trim me" }); } - - /** - * @deprecated does not support custom attributes - */ - private static class IterTokenStream extends TokenStream { - final Token tokens[]; - int index = 0; - CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class); - TypeAttribute typeAtt = addAttribute(TypeAttribute.class); - PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class); - - public IterTokenStream(Token... tokens) { - super(); - this.tokens = tokens; - } - - public IterTokenStream(Collection tokens) { - this(tokens.toArray(new Token[tokens.size()])); - } - - public boolean incrementToken() throws IOException { - if (index >= tokens.length) - return false; - else { - clearAttributes(); - Token token = tokens[index++]; - termAtt.setEmpty().append(token.term()); - offsetAtt.setOffset(token.startOffset(), token.endOffset()); - posIncAtt.setPositionIncrement(token.getPositionIncrement()); - flagsAtt.setFlags(token.getFlags()); - typeAtt.setType(token.type()); - payloadAtt.setPayload(token.getPayload()); - return true; - } - } - } } Index: solr/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java (deleted) =================================================================== Index: solr/src/test/org/apache/solr/analysis/TestTrimFilter.java (deleted) =================================================================== Index: solr/src/test/org/apache/solr/analysis/TestKeepWordFilter.java (deleted) =================================================================== Index: solr/src/java/org/apache/solr/analysis/TrimFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/TrimFilterFactory.java (revision 940912) +++ solr/src/java/org/apache/solr/analysis/TrimFilterFactory.java (working copy) @@ -20,6 +20,7 @@ import java.util.Map; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.TrimFilter; import org.apache.solr.common.SolrException; /** Index: solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java (deleted) =================================================================== Index: solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java (revision 940912) +++ solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java (working copy) @@ -18,6 +18,7 @@ */ import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter; import org.apache.solr.analysis.BaseTokenFilterFactory; /** Index: solr/src/java/org/apache/solr/analysis/KeepWordFilter.java (deleted) =================================================================== Index: solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java (revision 940912) +++ solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java (working copy) @@ -21,6 +21,7 @@ import org.apache.solr.util.plugin.ResourceLoaderAware; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.miscellaneous.KeepWordFilter; import java.util.Set; import java.io.IOException; Index: solr/src/java/org/apache/solr/analysis/TrimFilter.java (deleted) =================================================================== Index: lucene/contrib/CHANGES.txt =================================================================== --- lucene/contrib/CHANGES.txt (revision 940912) +++ lucene/contrib/CHANGES.txt (working copy) @@ -57,6 +57,12 @@ into subwords and performs optional transformations on subword groups. - o.a.l.analysis.miscellaneous.RemoveDuplicatesTokenFilter: TokenFilter which filters out Tokens at the same position and Term text as the previous token. + - o.a.l.analysis.miscellaneous.TrimFilter: Trims leading and trailing whitespace + from Tokens in the stream. + - o.a.l.analysis.miscellaneous.KeepWordFilter: A TokenFilter that only keeps tokens + with text contained in the required words (inverse of StopFilter). + - o.a.l.analysis.miscellaneous.HyphenatedWordsFilter: A TokenFilter that puts + hyphenated words broken into two lines back together. - o.a.l.analysis.pattern: Package for pattern-based analysis, containing a CharFilter, Tokenizer, and Tokenfilter for transforming text with regexes. (... in progress) Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java =================================================================== --- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java (revision 940912) +++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java (working copy) @@ -15,23 +15,23 @@ * limitations under the License. */ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.miscellaneous; import java.io.StringReader; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceTokenizer; /** * HyphenatedWordsFilter test */ -public class TestHyphenatedWordsFilter extends BaseTokenTestCase { +public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase { public void testHyphenatedWords() throws Exception { String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal"; // first test - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); - HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory(); - ts = factory.create(ts); + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)); + ts = new HyphenatedWordsFilter(ts); assertTokenStreamContents(ts, new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" }); } @@ -42,9 +42,8 @@ public void testHyphenAtEnd() throws Exception { String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-"; // first test - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); - HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory(); - ts = factory.create(ts); + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)); + ts = new HyphenatedWordsFilter(ts); assertTokenStreamContents(ts, new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" }); } Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java =================================================================== --- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java (revision 940912) +++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java (working copy) @@ -15,13 +15,12 @@ * limitations under the License. */ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.miscellaneous; import java.io.IOException; import java.util.Collection; -import java.util.HashMap; -import java.util.Map; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; @@ -34,7 +33,7 @@ /** * @version $Id:$ */ -public class TestTrimFilter extends BaseTokenTestCase { +public class TestTrimFilter extends BaseTokenStreamTestCase { public void testTrim() throws Exception { char[] a = " a ".toCharArray(); @@ -42,15 +41,13 @@ char[] ccc = "cCc".toCharArray(); char[] whitespace = " ".toCharArray(); char[] empty = "".toCharArray(); - TrimFilterFactory factory = new TrimFilterFactory(); - Map args = new HashMap(); - args.put("updateOffsets", "false"); - factory.init(args); - TokenStream ts = factory.create(new IterTokenStream(new Token(a, 0, a.length, 1, 5), + + TokenStream ts = new IterTokenStream(new Token(a, 0, a.length, 1, 5), new Token(b, 0, b.length, 6, 10), new Token(ccc, 0, ccc.length, 11, 15), new Token(whitespace, 0, whitespace.length, 16, 20), - new Token(empty, 0, empty.length, 21, 21))); + new Token(empty, 0, empty.length, 21, 21)); + ts = new TrimFilter(ts, false); assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""}); @@ -58,15 +55,12 @@ b = "b ".toCharArray(); ccc = " c ".toCharArray(); whitespace = " ".toCharArray(); - factory = new TrimFilterFactory(); - args = new HashMap(); - args.put("updateOffsets", "true"); - factory.init(args); - ts = factory.create(new IterTokenStream( + ts = new IterTokenStream( new Token(a, 0, a.length, 0, 2), new Token(b, 0, b.length, 0, 2), new Token(ccc, 0, ccc.length, 0, 3), - new Token(whitespace, 0, whitespace.length, 0, 3))); + new Token(whitespace, 0, whitespace.length, 0, 3)); + ts = new TrimFilter(ts, true); assertTokenStreamContents(ts, new String[] { "a", "b", "c", "" }, Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java =================================================================== --- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java (revision 940912) +++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java (working copy) @@ -15,24 +15,18 @@ * limitations under the License. */ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.miscellaneous; import java.io.StringReader; -import java.util.HashMap; import java.util.HashSet; -import java.util.Map; import java.util.Set; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceTokenizer; -import org.apache.solr.common.ResourceLoader; -import org.apache.solr.core.SolrResourceLoader; - -/** - * @version $Id$ - */ -public class TestKeepWordFilter extends BaseTokenTestCase { +/** Test {@link KeepWordFilter} */ +public class TestKeepWordFilter extends BaseTokenStreamTestCase { public void testStopAndGo() throws Exception { @@ -41,39 +35,15 @@ words.add( "bbb" ); String input = "aaa BBB ccc ddd EEE"; - Map args = new HashMap(DEFAULT_VERSION_PARAM); - ResourceLoader loader = new SolrResourceLoader(null, null); // Test Stopwords - KeepWordFilterFactory factory = new KeepWordFilterFactory(); - args.put( "ignoreCase", "true" ); - factory.init( args ); - factory.inform( loader ); - factory.setWords( words ); - assertTrue(factory.isIgnoreCase()); - TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input))); + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)); + stream = new KeepWordFilter(stream, words, true); assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }); - - // Test Stopwords (ignoreCase via the setter instead) - factory = new KeepWordFilterFactory(); - args = new HashMap(DEFAULT_VERSION_PARAM); - factory.init( args ); - factory.inform( loader ); - factory.setIgnoreCase(true); - factory.setWords( words ); - assertTrue(factory.isIgnoreCase()); - stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input))); - assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }); - + // Now force case - factory = new KeepWordFilterFactory(); - args = new HashMap(DEFAULT_VERSION_PARAM); - args.put( "ignoreCase", "false" ); - factory.init( args ); - factory.inform( loader ); - factory.setWords( words ); - assertFalse(factory.isIgnoreCase()); - stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input))); + stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)); + stream = new KeepWordFilter(stream, words, false); assertTokenStreamContents(stream, new String[] { "aaa" }); } } Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java (revision 940912) +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java (working copy) @@ -1,4 +1,4 @@ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.miscellaneous; /** * Licensed to the Apache Software Foundation (ASF) under one or more Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java (revision 940912) +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java (working copy) @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.miscellaneous; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -26,8 +26,6 @@ /** * Trims leading and trailing whitespace from Tokens in the stream. - * - * @version $Id:$ */ public final class TrimFilter extends TokenFilter { Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java (revision 940912) +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java (working copy) @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.miscellaneous; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -29,14 +29,13 @@ * A TokenFilter that only keeps tokens with text contained in the * required words. This filter behaves like the inverse of StopFilter. * - * @version $Id$ * @since solr 1.3 */ public final class KeepWordFilter extends TokenFilter { private final CharArraySet words; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - /** @deprecated Use {@link #KeepWordFilter(TokenStream, Set, boolean)} instead */ + /** @deprecated Use {@link #KeepWordFilter(TokenStream, CharArraySet)} instead */ @Deprecated public KeepWordFilter(TokenStream in, Set words, boolean ignoreCase ) { this(in, new CharArraySet(words, ignoreCase));