Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (revision 1124242) +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (revision ) @@ -41,36 +41,72 @@ private boolean outputUnigramsIfNoShingles = false; public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) { - super(); - this.defaultAnalyzer = defaultAnalyzer; + this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); } public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) { - this(defaultAnalyzer); - setMaxShingleSize(maxShingleSize); + this(defaultAnalyzer, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize); } public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) { - this(defaultAnalyzer); - setMaxShingleSize(maxShingleSize); - setMinShingleSize(minShingleSize); + this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.TOKEN_SEPARATOR, true, false); } /** + * Creates a new ShingleAnalyzerWrapper + * + * @param defaultAnalyzer Analyzer whose TokenStream is to be filtered + * @param minShingleSize Min shingle (token ngram) size + * @param maxShingleSize Max shingle size + * @param tokenSeparator Used to separate input stream tokens in output shingles + * @param outputUnigrams Whether or not the filter shall pass the original + * tokens to the output stream + * @param outputUnigramsIfNoShingles Overrides the behavior of outputUnigrams==false for those + * times when no shingles are available (because there are fewer than + * minShingleSize tokens in the input stream)? + * Note that if outputUnigrams==true, then unigrams are always output, + * regardless of whether any shingles are available. 
+ */ + public ShingleAnalyzerWrapper( + Analyzer defaultAnalyzer, + int minShingleSize, + int maxShingleSize, + String tokenSeparator, + boolean outputUnigrams, + boolean outputUnigramsIfNoShingles) { + this.defaultAnalyzer = defaultAnalyzer; + + if (maxShingleSize < 2) { + throw new IllegalArgumentException("Max shingle size must be >= 2"); + } + this.maxShingleSize = maxShingleSize; + + if (minShingleSize < 2) { + throw new IllegalArgumentException("Min shingle size must be >= 2"); + } + if (minShingleSize > maxShingleSize) { + throw new IllegalArgumentException + ("Min shingle size must be <= max shingle size"); + } + this.minShingleSize = minShingleSize; + + this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator); + this.outputUnigrams = outputUnigrams; + this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles; + } + + /** * Wraps {@link StandardAnalyzer}. */ public ShingleAnalyzerWrapper(Version matchVersion) { - super(); - this.defaultAnalyzer = new StandardAnalyzer(matchVersion); + this(matchVersion, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); } /** * Wraps {@link StandardAnalyzer}. */ public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize) { - this(matchVersion); - setMaxShingleSize(maxShingleSize); - setMinShingleSize(minShingleSize); + this(new StandardAnalyzer(matchVersion), minShingleSize, maxShingleSize); } /** @@ -86,7 +122,10 @@ * Set the maximum size of output shingles (default: 2) * * @param maxShingleSize max shingle size + * @deprecated Setting maxShingleSize after Analyzer instantiation prevents reuse. + * Configure maxShingleSize during construction. */ + @Deprecated public void setMaxShingleSize(int maxShingleSize) { if (maxShingleSize < 2) { throw new IllegalArgumentException("Max shingle size must be >= 2"); } @@ -110,7 +149,10 @@ * calling this method.
* * @param minShingleSize min size of output shingles + * @deprecated Setting minShingleSize after Analyzer instantiation prevents reuse. + * Configure minShingleSize during construction. */ + @Deprecated public void setMinShingleSize(int minShingleSize) { if (minShingleSize < 2) { throw new IllegalArgumentException("Min shingle size must be >= 2"); } @@ -129,7 +171,10 @@ /** * Sets the string to use when joining adjacent tokens to form a shingle * @param tokenSeparator used to separate input stream tokens in output shingles + * @deprecated Setting tokenSeparator after Analyzer instantiation prevents reuse. + * Configure tokenSeparator during construction. */ + @Deprecated public void setTokenSeparator(String tokenSeparator) { this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator); } @@ -144,7 +189,10 @@ * * @param outputUnigrams Whether or not the filter shall pass the original * tokens to the output stream + * @deprecated Setting outputUnigrams after Analyzer instantiation prevents reuse. + * Configure outputUnigrams during construction. */ + @Deprecated public void setOutputUnigrams(boolean outputUnigrams) { this.outputUnigrams = outputUnigrams; } @@ -162,7 +210,10 @@ * * @param outputUnigramsIfNoShingles Whether or not to output a single * unigram when no shingles are available. + * @deprecated Setting outputUnigramsIfNoShingles after Analyzer instantiation prevents reuse. + * Configure outputUnigramsIfNoShingles during construction.
*/ + @Deprecated public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) { this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles; } Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java =================================================================== --- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java (revision 917019) +++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java (revision ) @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper; +import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.util.Version; @@ -66,9 +67,14 @@ // otherwise use default ctor wrappedAnalyzer = clazz.newInstance(); } - ShingleAnalyzerWrapper analyzer - = new ShingleAnalyzerWrapper(wrappedAnalyzer, maxShingleSize); - analyzer.setOutputUnigrams(outputUnigrams); + + ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( + wrappedAnalyzer, + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + maxShingleSize, + ShingleFilter.TOKEN_SEPARATOR, + outputUnigrams, + false); getRunData().setAnalyzer(analyzer); } Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java =================================================================== --- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (revision 1124242) +++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (revision ) @@ -262,7 +262,9 @@ new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 }, new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 }); - 
analyzer.setOutputUnigrams(false); + + analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false); assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles", new String[] { "please divide this", "please divide this sentence", "divide this sentence", "divide this sentence into", @@ -286,7 +288,9 @@ new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 }, new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 }, new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 }); - analyzer.setOutputUnigrams(false); + + analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false); assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles", new String[] { "please divide this", "divide this sentence", @@ -298,9 +302,11 @@ } public void testNoTokenSeparator() throws Exception { - ShingleAnalyzerWrapper analyzer - = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)); - analyzer.setTokenSeparator(""); + ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + "", true, false); assertAnalyzesToReuse(analyzer, "please divide into shingles", new String[] { "please", "pleasedivide", "divide", "divideinto", @@ -309,7 +315,12 @@ new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 }); - analyzer.setOutputUnigrams(false); + + analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + "", false, false); assertAnalyzesToReuse(analyzer, "please divide into shingles", new String[] { "pleasedivide", "divideinto", @@ -320,9 +331,11 @@ } 
public void testNullTokenSeparator() throws Exception { - ShingleAnalyzerWrapper analyzer - = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)); - analyzer.setTokenSeparator(null); + ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + null, true, false); assertAnalyzesToReuse(analyzer, "please divide into shingles", new String[] { "please", "pleasedivide", "divide", "divideinto", @@ -331,7 +344,12 @@ new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 }); - analyzer.setOutputUnigrams(false); + + analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + "", false, false); assertAnalyzesToReuse(analyzer, "please divide into shingles", new String[] { "pleasedivide", "divideinto", @@ -341,9 +359,11 @@ new int[] { 1, 1, 1 }); } public void testAltTokenSeparator() throws Exception { - ShingleAnalyzerWrapper analyzer - = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)); - analyzer.setTokenSeparator(""); + ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + "", true, false); assertAnalyzesToReuse(analyzer, "please divide into shingles", new String[] { "please", "pleasedivide", "divide", "divideinto", @@ -352,7 +372,12 @@ new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 }); - analyzer.setOutputUnigrams(false); + + analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + 
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + "", false, false); assertAnalyzesToReuse(analyzer, "please divide into shingles", new String[] { "pleasedivide", "divideinto", @@ -363,10 +388,11 @@ } public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception { - ShingleAnalyzerWrapper analyzer - = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)); - analyzer.setOutputUnigrams(false); - analyzer.setOutputUnigramsIfNoShingles(true); + ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + "", false, true); assertAnalyzesToReuse(analyzer, "please", new String[] { "please" }, new int[] { 0 }, Index: lucene/src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java =================================================================== --- lucene/src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java (revision 940806) +++ lucene/src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java (revision ) @@ -1,6 +1,8 @@ package org.apache.lucene.analysis; import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -24,9 +26,12 @@ public class TestPerFieldAnalzyerWrapper extends BaseTokenStreamTestCase { public void testPerField() throws Exception { String text = "Qwerty"; + + Map analyzerPerField = new HashMap(); + analyzerPerField.put("special", new SimpleAnalyzer(TEST_VERSION_CURRENT)); + PerFieldAnalyzerWrapper analyzer = - new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT)); - analyzer.addAnalyzer("special", new SimpleAnalyzer(TEST_VERSION_CURRENT)); + new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), analyzerPerField); TokenStream tokenStream = 
analyzer.tokenStream("field", new StringReader(text)); Index: lucene/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java (revision 940806) +++ lucene/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java (revision ) @@ -26,16 +26,19 @@ /** * This analyzer is used to facilitate scenarios where different - * fields require different analysis techniques. Use {@link #addAnalyzer} - * to add a non-default analyzer on a field name basis. + * fields require different analysis techniques. Use the Map + * argument in {@link #PerFieldAnalyzerWrapper(Analyzer, java.util.Map)} + * to add non-default analyzers for fields. * *

Example usage: * *

+ *   Map analyzerPerField = new HashMap();
+ *   analyzerPerField.put("firstname", new KeywordAnalyzer());
+ *   analyzerPerField.put("lastname", new KeywordAnalyzer());
+ *
  *   PerFieldAnalyzerWrapper aWrapper =
- *      new PerFieldAnalyzerWrapper(new StandardAnalyzer());
- *   aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
- *   aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
+ *      new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerPerField);
  * 
* *

In this example, StandardAnalyzer will be used for all fields except "firstname" @@ -45,8 +48,8 @@ * and query parsing. */ public final class PerFieldAnalyzerWrapper extends Analyzer { - private Analyzer defaultAnalyzer; - private Map analyzerMap = new HashMap(); + private final Analyzer defaultAnalyzer; + private final Map analyzerMap = new HashMap(); /** @@ -82,7 +85,10 @@ * * @param fieldName field name requiring a non-default analyzer * @param analyzer non-default analyzer to use for field + * @deprecated Changing the Analyzer for a field after instantiation prevents + * reusability. Analyzers for fields should be set during construction. */ + @Deprecated public void addAnalyzer(String fieldName, Analyzer analyzer) { analyzerMap.put(fieldName, analyzer); }