Index: modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.java (revision 1162347)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.java (revision )
@@ -23,21 +23,25 @@
import java.io.Reader;
import java.io.IOException;
+import java.util.Collections;
import java.util.Map;
import java.util.HashMap;

/**
* This analyzer is used to facilitate scenarios where different
- * fields require different analysis techniques. Use {@link #addAnalyzer}
- * to add a non-default analyzer on a field name basis.
+ * fields require different analysis techniques. Use the Map
+ * argument in {@link #PerFieldAnalyzerWrapper(Analyzer, java.util.Map)}
+ * to add non-default analyzers for fields.
*
* <p>Example usage:
*
* <pre>
+ * Map&lt;String,Analyzer&gt; analyzerPerField = new HashMap&lt;String,Analyzer&gt;();
+ * analyzerPerField.put("firstname", new KeywordAnalyzer());
+ * analyzerPerField.put("lastname", new KeywordAnalyzer());
+ *
* PerFieldAnalyzerWrapper aWrapper =
- * new PerFieldAnalyzerWrapper(new StandardAnalyzer());
- * aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
- * aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
+ * new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerPerField);
* </pre>
* <p>
* In this example, StandardAnalyzer will be used for all fields except "firstname"
@@ -47,10 +51,9 @@
* and query parsing.
*/
public final class PerFieldAnalyzerWrapper extends Analyzer {
- private Analyzer defaultAnalyzer;
- private Map
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
===================================================================
- /**
- * Set the min shingle size (default: 2).
- * This method requires that the passed in minShingleSize is not greater
- * than maxShingleSize, so make sure that maxShingleSize is set before
- * calling this method.
- *
- * @param minShingleSize min size of output shingles
- */
- public void setMinShingleSize(int minShingleSize) {
- if (minShingleSize < 2) {
- throw new IllegalArgumentException("Min shingle size must be >= 2");
- }
- if (minShingleSize > maxShingleSize) {
- throw new IllegalArgumentException
- ("Min shingle size must be <= max shingle size");
- }
- this.minShingleSize = minShingleSize;
- }
-
public String getTokenSeparator() {
return tokenSeparator;
}
-
+
- /**
- * Sets the string to use when joining adjacent tokens to form a shingle
- * @param tokenSeparator used to separate input stream tokens in output shingles
- */
- public void setTokenSeparator(String tokenSeparator) {
- this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
- }
-
public boolean isOutputUnigrams() {
return outputUnigrams;
}
-
+
- /**
- * Shall the filter pass the original tokens (the "unigrams") to the output
- * stream?
- *
- * @param outputUnigrams Whether or not the filter shall pass the original
- * tokens to the output stream
- */
- public void setOutputUnigrams(boolean outputUnigrams) {
- this.outputUnigrams = outputUnigrams;
- }
-
public boolean isOutputUnigramsIfNoShingles() {
return outputUnigramsIfNoShingles;
}
-
+
- /**
- * Shall we override the behavior of outputUnigrams==false for those
- * times when no shingles are available (because there are fewer than
- * minShingleSize tokens in the input stream)? (default: false.)
- * Note that if outputUnigrams==true, then unigrams are always output,
- * regardless of whether any shingles are available.
- *
- * @param outputUnigramsIfNoShingles Whether or not to output a single
- * unigram when no shingles are available.
- */
- public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
- this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
- }
-
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream wrapped;
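Migration note: the setters removed above are replaced by constructor arguments. The following is a minimal sketch of how a caller of those setters might be rewritten, assuming the six-argument constructor (wrapped analyzer, min shingle size, max shingle size, token separator, outputUnigrams, outputUnigramsIfNoShingles) that the benchmark and test changes below rely on; the class name, delegate, and option values here are purely illustrative.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;

    // Illustrative sketch, not part of the patch: how a caller of the removed
    // setters might look after this change.
    public class ShingleWrapperMigrationSketch {
      // Before: new ShingleAnalyzerWrapper(delegate, 4); then
      //   setMinShingleSize(3); setTokenSeparator("_"); setOutputUnigrams(false);
      // After: every option is fixed at construction time.
      public static Analyzer build(Analyzer delegate) {
        return new ShingleAnalyzerWrapper(
            delegate, // analyzer whose token stream gets shingled
            3,        // min shingle size (>= 2 and <= max, as the removed setter enforced)
            4,        // max shingle size
            "_",      // separator placed between tokens inside a shingle
            false,    // outputUnigrams
            false);   // outputUnigramsIfNoShingles
      }
    }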
Index: modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java
===================================================================
--- modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java (revision 1087468)
+++ modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java (revision )
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.benchmark.byTask.PerfRunData;
/**
@@ -64,9 +65,14 @@
}
wrappedAnalyzer = NewAnalyzerTask.createAnalyzer(analyzerClassName);
}
- ShingleAnalyzerWrapper analyzer
- = new ShingleAnalyzerWrapper(wrappedAnalyzer, maxShingleSize);
- analyzer.setOutputUnigrams(outputUnigrams);
+
+ ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
+ wrappedAnalyzer,
+ ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+ maxShingleSize,
+ ShingleFilter.TOKEN_SEPARATOR,
+ outputUnigrams,
+ false);
getRunData().setAnalyzer(analyzer);
}
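The replacement above is meant to keep the task's behavior unchanged: the old two-argument constructor implied the minimum shingle size and token separator, and setOutputUnigrams() was the only option the task mutated. As a rough equivalence check, assuming ShingleFilter's documented defaults (DEFAULT_MIN_SHINGLE_SIZE of 2, TOKEN_SEPARATOR of a single space), the new call reduces to the helper below; the helper name is illustrative.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
    import org.apache.lucene.analysis.shingle.ShingleFilter;

    // Illustrative sketch, not part of the patch.
    public class NewShingleAnalyzerEquivalence {
      // Pre-patch the task did: new ShingleAnalyzerWrapper(wrapped, maxShingleSize)
      // followed by setOutputUnigrams(outputUnigrams). Post-patch the same
      // configuration is expressed in one constructor call.
      public static Analyzer shingleAnalyzer(Analyzer wrapped, int maxShingleSize, boolean outputUnigrams) {
        return new ShingleAnalyzerWrapper(
            wrapped,
            ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, // 2, matching the old implicit minimum
            maxShingleSize,
            ShingleFilter.TOKEN_SEPARATOR,          // " ", matching the old implicit separator
            outputUnigrams,
            false);                                 // outputUnigramsIfNoShingles, left at its default
      }
    }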
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (revision 1169607)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (revision )
@@ -17,7 +17,6 @@
* limitations under the License.
*/
-import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
@@ -162,7 +161,9 @@
new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 },
new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 },
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
- analyzer.setOutputUnigrams(false);
+
+ analyzer = new ShingleAnalyzerWrapper(
+ new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this", "please divide this sentence",
"divide this sentence", "divide this sentence into",
@@ -186,7 +187,9 @@
new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 },
new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
- analyzer.setOutputUnigrams(false);
+
+ analyzer = new ShingleAnalyzerWrapper(
+ new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this",
"divide this sentence",
@@ -198,9 +201,11 @@
}
public void testNoTokenSeparator() throws Exception {
- ShingleAnalyzerWrapper analyzer
- = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
- analyzer.setTokenSeparator("");
+ ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
+ new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
+ ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
+ "", true, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
@@ -209,7 +214,12 @@
new int[] { 0, 0, 7, 7, 14, 14, 19 },
new int[] { 6, 13, 13, 18, 18, 27, 27 },
new int[] { 1, 0, 1, 0, 1, 0, 1 });
- analyzer.setOutputUnigrams(false);
+
+ analyzer = new ShingleAnalyzerWrapper(
+ new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
+ ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
+ "", false, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
@@ -220,9 +230,11 @@
}
public void testNullTokenSeparator() throws Exception {
- ShingleAnalyzerWrapper analyzer
- = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
- analyzer.setTokenSeparator(null);
+ ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
+ new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
+ ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
+ null, true, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
@@ -231,7 +243,12 @@
new int[] { 0, 0, 7, 7, 14, 14, 19 },
new int[] { 6, 13, 13, 18, 18, 27, 27 },
new int[] { 1, 0, 1, 0, 1, 0, 1 });
- analyzer.setOutputUnigrams(false);
+
+ analyzer = new ShingleAnalyzerWrapper(
+ new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
+ ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
+ "", false, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
@@ -241,9 +258,11 @@
new int[] { 1, 1, 1 });
}
public void testAltTokenSeparator() throws Exception {
- ShingleAnalyzerWrapper analyzer
- = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
- analyzer.setTokenSeparator("