Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (revision 1562630)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (working copy)
@@ -36,6 +36,7 @@
private final String tokenSeparator;
private final boolean outputUnigrams;
private final boolean outputUnigramsIfNoShingles;
+ private final String fillerToken;
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
@@ -46,7 +47,8 @@
}
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
- this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.TOKEN_SEPARATOR, true, false);
+ this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
+ true, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
}
/**
@@ -63,6 +65,7 @@
* minShingleSize tokens in the input stream)?
* Note that if outputUnigrams==true, then unigrams are always output,
* regardless of whether any shingles are available.
+ * @param fillerToken filler token to use when positionIncrement is more than 1
*/
public ShingleAnalyzerWrapper(
Analyzer delegate,
@@ -70,7 +73,8 @@
int maxShingleSize,
String tokenSeparator,
boolean outputUnigrams,
- boolean outputUnigramsIfNoShingles) {
+ boolean outputUnigramsIfNoShingles,
+ String fillerToken) {
super(delegate.getReuseStrategy());
this.delegate = delegate;
@@ -91,6 +95,7 @@
this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
this.outputUnigrams = outputUnigrams;
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+ this.fillerToken = fillerToken;
}
/**
@@ -137,6 +142,10 @@
return outputUnigramsIfNoShingles;
}
+ public String getFillerToken() {
+ return fillerToken;
+ }
+
@Override
public final Analyzer getWrappedAnalyzer(String fieldName) {
return delegate;
@@ -150,6 +159,7 @@
filter.setTokenSeparator(tokenSeparator);
filter.setOutputUnigrams(outputUnigrams);
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
+ filter.setFillerToken(fillerToken);
return new TokenStreamComponents(components.getTokenizer(), filter);
}
}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 1562630)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (working copy)
@@ -47,7 +47,7 @@
/**
* filler token for when positionIncrement is more than 1
*/
- public static final char[] FILLER_TOKEN = { '_' };
+ public static final String DEFAULT_FILLER_TOKEN = "_";
/**
* default maximum shingle size is 2.
@@ -67,7 +67,7 @@
/**
* The default string to use when joining adjacent tokens to form a shingle
*/
- public static final String TOKEN_SEPARATOR = " ";
+ public static final String DEFAULT_TOKEN_SEPARATOR = " ";
/**
* The sequence of input stream tokens (or filler tokens, if necessary)
@@ -95,9 +95,15 @@
/**
* The string to use when joining adjacent tokens to form a shingle
*/
- private String tokenSeparator = TOKEN_SEPARATOR;
+ private String tokenSeparator = DEFAULT_TOKEN_SEPARATOR;
/**
+ * The string to insert for each position at which there is no token
+ * (i.e., when position increment is greater than one).
+ */
+ private char[] fillerToken = DEFAULT_FILLER_TOKEN.toCharArray();
+
+ /**
* By default, we output unigrams (individual tokens) as well as shingles
* (token n-grams).
*/
@@ -284,6 +290,16 @@
this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
}
+ /**
+ * Sets the string to insert for each position at which there is no token
+ * (i.e., when position increment is greater than one).
+ *
+ * @param fillerToken string to insert at each position where there is no token
+ */
+ public void setFillerToken(String fillerToken) {
+ this.fillerToken = null == fillerToken ? new char[0] : fillerToken.toCharArray();
+ }
+
@Override
public boolean incrementToken() throws IOException {
boolean tokenAvailable = false;
@@ -341,7 +357,7 @@
/**
*
Get the next token from the input stream.
*
If the next token has positionIncrement > 1,
- * positionIncrement - 1 {@link #FILLER_TOKEN}s are
+ * positionIncrement - 1 {@link #fillerToken}s are
* inserted first.
* @param target Where to put the new token; if null, a new instance is created.
* @return On success, the populated token; null otherwise
@@ -359,7 +375,7 @@
// A filler token occupies no space
newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
newTarget.offsetAtt.startOffset());
- newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+ newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else if (isNextInputStreamToken) {
@@ -390,7 +406,7 @@
isNextInputStreamToken = true;
// A filler token occupies no space
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
- newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+ newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else {
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (revision 1562630)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (working copy)
@@ -29,7 +29,7 @@
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
* <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
- * outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
+ * outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" " fillerToken="_"/>
* </analyzer>
* </fieldType>
*/
@@ -39,6 +39,7 @@
private final boolean outputUnigrams;
private final boolean outputUnigramsIfNoShingles;
private final String tokenSeparator;
+ private final String fillerToken;
/** Creates a new ShingleFilterFactory */
public ShingleFilterFactory(Map args) {
@@ -57,7 +58,8 @@
}
outputUnigrams = getBoolean(args, "outputUnigrams", true);
outputUnigramsIfNoShingles = getBoolean(args, "outputUnigramsIfNoShingles", false);
- tokenSeparator = get(args, "tokenSeparator", ShingleFilter.TOKEN_SEPARATOR);
+ tokenSeparator = get(args, "tokenSeparator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
+ fillerToken = get(args, "fillerToken", ShingleFilter.DEFAULT_FILLER_TOKEN);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -69,6 +71,7 @@
r.setOutputUnigrams(outputUnigrams);
r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
r.setTokenSeparator(tokenSeparator);
+ r.setFillerToken(fillerToken);
return r;
}
}
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (revision 1562630)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (working copy)
@@ -21,9 +21,13 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
@@ -169,7 +173,8 @@
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
analyzer = new ShingleAnalyzerWrapper(
- new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false);
+ new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4,
+ ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this", "please divide this sentence",
"divide this sentence", "divide this sentence into",
@@ -195,7 +200,8 @@
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
analyzer = new ShingleAnalyzerWrapper(
- new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false);
+ new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3,
+ ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this",
"divide this sentence",
@@ -211,7 +217,8 @@
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- "", true, false);
+ "", true, false,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
@@ -225,7 +232,8 @@
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- "", false, false);
+ "", false, false,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
@@ -240,7 +248,8 @@
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- null, true, false);
+ null, true, false,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
@@ -254,7 +263,8 @@
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- "", false, false);
+ "", false, false,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
@@ -268,7 +278,8 @@
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- "", true, false);
+ "", true, false,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
@@ -282,7 +293,8 @@
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- "", false, false);
+ "", false, false,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
@@ -291,13 +303,65 @@
new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 });
}
-
+
+ public void testAltFillerToken() throws Exception {
+ Analyzer delegate = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "into");
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ TokenFilter filter = new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+
+ ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
+ delegate,
+ ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
+ true, false, "--");
+ assertAnalyzesTo(analyzer, "please divide into shingles",
+ new String[] { "please", "please divide",
+ "divide", "divide --",
+ "-- shingles", "shingles" },
+ new int[] { 0, 0, 7, 7, 19, 19 },
+ new int[] { 6, 13, 13, 19, 27, 27 },
+ new int[] { 1, 0, 1, 0, 1, 1 });
+
+ analyzer = new ShingleAnalyzerWrapper(
+ delegate,
+ ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
+ false, false, null);
+ assertAnalyzesTo(analyzer, "please divide into shingles",
+ new String[] { "please divide", "divide ", " shingles" },
+ new int[] { 0, 7, 19 },
+ new int[] { 13, 19, 27 },
+ new int[] { 1, 1, 1 });
+
+ analyzer = new ShingleAnalyzerWrapper(
+ delegate,
+ ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
+ false, false, "");
+ assertAnalyzesTo(analyzer, "please divide into shingles",
+ new String[] { "please divide", "divide ", " shingles" },
+ new int[] { 0, 7, 19 },
+ new int[] { 13, 19, 27 },
+ new int[] { 1, 1, 1 });
+ }
+
+
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- "", false, true);
+ "", false, true,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please",
new String[] { "please" },
new int[] { 0 },
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (revision 1562630)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (working copy)
@@ -1196,4 +1196,52 @@
new int[] {1, 0, 0, 1, 0, 0},
20);
}
+
+ public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException {
+ // Analyzing "purple wizard of the", where of and the are removed as a
+ // stopwords, leaving two trailing holes:
+ Token[] inputTokens = new Token[] {createToken("purple", 0, 6), createToken("wizard", 7, 13)};
+ ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+ filter.setFillerToken("--");
+
+ assertTokenStreamContents(filter,
+ new String[]{"purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --"},
+ new int[]{0, 0, 0, 7, 7, 7},
+ new int[]{6, 13, 20, 13, 20, 20},
+ new int[]{1, 0, 0, 1, 0, 0},
+ 20);
+
+ filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+ filter.setFillerToken("");
+
+ assertTokenStreamContents(filter,
+ new String[]{"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
+ new int[]{0, 0, 0, 7, 7, 7},
+ new int[]{6, 13, 20, 13, 20, 20},
+ new int[]{1, 0, 0, 1, 0, 0},
+ 20);
+
+
+ filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+ filter.setFillerToken(null);
+
+ assertTokenStreamContents(filter,
+ new String[] {"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
+ new int[] {0, 0, 0, 7, 7, 7},
+ new int[] {6, 13, 20, 13, 20, 20},
+ new int[] {1, 0, 0, 1, 0, 0},
+ 20);
+
+
+ filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+ filter.setFillerToken(null);
+ filter.setTokenSeparator(null);
+
+ assertTokenStreamContents(filter,
+ new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"},
+ new int[] {0, 0, 0, 7, 7, 7},
+ new int[] {6, 13, 20, 13, 20, 20},
+ new int[] {1, 0, 0, 1, 0, 0},
+ 20);
+ }
}