Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (revision 1562603) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (working copy) @@ -36,6 +36,7 @@ private final String tokenSeparator; private final boolean outputUnigrams; private final boolean outputUnigramsIfNoShingles; + private final String fillerToken; public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) { this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); @@ -46,7 +47,8 @@ } public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) { - this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.TOKEN_SEPARATOR, true, false); + this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, + true, false, ShingleFilter.DEFAULT_FILLER_TOKEN); } /** @@ -63,6 +65,7 @@ * minShingleSize tokens in the input stream)? * Note that if outputUnigrams==true, then unigrams are always output, * regardless of whether any shingles are available. + * @param fillerToken filler token to use when positionIncrement is more than 1 */ public ShingleAnalyzerWrapper( Analyzer delegate, @@ -70,7 +73,8 @@ int maxShingleSize, String tokenSeparator, boolean outputUnigrams, - boolean outputUnigramsIfNoShingles) { + boolean outputUnigramsIfNoShingles, + String fillerToken) { super(delegate.getReuseStrategy()); this.delegate = delegate; @@ -91,6 +95,7 @@ this.tokenSeparator = (tokenSeparator == null ? 
"" : tokenSeparator); this.outputUnigrams = outputUnigrams; this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles; + this.fillerToken = fillerToken; } /** @@ -137,6 +142,10 @@ return outputUnigramsIfNoShingles; } + public String getFillerToken() { + return fillerToken; + } + @Override public final Analyzer getWrappedAnalyzer(String fieldName) { return delegate; @@ -150,6 +159,7 @@ filter.setTokenSeparator(tokenSeparator); filter.setOutputUnigrams(outputUnigrams); filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles); + filter.setFillerToken(fillerToken); return new TokenStreamComponents(components.getTokenizer(), filter); } } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 1562603) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (working copy) @@ -47,7 +47,7 @@ /** * filler token for when positionIncrement is more than 1 */ - public static final char[] FILLER_TOKEN = { '_' }; + public static final String DEFAULT_FILLER_TOKEN = "_"; /** * default maximum shingle size is 2. @@ -67,7 +67,7 @@ /** * The default string to use when joining adjacent tokens to form a shingle */ - public static final String TOKEN_SEPARATOR = " "; + public static final String DEFAULT_TOKEN_SEPARATOR = " "; /** * The sequence of input stream tokens (or filler tokens, if necessary) @@ -95,9 +95,15 @@ /** * The string to use when joining adjacent tokens to form a shingle */ - private String tokenSeparator = TOKEN_SEPARATOR; + private String tokenSeparator = DEFAULT_TOKEN_SEPARATOR; /** + * The string to insert for each position at which there is no token + * (i.e., when position increment is greater than one). 
+ */ + private char[] fillerToken = DEFAULT_FILLER_TOKEN.toCharArray(); + + /** * By default, we output unigrams (individual tokens) as well as shingles * (token n-grams). */ @@ -284,6 +290,16 @@ this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator; } + /** + * Sets the string to insert for each position at which there is no token + * (i.e., when position increment is greater than one). + * + * @param fillerToken string to insert at each position where there is no token + */ + public void setFillerToken(String fillerToken) { + this.fillerToken = null == fillerToken ? new char[0] : fillerToken.toCharArray(); + } + @Override public boolean incrementToken() throws IOException { boolean tokenAvailable = false; @@ -341,7 +357,7 @@ /** *
<p>Get the next token from the input stream. *
<p>If the next token has positionIncrement > 1,
- * positionIncrement - 1 {@link #FILLER_TOKEN}s are
+ * positionIncrement - 1 {@link #fillerToken}s are
* inserted first.
* @param target Where to put the new token; if null, a new instance is created.
* @return On success, the populated token; null otherwise
@@ -359,7 +375,7 @@
// A filler token occupies no space
newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
newTarget.offsetAtt.startOffset());
- newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+ newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else if (isNextInputStreamToken) {
@@ -390,7 +406,7 @@
isNextInputStreamToken = true;
// A filler token occupies no space
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
- newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+ newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else {
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (revision 1562603)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (working copy)
@@ -29,7 +29,7 @@
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
- * outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/&gt;
+ * outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" " fillerToken="_"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
*/
@@ -39,6 +39,7 @@
private final boolean outputUnigrams;
private final boolean outputUnigramsIfNoShingles;
private final String tokenSeparator;
+ private final String fillerToken;
/** Creates a new ShingleFilterFactory */
public ShingleFilterFactory(Map