Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 1562434)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (working copy)
@@ -98,6 +98,12 @@
   private String tokenSeparator = TOKEN_SEPARATOR;
 
   /**
+   * The string to insert for each position at which there is no token
+   * (i.e., when position increment is greater than one); defaults to {@link #FILLER_TOKEN}.
+   */
+  private char[] fillerToken = FILLER_TOKEN;
+
+  /**
    * By default, we output unigrams (individual tokens) as well as shingles
    * (token n-grams).
    */
@@ -284,6 +290,16 @@
     this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
   }
 
+  /**
+   * Sets the string to insert for each position at which there is no token
+   * (i.e., when position increment is greater than one).
+   *
+   * @param fillerToken string to insert at each position where there is no token; {@code null} is treated as the empty string
+   */
+  public void setTokenFiller(String fillerToken) {
+    this.fillerToken = null == fillerToken ? new char[0] : fillerToken.toCharArray();
+  }
+
   @Override
   public boolean incrementToken() throws IOException {
     boolean tokenAvailable = false;
@@ -341,7 +357,7 @@
   /**
    *
Get the next token from the input stream. *
If the next token has positionIncrement > 1,
- * positionIncrement - 1 {@link #FILLER_TOKEN}s are
+ * positionIncrement - 1 {@link #fillerToken}s are
* inserted first.
* @param target Where to put the new token; if null, a new instance is created.
* @return On success, the populated token; null otherwise
@@ -359,7 +375,7 @@
// A filler token occupies no space
newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
newTarget.offsetAtt.startOffset());
- newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+ newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else if (isNextInputStreamToken) {
@@ -390,7 +406,7 @@
isNextInputStreamToken = true;
// A filler token occupies no space
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
- newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+ newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else {
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (revision 1562434)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (working copy)
@@ -29,7 +29,7 @@
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
* <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
- * outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
+ * outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" " fillerToken="_"/>
* </analyzer>
* </fieldType>
*/
@@ -39,6 +39,7 @@
private final boolean outputUnigrams;
private final boolean outputUnigramsIfNoShingles;
private final String tokenSeparator;
+ private final String fillerToken;
/** Creates a new ShingleFilterFactory */
public ShingleFilterFactory(Map