Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 1562434) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (working copy) @@ -98,6 +98,12 @@ private String tokenSeparator = TOKEN_SEPARATOR; /** + * The string to insert for each position at which there is no token + * (i.e., when position increment is greater than one). + */ + private char[] fillerToken = FILLER_TOKEN; + + /** * By default, we output unigrams (individual tokens) as well as shingles * (token n-grams). */ @@ -284,6 +290,16 @@ this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator; } + /** + * Sets the string to insert for each position at which there is no token + * (i.e., when position increment is greater than one). + * + * @param fillerToken string to insert at each position where there is no token; {@code null} is treated as an empty filler + */ + public void setTokenFiller(String fillerToken) { + this.fillerToken = null == fillerToken ? new char[0] : fillerToken.toCharArray(); + } + @Override public boolean incrementToken() throws IOException { boolean tokenAvailable = false; @@ -341,7 +357,7 @@ /** *

Get the next token from the input stream. *

If the next token has positionIncrement > 1, - * positionIncrement - 1 {@link #FILLER_TOKEN}s are + * positionIncrement - 1 {@link #fillerToken}s are * inserted first. * @param target Where to put the new token; if null, a new instance is created. * @return On success, the populated token; null otherwise @@ -359,7 +375,7 @@ // A filler token occupies no space newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset()); - newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length); + newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length); newTarget.isFiller = true; --numFillerTokensToInsert; } else if (isNextInputStreamToken) { @@ -390,7 +406,7 @@ isNextInputStreamToken = true; // A filler token occupies no space newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset()); - newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length); + newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length); newTarget.isFiller = true; --numFillerTokensToInsert; } else { Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (revision 1562434) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (working copy) @@ -29,7 +29,7 @@ * <analyzer> * <tokenizer class="solr.WhitespaceTokenizerFactory"/> * <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2" - * outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/> + * outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" " fillerToken="_"/> * </analyzer> * </fieldType> */ @@ -39,6 +39,7 @@ private final boolean outputUnigrams; private final boolean outputUnigramsIfNoShingles; private final String tokenSeparator; + private final 
String fillerToken; /** Creates a new ShingleFilterFactory */ public ShingleFilterFactory(Map args) { @@ -58,6 +59,7 @@ outputUnigrams = getBoolean(args, "outputUnigrams", true); outputUnigramsIfNoShingles = getBoolean(args, "outputUnigramsIfNoShingles", false); tokenSeparator = get(args, "tokenSeparator", ShingleFilter.TOKEN_SEPARATOR); + fillerToken = get(args, "fillerToken", new String(ShingleFilter.FILLER_TOKEN)); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -69,6 +71,7 @@ r.setOutputUnigrams(outputUnigrams); r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles); r.setTokenSeparator(tokenSeparator); + r.setTokenFiller(fillerToken); return r; } } Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (revision 1562434) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (working copy) @@ -1196,4 +1196,52 @@ new int[] {1, 0, 0, 1, 0, 0}, 20); } + + public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException { + // Analyzing "purple wizard of the", where "of" and "the" are removed as + // stopwords, leaving two trailing holes: + Token[] inputTokens = new Token[] {createToken("purple", 0, 6), createToken("wizard", 7, 13)}; + ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3); + filter.setTokenFiller("--"); + + assertTokenStreamContents(filter, + new String[]{"purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --"}, + new int[]{0, 0, 0, 7, 7, 7}, + new int[]{6, 13, 20, 13, 20, 20}, + new int[]{1, 0, 0, 1, 0, 0}, + 20); + + filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3); + filter.setTokenFiller(""); + + assertTokenStreamContents(filter, + new 
String[]{"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "}, + new int[]{0, 0, 0, 7, 7, 7}, + new int[]{6, 13, 20, 13, 20, 20}, + new int[]{1, 0, 0, 1, 0, 0}, + 20); + + + filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3); + filter.setTokenFiller(null); + + assertTokenStreamContents(filter, + new String[] {"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "}, + new int[] {0, 0, 0, 7, 7, 7}, + new int[] {6, 13, 20, 13, 20, 20}, + new int[] {1, 0, 0, 1, 0, 0}, + 20); + + + filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3); + filter.setTokenFiller(null); + filter.setTokenSeparator(null); + + assertTokenStreamContents(filter, + new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"}, + new int[] {0, 0, 0, 7, 7, 7}, + new int[] {6, 13, 20, 13, 20, 20}, + new int[] {1, 0, 0, 1, 0, 0}, + 20); + } }