Adds a configurable filler token to ShingleFilter and ShingleFilterFactory.
The filler token is the term emitted in place of each missing position
(position increment greater than 1); without this patch it is always
ShingleFilter.FILLER_TOKEN. The factory reads the "fillerToken" argument
exactly once, falling back to the default when it is absent.

Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java	(revision 1561206)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java	(revision )
@@ -29,7 +29,7 @@
  *   <analyzer>
  *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
  *     <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
- *             outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
+ *             outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" " fillerToken="_"/>
  *   </analyzer>
  * </fieldType>
  */
@@ -39,6 +39,7 @@
   private final boolean outputUnigrams;
   private final boolean outputUnigramsIfNoShingles;
   private final String tokenSeparator;
+  private final char[] fillerToken;
 
   /** Creates a new ShingleFilterFactory */
   public ShingleFilterFactory(Map args) {
@@ -58,6 +59,7 @@
     outputUnigrams = getBoolean(args, "outputUnigrams", true);
     outputUnigramsIfNoShingles = getBoolean(args, "outputUnigramsIfNoShingles", false);
     tokenSeparator = get(args, "tokenSeparator", ShingleFilter.TOKEN_SEPARATOR);
+    fillerToken = get(args, "fillerToken", new String(ShingleFilter.FILLER_TOKEN)).toCharArray();
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -69,6 +71,7 @@
     r.setOutputUnigrams(outputUnigrams);
     r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
     r.setTokenSeparator(tokenSeparator);
+    r.setTokenFiller(fillerToken);
     return r;
   }
 }
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java	(revision 1561206)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java	(revision )
@@ -97,6 +97,8 @@
    */
   private String tokenSeparator = TOKEN_SEPARATOR;
 
+  private char[] fillerToken = FILLER_TOKEN;
+
   /**
    * By default, we output unigrams (individual tokens) as well as shingles
    * (token n-grams).
@@ -284,6 +286,18 @@
     this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
   }
 
+  /**
+   * Sets the token that is inserted for each position at which there is no
+   * actual token in the stream (i.e. a position increment greater than 1).
+   * @param fillerToken characters of the filler token; if null, the current
+   *        filler token is kept
+   */
+  public void setTokenFiller(char[] fillerToken) {
+    if (fillerToken != null) {
+      this.fillerToken = fillerToken;
+    }
+  }
+
   @Override
   public boolean incrementToken() throws IOException {
     boolean tokenAvailable = false;
@@ -341,7 +355,7 @@
   /**
    * Get the next token from the input stream.
    * If the next token has positionIncrement > 1,
-   * positionIncrement - 1 {@link #FILLER_TOKEN}s are
+   * positionIncrement - 1 {@link #fillerToken}s are
    * inserted first.
    * @param target Where to put the new token; if null, a new instance is created.
    * @return On success, the populated token; null otherwise
@@ -359,7 +373,7 @@
       // A filler token occupies no space
       newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
                                     newTarget.offsetAtt.startOffset());
-      newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+      newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
       newTarget.isFiller = true;
       --numFillerTokensToInsert;
     } else if (isNextInputStreamToken) {
@@ -390,7 +404,7 @@
             isNextInputStreamToken = true;
             // A filler token occupies no space
             newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
-            newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+            newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
             newTarget.isFiller = true;
             --numFillerTokensToInsert;
           } else {
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java	(revision 1561206)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java	(revision )
@@ -1196,4 +1196,20 @@
                               new int[] {1, 0, 0, 1, 0, 0},
                               20);
   }
+
+  public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException {
+    // Analyzing "purple wizard of the", where "of" and "the" are removed as
+    // stopwords, leaving two trailing holes:
+    Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
+                                       createToken("wizard", 7, 13)};
+    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+    filter.setTokenFiller("--".toCharArray());
+
+    assertTokenStreamContents(filter,
+                              new String[] {"purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --"},
+                              new int[] {0, 0, 0, 7, 7, 7},
+                              new int[] {6, 13, 20, 13, 20, 20},
+                              new int[] {1, 0, 0, 1, 0, 0},
+                              20);
+  }
 }