Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (revision 1562603)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (working copy)
@@ -36,6 +36,7 @@
   private final String tokenSeparator;
   private final boolean outputUnigrams;
   private final boolean outputUnigramsIfNoShingles;
+  private final String fillerToken;
 
   public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
     this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
@@ -46,7 +47,8 @@
   }
 
   public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
-    this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.TOKEN_SEPARATOR, true, false);
+    this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
+         true, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
   }
 
   /**
@@ -63,6 +65,7 @@
    *        minShingleSize tokens in the input stream)?
    *        Note that if outputUnigrams==true, then unigrams are always output,
    *        regardless of whether any shingles are available.
+   * @param fillerToken filler token to use when positionIncrement is more than 1
    */
   public ShingleAnalyzerWrapper(
       Analyzer delegate,
@@ -70,7 +73,8 @@
       int maxShingleSize,
       String tokenSeparator,
       boolean outputUnigrams,
-      boolean outputUnigramsIfNoShingles) {
+      boolean outputUnigramsIfNoShingles,
+      String fillerToken) {
     super(delegate.getReuseStrategy());
     this.delegate = delegate;
 
@@ -91,6 +95,7 @@
     this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
     this.outputUnigrams = outputUnigrams;
     this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+    this.fillerToken = fillerToken;
   }
 
   /**
@@ -137,6 +142,10 @@
     return outputUnigramsIfNoShingles;
   }
 
+  public String getFillerToken() {
+    return fillerToken;
+  }
+
   @Override
   public final Analyzer getWrappedAnalyzer(String fieldName) {
     return delegate;
@@ -150,6 +159,7 @@
     filter.setTokenSeparator(tokenSeparator);
     filter.setOutputUnigrams(outputUnigrams);
     filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
+    filter.setFillerToken(fillerToken);
     return new TokenStreamComponents(components.getTokenizer(), filter);
   }
 }
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 1562603)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (working copy)
@@ -47,7 +47,7 @@
   /**
    * filler token for when positionIncrement is more than 1
    */
-  public static final char[] FILLER_TOKEN = { '_' };
+  public static final String DEFAULT_FILLER_TOKEN = "_";
 
   /**
    * default maximum shingle size is 2.
@@ -67,7 +67,7 @@
   /**
    * The default string to use when joining adjacent tokens to form a shingle
    */
-  public static final String TOKEN_SEPARATOR = " ";
+  public static final String DEFAULT_TOKEN_SEPARATOR = " ";
 
   /**
    * The sequence of input stream tokens (or filler tokens, if necessary)
@@ -95,9 +95,15 @@
   /**
    * The string to use when joining adjacent tokens to form a shingle
    */
-  private String tokenSeparator = TOKEN_SEPARATOR;
+  private String tokenSeparator = DEFAULT_TOKEN_SEPARATOR;
 
   /**
+   * The string to insert for each position at which there is no token
+   * (i.e., when position increment is greater than one).
+   */
+  private char[] fillerToken = DEFAULT_FILLER_TOKEN.toCharArray();
+
+  /**
    * By default, we output unigrams (individual tokens) as well as shingles
    * (token n-grams).
    */
@@ -284,6 +290,16 @@
     this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
   }
 
+  /**
+   * Sets the string to insert for each position at which there is no token
+   * (i.e., when position increment is greater than one).
+   *
+   * @param fillerToken string to insert at each position where there is no token
+   */
+  public void setFillerToken(String fillerToken) {
+    this.fillerToken = null == fillerToken ? new char[0] : fillerToken.toCharArray();
+  }
+
   @Override
   public boolean incrementToken() throws IOException {
     boolean tokenAvailable = false;
@@ -341,7 +357,7 @@
   /**
    * Get the next token from the input stream.
    * If the next token has positionIncrement > 1,
-   * positionIncrement - 1 {@link #FILLER_TOKEN}s are
+   * positionIncrement - 1 {@link #fillerToken}s are
    * inserted first.
    * @param target Where to put the new token; if null, a new instance is created.
    * @return On success, the populated token; null otherwise
@@ -359,7 +375,7 @@
           // A filler token occupies no space
           newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
                                         newTarget.offsetAtt.startOffset());
-          newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+          newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
           newTarget.isFiller = true;
           --numFillerTokensToInsert;
         } else if (isNextInputStreamToken) {
@@ -390,7 +406,7 @@
           isNextInputStreamToken = true;
           // A filler token occupies no space
           newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
-          newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+          newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
           newTarget.isFiller = true;
           --numFillerTokensToInsert;
         } else {
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (revision 1562603)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (working copy)
@@ -29,7 +29,7 @@
  *   <analyzer>
  *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
  *     <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
- *             outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
+ *             outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" " fillerToken="_"/>
  *   </analyzer>
  * </fieldType>
 */
@@ -39,6 +39,7 @@
   private final boolean outputUnigrams;
   private final boolean outputUnigramsIfNoShingles;
   private final String tokenSeparator;
+  private final String fillerToken;
 
   /** Creates a new ShingleFilterFactory */
   public ShingleFilterFactory(Map<String,String> args) {
@@ -57,7 +58,8 @@
     }
     outputUnigrams = getBoolean(args, "outputUnigrams", true);
     outputUnigramsIfNoShingles = getBoolean(args, "outputUnigramsIfNoShingles", false);
-    tokenSeparator = get(args, "tokenSeparator", ShingleFilter.TOKEN_SEPARATOR);
+    tokenSeparator = get(args, "tokenSeparator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
+    fillerToken = get(args, "fillerToken", ShingleFilter.DEFAULT_FILLER_TOKEN);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -69,6 +71,7 @@
     r.setOutputUnigrams(outputUnigrams);
     r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
     r.setTokenSeparator(tokenSeparator);
+    r.setFillerToken(fillerToken);
     return r;
   }
 }
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (revision 1562603)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (working copy)
@@ -169,7 +169,8 @@
                      new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
 
     analyzer = new ShingleAnalyzerWrapper(
-        new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false);
+        new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4,
+        ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
         new String[] { "please divide this", "please divide this sentence",
                        "divide this sentence", "divide this sentence into",
@@ -195,7 +196,8 @@
                      new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
 
     analyzer = new ShingleAnalyzerWrapper(
-        new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false);
+        new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3,
+        ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
         new String[] { "please divide this", "divide this sentence",
@@ -211,7 +213,8 @@
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        "", true, false);
+        "", true, false,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
         new String[] { "please", "pleasedivide", "divide", "divideinto",
@@ -225,7 +228,8 @@
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        "", false, false);
+        "", false, false,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
         new String[] { "pleasedivide", "divideinto",
@@ -240,7 +244,8 @@
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        null, true, false);
+        null, true, false,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
         new String[] { "please", "pleasedivide", "divide", "divideinto",
@@ -254,7 +259,8 @@
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        "", false, false);
+        "", false, false,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
         new String[] { "pleasedivide", "divideinto",
@@ -268,7 +274,8 @@
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        "", true, false);
+        "", true, false,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
         new String[] { "please", "pleasedivide", "divide", "divideinto",
@@ -282,7 +289,8 @@
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        "", false, false);
+        "", false, false,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
         new String[] { "pleasedivide", "divideinto",
@@ -297,7 +305,8 @@
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        "", false, true);
+        "", false, true,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please",
         new String[] { "please" },
         new int[] { 0 },
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (revision 1562603)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (working copy)
@@ -1196,4 +1196,52 @@
                           new int[] {1, 0, 0, 1, 0, 0},
                           20);
   }
+
+  public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException {
+    // Analyzing "purple wizard of the", where of and the are removed as
+    // stopwords, leaving two trailing holes:
+    Token[] inputTokens = new Token[] {createToken("purple", 0, 6), createToken("wizard", 7, 13)};
+    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+    filter.setFillerToken("--");
+
+    assertTokenStreamContents(filter,
+        new String[]{"purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --"},
+        new int[]{0, 0, 0, 7, 7, 7},
+        new int[]{6, 13, 20, 13, 20, 20},
+        new int[]{1, 0, 0, 1, 0, 0},
+        20);
+
+    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+    filter.setFillerToken("");
+
+    assertTokenStreamContents(filter,
+        new String[]{"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
+        new int[]{0, 0, 0, 7, 7, 7},
+        new int[]{6, 13, 20, 13, 20, 20},
+        new int[]{1, 0, 0, 1, 0, 0},
+        20);
+
+
+    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+    filter.setFillerToken(null);
+
+    assertTokenStreamContents(filter,
+        new String[] {"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
+        new int[] {0, 0, 0, 7, 7, 7},
+        new int[] {6, 13, 20, 13, 20, 20},
+        new int[] {1, 0, 0, 1, 0, 0},
+        20);
+
+
+    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+    filter.setFillerToken(null);
+    filter.setTokenSeparator(null);
+
+    assertTokenStreamContents(filter,
+        new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"},
+        new int[] {0, 0, 0, 7, 7, 7},
+        new int[] {6, 13, 20, 13, 20, 20},
+        new int[] {1, 0, 0, 1, 0, 0},
+        20);
+  }
 }
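
Usage sketch (not part of the patch): a minimal example of how the new fillerToken option is intended to be used once the change above is applied. It relies only on the constructor and setter introduced by this patch; StopAnalyzer, the field name "body", and the Version constant are illustrative assumptions to be adjusted to the actual build.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class FillerTokenDemo {
  public static void main(String[] args) throws IOException {
    // Wrap a stopword-removing analyzer; holes left by removed stopwords are
    // rendered inside shingles as the configured filler token ("--" here)
    // instead of the previously hard-wired "_".
    Analyzer shingles = new ShingleAnalyzerWrapper(
        new StopAnalyzer(Version.LUCENE_CURRENT),   // removes "of", "the", ...
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
        true,                                        // outputUnigrams
        false,                                       // outputUnigramsIfNoShingles
        "--");                                       // fillerToken (new in this patch)

    TokenStream ts = shingles.tokenStream("body", new StringReader("purple wizard of the west"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term);                      // e.g. "purple wizard", "wizard --", ...
    }
    ts.end();
    ts.close();
  }
}

The same behavior is reachable declaratively through ShingleFilterFactory via the new fillerToken attribute shown in its updated javadoc, or programmatically via ShingleFilter.setFillerToken(String); passing null or "" drops the filler entirely, as exercised by testTwoTrailingHolesTriShingleWithTokenFiller above.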