Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 1523288) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (working copy) @@ -147,6 +147,12 @@ * true if no shingles have been output yet (for outputUnigramsIfNoShingles). */ boolean noShingleOutput = true; + + /** + * Holds the State after input.end() was called, so we can + * restore it in our end() impl. + */ + private State endState; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); @@ -279,7 +285,7 @@ } @Override - public final boolean incrementToken() throws IOException { + public boolean incrementToken() throws IOException { boolean tokenAvailable = false; int builtGramSize = 0; if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) { @@ -364,39 +370,63 @@ } isNextInputStreamToken = false; newTarget.isFiller = false; - } else if (!exhausted && input.incrementToken()) { - if (null == target) { - newTarget = new InputWindowToken(cloneAttributes()); + } else if (!exhausted) { + if (input.incrementToken()) { + if (null == target) { + newTarget = new InputWindowToken(cloneAttributes()); + } else { + this.copyTo(target.attSource); + } + if (posIncrAtt.getPositionIncrement() > 1) { + // Each output shingle must contain at least one input token, + // so no more than (maxShingleSize - 1) filler tokens will be inserted. 
+ numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1); + // Save the current token as the next input stream token + if (null == nextInputStreamToken) { + nextInputStreamToken = cloneAttributes(); + } else { + this.copyTo(nextInputStreamToken); + } + isNextInputStreamToken = true; + // A filler token occupies no space + newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset()); + newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length); + newTarget.isFiller = true; + --numFillerTokensToInsert; + } else { + newTarget.isFiller = false; + } } else { - this.copyTo(target.attSource); - } - if (posIncrAtt.getPositionIncrement() > 1) { - // Each output shingle must contain at least one input token, - // so no more than (maxShingleSize - 1) filler tokens will be inserted. - numFillerTokensToInsert - = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1); - // Save the current token as the next input stream token - if (null == nextInputStreamToken) { - nextInputStreamToken = cloneAttributes(); + exhausted = true; + input.end(); + endState = captureState(); + numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1); + if (numFillerTokensToInsert > 0) { + nextInputStreamToken = new AttributeSource(getAttributeFactory()); + nextInputStreamToken.addAttribute(CharTermAttribute.class); + OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class); + newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset()); + // Recurse/loop just once: + return getNextToken(target); } else { - this.copyTo(nextInputStreamToken); + newTarget = null; } - isNextInputStreamToken = true; - // A filler token occupies no space - newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset()); - newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length); - newTarget.isFiller = true; - --numFillerTokensToInsert; - } else { - 
newTarget.isFiller = false; } } else { newTarget = null; - exhausted = true; } return newTarget; } + @Override + public void end() throws IOException { + if (!exhausted) { + super.end(); + } else { + restoreState(endState); + } + } + /** *

Fills {@link #inputWindow} with input stream tokens, if available, * shifting to the right if the window was previously full. @@ -445,6 +475,7 @@ isOutputHere = false; noShingleOutput = true; exhausted = false; + endState = null; if (outputUnigramsIfNoShingles && ! outputUnigrams) { // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles gramSize.minValue = minShingleSize; Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (revision 1523288) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (working copy) @@ -34,6 +34,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase { + // TODO: merge w/ CannedTokenStream? public class TestTokenStream extends TokenStream { protected int index = 0; @@ -43,14 +44,21 @@ private OffsetAttribute offsetAtt; private PositionIncrementAttribute posIncrAtt; private TypeAttribute typeAtt; + private int finalPosInc; + private int finalOffset; public TestTokenStream(Token[] testToken) { - super(); + this(testToken, 0, 0); + } + + public TestTokenStream(Token[] testToken, int finalPosInc, int finalOffset) { this.testToken = testToken; this.termAtt = addAttribute(CharTermAttribute.class); this.offsetAtt = addAttribute(OffsetAttribute.class); this.posIncrAtt = addAttribute(PositionIncrementAttribute.class); this.typeAtt = addAttribute(TypeAttribute.class); + this.finalPosInc = finalPosInc; + this.finalOffset = finalOffset; } @Override @@ -67,6 +75,15 @@ return false; } } + + @Override + public void end() throws IOException { + super.end(); + if (finalPosInc != 0) { + posIncrAtt.setPositionIncrement(finalPosInc); + offsetAtt.setOffset(finalOffset, finalOffset); + } + } } public static final Token[] TEST_TOKEN = new Token[] { @@ -1170,4 +1187,63 @@ }; 
checkOneTermReuse(a, "", ""); } + + public void testTrailingHole1() throws IOException { + // Analyzing "wizard of", where of is removed as a + // stopword leaving a trailing hole: + Token[] inputTokens = new Token[] {createToken("wizard", 0, 6)}; + ShingleFilter filter = new ShingleFilter(new TestTokenStream(inputTokens, 1, 9), 2, 2); + + assertTokenStreamContents(filter, + new String[] {"wizard", "wizard _"}, + new int[] {0, 0}, + new int[] {6, 9}, + new int[] {1, 0}, + 9); + } + + public void testTrailingHole2() throws IOException { + // Analyzing "purple wizard of", where of is removed as a + // stopword leaving a trailing hole: + Token[] inputTokens = new Token[] {createToken("purple", 0, 6), + createToken("wizard", 7, 13)}; + ShingleFilter filter = new ShingleFilter(new TestTokenStream(inputTokens, 1, 16), 2, 2); + + assertTokenStreamContents(filter, + new String[] {"purple", "purple wizard", "wizard", "wizard _"}, + new int[] {0, 0, 7, 7}, + new int[] {6, 13, 13, 16}, + new int[] {1, 0, 1, 0}, + 16); + } + + public void testTwoTrailingHoles() throws IOException { + // Analyzing "purple wizard of the", where of and the are removed as + // stopwords, leaving two trailing holes: + Token[] inputTokens = new Token[] {createToken("purple", 0, 6), + createToken("wizard", 7, 13)}; + ShingleFilter filter = new ShingleFilter(new TestTokenStream(inputTokens, 2, 20), 2, 2); + + assertTokenStreamContents(filter, + new String[] {"purple", "purple wizard", "wizard", "wizard _"}, + new int[] {0, 0, 7, 7}, + new int[] {6, 13, 13, 20}, + new int[] {1, 0, 1, 0}, + 20); + } + + public void testTwoTrailingHolesTriShingle() throws IOException { + // Analyzing "purple wizard of the", where of and the are removed as + // stopwords, leaving two trailing holes: + Token[] inputTokens = new Token[] {createToken("purple", 0, 6), + createToken("wizard", 7, 13)}; + ShingleFilter filter = new ShingleFilter(new TestTokenStream(inputTokens, 2, 20), 2, 3); + + 
assertTokenStreamContents(filter, + new String[] {"purple", "purple wizard", "purple wizard _", "wizard", "wizard _", "wizard _ _"}, + new int[] {0, 0, 0, 7, 7, 7}, + new int[] {6, 13, 20, 13, 20, 20}, + new int[] {1, 0, 0, 1, 0, 0}, + 20); + } }