Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java	(revision 1523288)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java	(working copy)
@@ -24,6 +24,7 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
@@ -34,41 +35,6 @@
 
 public class ShingleFilterTest extends BaseTokenStreamTestCase {
 
-  public class TestTokenStream extends TokenStream {
-
-    protected int index = 0;
-    protected Token[] testToken;
-
-    private CharTermAttribute termAtt;
-    private OffsetAttribute offsetAtt;
-    private PositionIncrementAttribute posIncrAtt;
-    private TypeAttribute typeAtt;
-
-    public TestTokenStream(Token[] testToken) {
-      super();
-      this.testToken = testToken;
-      this.termAtt = addAttribute(CharTermAttribute.class);
-      this.offsetAtt = addAttribute(OffsetAttribute.class);
-      this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-      this.typeAtt = addAttribute(TypeAttribute.class);
-    }
-
-    @Override
-    public final boolean incrementToken() {
-      clearAttributes();
-      if (index < testToken.length) {
-        Token t = testToken[index++];
-        termAtt.copyBuffer(t.buffer(), 0, t.length());
-        offsetAtt.setOffset(t.startOffset(), t.endOffset());
-        posIncrAtt.setPositionIncrement(t.getPositionIncrement());
-        typeAtt.setType(TypeAttribute.DEFAULT_TYPE);
-        return true;
-      } else {
-        return false;
-      }
-    }
-  }
-
   public static final Token[] TEST_TOKEN = new Token[] {
     createToken("please", 0, 6),
     createToken("divide", 7, 13),
@@ -1066,7 +1032,7 @@
                                    boolean outputUnigrams)
     throws IOException {
 
-    ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(tokensToShingle), maxSize);
     filter.setOutputUnigrams(outputUnigrams);
     shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
   }
@@ -1076,7 +1042,7 @@
                                    String[] types, boolean outputUnigrams)
     throws IOException {
     ShingleFilter filter
-      = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+      = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
     filter.setOutputUnigrams(outputUnigrams);
     shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
   }
@@ -1087,7 +1053,7 @@
                                    boolean outputUnigramsIfNoShingles)
     throws IOException {
     ShingleFilter filter
-      = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+      = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
     filter.setOutputUnigrams(outputUnigrams);
     filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
     shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
   }
@@ -1098,7 +1064,7 @@
                                    String[] types, boolean outputUnigrams)
     throws IOException {
     ShingleFilter filter
-      = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+      = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
     filter.setTokenSeparator(tokenSeparator);
     filter.setOutputUnigrams(outputUnigrams);
     shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
   }
@@ -1170,4 +1136,63 @@
     };
     checkOneTermReuse(a, "", "");
   }
+
+  public void testTrailingHole1() throws IOException {
+    // Analyzing "wizard of", where of is removed as a
+    // stopword, leaving a trailing hole:
+    Token[] inputTokens = new Token[] {createToken("wizard", 0, 6)};
+    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 9, inputTokens), 2, 2);
+
+    assertTokenStreamContents(filter,
+                              new String[] {"wizard", "wizard _"},
+                              new int[] {0, 0},
+                              new int[] {6, 9},
+                              new int[] {1, 0},
+                              9);
+  }
+
+  public void testTrailingHole2() throws IOException {
+    // Analyzing "purple wizard of", where of is removed as a
+    // stopword, leaving a trailing hole:
+    Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
+                                       createToken("wizard", 7, 13)};
+    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 16, inputTokens), 2, 2);
+
+    assertTokenStreamContents(filter,
+                              new String[] {"purple", "purple wizard", "wizard", "wizard _"},
+                              new int[] {0, 0, 7, 7},
+                              new int[] {6, 13, 13, 16},
+                              new int[] {1, 0, 1, 0},
+                              16);
+  }
+
+  public void testTwoTrailingHoles() throws IOException {
+    // Analyzing "purple wizard of the", where of and the are removed as
+    // stopwords, leaving two trailing holes:
+    Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
+                                       createToken("wizard", 7, 13)};
+    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 2);
+
+    assertTokenStreamContents(filter,
+                              new String[] {"purple", "purple wizard", "wizard", "wizard _"},
+                              new int[] {0, 0, 7, 7},
+                              new int[] {6, 13, 13, 20},
+                              new int[] {1, 0, 1, 0},
+                              20);
+  }
+
+  public void testTwoTrailingHolesTriShingle() throws IOException {
+    // Analyzing "purple wizard of the", where of and the are removed as
+    // stopwords, leaving two trailing holes:
+    Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
+                                       createToken("wizard", 7, 13)};
+    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+
+    assertTokenStreamContents(filter,
+                              new String[] {"purple", "purple wizard", "purple wizard _", "wizard", "wizard _", "wizard _ _"},
+                              new int[] {0, 0, 0, 7, 7, 7},
+                              new int[] {6, 13, 20, 13, 20, 20},
+                              new int[] {1, 0, 0, 1, 0, 0},
+                              20);
+  }
 }
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java	(revision 1523288)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java	(working copy)
@@ -147,6 +147,12 @@
    * true if no shingles have been output yet (for outputUnigramsIfNoShingles).
    */
   boolean noShingleOutput = true;
+
+  /**
+   * Holds the State after input.end() was called, so we can
+   * restore it in our end() impl.
+   */
+  private State endState;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -279,7 +285,7 @@
   }
 
   @Override
-  public final boolean incrementToken() throws IOException {
+  public boolean incrementToken() throws IOException {
     boolean tokenAvailable = false;
     int builtGramSize = 0;
     if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
@@ -364,39 +370,63 @@
       }
       isNextInputStreamToken = false;
       newTarget.isFiller = false;
-    } else if (!exhausted && input.incrementToken()) {
-      if (null == target) {
-        newTarget = new InputWindowToken(cloneAttributes());
+    } else if (!exhausted) {
+      if (input.incrementToken()) {
+        if (null == target) {
+          newTarget = new InputWindowToken(cloneAttributes());
+        } else {
+          this.copyTo(target.attSource);
+        }
+        if (posIncrAtt.getPositionIncrement() > 1) {
+          // Each output shingle must contain at least one input token,
+          // so no more than (maxShingleSize - 1) filler tokens will be inserted.
+          numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
+          // Save the current token as the next input stream token
+          if (null == nextInputStreamToken) {
+            nextInputStreamToken = cloneAttributes();
+          } else {
+            this.copyTo(nextInputStreamToken);
+          }
+          isNextInputStreamToken = true;
+          // A filler token occupies no space
+          newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
+          newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+          newTarget.isFiller = true;
+          --numFillerTokensToInsert;
+        } else {
+          newTarget.isFiller = false;
+        }
       } else {
-        this.copyTo(target.attSource);
-      }
-      if (posIncrAtt.getPositionIncrement() > 1) {
-        // Each output shingle must contain at least one input token,
-        // so no more than (maxShingleSize - 1) filler tokens will be inserted.
-        numFillerTokensToInsert
-          = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
-        // Save the current token as the next input stream token
-        if (null == nextInputStreamToken) {
-          nextInputStreamToken = cloneAttributes();
+        exhausted = true;
+        input.end();
+        endState = captureState();
+        numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
+        if (numFillerTokensToInsert > 0) {
+          nextInputStreamToken = new AttributeSource(getAttributeFactory());
+          nextInputStreamToken.addAttribute(CharTermAttribute.class);
+          OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class);
+          newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
+          // Recurse/loop just once:
+          return getNextToken(target);
         } else {
-          this.copyTo(nextInputStreamToken);
+          newTarget = null;
        }
-        isNextInputStreamToken = true;
-        // A filler token occupies no space
-        newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
-        newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
-        newTarget.isFiller = true;
-        --numFillerTokensToInsert;
-      } else {
-        newTarget.isFiller = false;
-      }
+      }
     } else {
       newTarget = null;
-      exhausted = true;
     }
     return newTarget;
   }
 
+  @Override
+  public void end() throws IOException {
+    if (!exhausted) {
+      super.end();
+    } else {
+      restoreState(endState);
+    }
+  }
+
   /**
    * Fills {@link #inputWindow} with input stream tokens, if available,
    * shifting to the right if the window was previously full.
@@ -445,6 +475,7 @@
     isOutputHere = false;
     noShingleOutput = true;
     exhausted = false;
+    endState = null;
     if (outputUnigramsIfNoShingles && ! outputUnigrams) {
       // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
       gramSize.minValue = minShingleSize;
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java	(revision 1523288)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java	(working copy)
@@ -17,6 +17,8 @@
  * limitations under the License.
  */
 
+import java.io.IOException;
+
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
@@ -34,10 +36,29 @@
   private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
-
+  private final int finalOffset;
+  private final int finalPosInc;
+
   public CannedTokenStream(Token... tokens) {
     this.tokens = tokens;
+    finalOffset = 0;
+    finalPosInc = 0;
   }
+
+  /** If you want trailing holes, pass a non-zero
+   *  finalPosInc. */
+  public CannedTokenStream(int finalPosInc, int finalOffset, Token... tokens) {
+    this.tokens = tokens;
+    this.finalOffset = finalOffset;
+    this.finalPosInc = finalPosInc;
+  }
+
+  @Override
+  public void end() throws IOException {
+    super.end();
+    posIncrAtt.setPositionIncrement(finalPosInc);
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
 
   @Override
   public boolean incrementToken() {