Index: src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java =================================================================== --- src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (revision 694680) +++ src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (arbetskopia) @@ -82,7 +82,44 @@ "word", "shingle", "word", "shingle", "word", "shingle", "word", "shingle", "word", "shingle", "word" }; + + public static final Token[] SIX_GRAM_NO_POSITIONS_TOKENS = new Token[] { + createToken("please", 0, 6), + createToken("please divide", 0, 13), + createToken("please divide this", 0, 18), + createToken("please divide this sentence", 0, 27), + createToken("please divide this sentence into", 0, 32), + createToken("please divide this sentence into shingles", 0, 39), + createToken("divide", 7, 13), + createToken("divide this", 7, 18), + createToken("divide this sentence", 7, 27), + createToken("divide this sentence into", 7, 32), + createToken("divide this sentence into shingles", 7, 39), + createToken("this", 14, 18), + createToken("this sentence", 14, 27), + createToken("this sentence into", 14, 32), + createToken("this sentence into shingles", 14, 39), + createToken("sentence", 19, 27), + createToken("sentence into", 19, 32), + createToken("sentence into shingles", 19, 39), + createToken("into", 28, 32), + createToken("into shingles", 28, 39), + createToken("shingles", 33, 39), + }; + public static final int[] SIX_GRAM_NO_POSITIONS_INCREMENTS = new int[] { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + }; + + public static final String[] SIX_GRAM_NO_POSITIONS_TYPES = new String[] { + "word", "shingle", "shingle", "shingle", "shingle", "shingle", + "word", "shingle", "shingle", "shingle", "shingle", + "word", "shingle", "shingle", "shingle", + "word", "shingle", "shingle", + "word", "shingle", + "word" + }; + public static final Token[] BI_GRAM_TOKENS_WITH_HOLES = new Token[] { createToken("please", 0, 6), createToken("please 
divide", 0, 13), @@ -158,6 +195,14 @@ this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES, BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES); } + + public void test6GramFilterNoPositions() throws IOException { + + ShingleFilter filter = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6); + filter.setUsingPositionIncrement(false); + this.shingleFilterTest(filter, SIX_GRAM_NO_POSITIONS_TOKENS, + SIX_GRAM_NO_POSITIONS_INCREMENTS, SIX_GRAM_NO_POSITIONS_TYPES); + } public void testTriGramFilter() throws IOException { this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS, @@ -168,7 +213,14 @@ int[] positionIncrements, String[] types) throws IOException { - TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize); + shingleFilterTest(new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize), + tokensToCompare, positionIncrements, types); + } + + protected void shingleFilterTest(TokenStream filter, Token[] tokensToCompare, + int[] positionIncrements, String[] types) + throws IOException { + int i = 0; final Token reusableToken = new Token(); for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) { Index: src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 694680) +++ src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (arbetskopia) @@ -70,6 +70,11 @@ * maximum shingle size (number of tokens) */ private int maxShingleSize; + + /** + * @see #setUsingPositionIncrement(boolean) + */ + private boolean usingPositionIncrement = true; /** * Constructs a ShingleFilter with the specified single size from the @@ -141,6 +146,24 @@ } /** + * If true, each original token (unigram), or the first related shingle when unigrams are not output, + * gets a {@link org.apache.lucene.analysis.Token#getPositionIncrement() positionIncrement} of 1;
+ * if false, every emitted token (unigrams included) gets a {@link org.apache.lucene.analysis.Token#getPositionIncrement() positionIncrement} of 0. + * <p>
+ * Default value is true. + * <p>
+ * This is typically set to false when the filter is used together with the QueryParser: + * with position increments enabled the parser will create a MultiPhraseQuery in which at least + * one word/shingle must be matched for each word/token, which is not desired in all situations. + * Setting this to false will instead create a PhraseQuery. + * + * @param usingPositionIncrement true to give each unigram (or, when unigrams are not output, the first related shingle) a position increment of 1; false to give every emitted token a position increment of 0. + */ + public void setUsingPositionIncrement(boolean usingPositionIncrement){ + this.usingPositionIncrement = usingPositionIncrement; + } + + /** * Clear the StringBuffers that are used for storing the output shingles. */ private void clearShingles() { @@ -255,7 +278,7 @@ if ((! shingleBuf.isEmpty()) && outputUnigrams) { Token unigram = (Token) shingleBuf.getFirst(); - unigram.setPositionIncrement(1); + unigram.setPositionIncrement(usingPositionIncrement ? 1 : 0); outputBuf.add(unigram); } @@ -277,7 +300,7 @@ buf.getChars(0, termLength, termBuffer, 0); shingle.setTermLength(termLength); if ((! outputUnigrams) && j == 1) { - shingle.setPositionIncrement(1); + shingle.setPositionIncrement(usingPositionIncrement ? 1 : 0); } else { shingle.setPositionIncrement(0); }