Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java	(revision 1452881)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java	(working copy)
@@ -105,6 +105,33 @@
         null,
         false);
   }
+
+  public void testFilterPositions() throws Exception {
+    TokenStream ts = new MockTokenizer(new StringReader("abcde vwxyz"), MockTokenizer.WHITESPACE, false);
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+    assertTokenStreamContents(tokenizer,
+        new String[]{"a","ab","abc","v","vw","vwx"},
+        new int[]{0,0,0,6,6,6},
+        new int[]{1,2,3,7,8,9},
+        null,
+        new int[]{1,0,0,1,0,0},
+        null,
+        null,
+        false);
+  }
+
+  public void testTokenizerPositions() throws Exception {
+    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3);
+    assertTokenStreamContents(tokenizer,
+        new String[]{"a","ab","abc"},
+        new int[]{0,0,0},
+        new int[]{1,2,3},
+        null,
+        new int[]{1,0,0},
+        null,
+        null,
+        false);
+  }
 
   public void testSmallTokenInStream() throws Exception {
     input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java	(revision 1452881)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java	(working copy)
@@ -17,14 +17,15 @@
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.Reader;
+
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.util.AttributeSource;
-import java.io.IOException;
-import java.io.Reader;
-
 /**
  * Tokenizes the input from an edge into n-grams of given size(s).
  *
@@ -39,6 +40,7 @@
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
   /** Specifies which side of the input the n-gram should be generated from */
   public static enum Side {
@@ -214,6 +216,8 @@
       if (inLen == 0) {
         return false;
       }
+    } else {
+      posIncrAtt.setPositionIncrement(0);
     }
 
     // if the remaining input is too short, we can't generate any n-grams
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java	(revision 1452881)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java	(working copy)
@@ -21,6 +21,7 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 
 import java.io.IOException;
 
@@ -73,9 +74,11 @@
   private int tokStart;
   private int tokEnd; // only used if the length changed before this filter
   private boolean hasIllegalOffsets; // only if the length changed before this filter
+  private int savePosIncr;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
   /**
    * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -132,6 +135,7 @@
           // if length by start + end offsets doesn't match the term text then assume
           // this is a synonym and don't adjust the offsets.
           hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
+          savePosIncr = posIncrAtt.getPositionIncrement();
         }
       }
       if (curGramSize <= maxGram) {
@@ -146,6 +150,12 @@
         } else {
           offsetAtt.setOffset(tokStart + start, tokStart + end);
         }
+        // first ngram gets increment, others don't
+        if (curGramSize == minGram) {
+          posIncrAtt.setPositionIncrement(savePosIncr);
+        } else {
+          posIncrAtt.setPositionIncrement(0);
+        }
         termAtt.copyBuffer(curTermBuffer, start, curGramSize);
         curGramSize++;
         return true;
Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt	(revision 1452881)
+++ lucene/CHANGES.txt	(working copy)
@@ -69,6 +69,10 @@
   the former. Existing facet indices must be reindexed. (Robert Muir,
   Shai Erera, Mike McCandless)
 
+* LUCENE-4810: EdgeNGramTokenFilter no longer increments position for
+  multiple ngrams derived from the same input token. (Walter Underwood
+  via Mike McCandless)
+
 Optimizations
 
 * LUCENE-4687: BloomFilterPostingsFormat now lazily initializes delegate
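
For reviewers, a minimal standalone sketch of the behavior this patch establishes (not part of the patch itself). It runs the filter over the same "abcde vwxyz" input as testFilterPositions and prints each gram with its position increment. The EdgeNGramTokenFilter(TokenStream, Side, int, int) constructor and the attribute classes are the ones exercised above; the WhitespaceTokenizer(Version, Reader) constructor (standing in for the test-only MockTokenizer) and the demo class name are assumptions.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

// Hypothetical demo class, not part of the patch.
public class EdgeNGramPositionsDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42, new StringReader("abcde vwxyz"));
    ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // With the patch, only the first gram of each source token advances the
      // position; the longer grams stack at the same position:
      //   a:1 ab:0 abc:0 v:1 vw:0 vwx:0
      System.out.println(termAtt.toString() + ":" + posIncrAtt.getPositionIncrement());
    }
    ts.end();
    ts.close();
  }
}

Stacking the grams at their source token's position is what produces the expected increments ({1,0,0,1,0,0}) asserted in testFilterPositions, and it keeps position-sensitive queries over edge-ngrammed fields from treating each gram as a separate token position.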