Index: lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java (revision 1167477)
+++ lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java (working copy)
@@ -693,4 +693,44 @@
     s.close();
     dir.close();
   }
+
+  public void testOptimizeForNgram() throws Exception {
+    // bi-gram, "ABC" => AB/BC: fewer than three terms, so nothing is dropped (the query is still flagged as optimized)
+    PhraseQuery pq1 = new PhraseQuery();
+    pq1.add(new Term("f", "AB"));
+    pq1.add(new Term("f", "BC"));
+    assertFalse(pq1.optimized());
+
+    pq1.optimizeForNgram();
+    assertTrue(pq1.optimized());
+    assertArrayEquals(new Term[]{new Term("f", "AB"), new Term("f", "BC")}, pq1.getTerms());
+    assertArrayEquals(new int[]{0, 1}, pq1.getPositions());
+
+    // bi-gram, "ABCD" => AB/BC/CD: the middle gram BC is dropped, leaving AB at position 0 and CD at position 2
+    PhraseQuery pq2 = new PhraseQuery();
+    pq2.add(new Term("f", "AB"));
+    pq2.add(new Term("f", "BC"));
+    pq2.add(new Term("f", "CD"));
+    assertFalse(pq2.optimized());
+
+    pq2.optimizeForNgram();
+    assertTrue(pq2.optimized());
+    assertArrayEquals(new Term[]{new Term("f", "AB"), new Term("f", "CD")}, pq2.getTerms());
+    assertArrayEquals(new int[]{0, 2}, pq2.getPositions());
+
+    // tri-gram, "ABCDEFGH" => ABC/BCD/CDE/DEF/EFG/FGH: every 3rd gram plus the last one is kept => ABC(0), DEF(3), FGH(5)
+    PhraseQuery pq3 = new PhraseQuery();
+    pq3.add(new Term("f", "ABC"));
+    pq3.add(new Term("f", "BCD"));
+    pq3.add(new Term("f", "CDE"));
+    pq3.add(new Term("f", "DEF"));
+    pq3.add(new Term("f", "EFG"));
+    pq3.add(new Term("f", "FGH"));
+    assertFalse(pq3.optimized());
+
+    pq3.optimizeForNgram();
+    assertTrue(pq3.optimized());
+    assertArrayEquals(new Term[]{new Term("f", "ABC"), new Term("f", "DEF"), new Term("f", "FGH")}, pq3.getTerms());
+    assertArrayEquals(new int[]{0, 3, 5}, pq3.getPositions());
+  }
 }
Index: lucene/src/java/org/apache/lucene/search/PhraseQuery.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/PhraseQuery.java
(revision 1167477)
+++ lucene/src/java/org/apache/lucene/search/PhraseQuery.java (working copy)
@@ -18,6 +18,7 @@
  */
 
 import java.io.IOException;
+import java.util.Iterator;
 import java.util.Set;
 import java.util.ArrayList;
 
@@ -47,6 +48,7 @@
   private ArrayList<Integer> positions = new ArrayList<Integer>(4);
   private int maxPosition = 0;
   private int slop = 0;
+  private boolean optimized = false;
 
   /** Constructs an empty phrase query. */
   public PhraseQuery() {}
@@ -116,7 +118,107 @@
       result[i] = positions.get(i).intValue();
     return result;
   }
+
+  /**
+   * Returns whether the n-gram optimizer has already run on this query.
+   * The flag is set as soon as {@link #optimizeForNgram(int)} executes with a
+   * slop of 0, whether or not any term was actually removed; it stays false
+   * when the optimizer is skipped because the slop is non-zero.
+   *
+   * @return true if the n-gram optimizer has run, otherwise false
+   */
+  public boolean optimized() {
+    return optimized;
+  }
+
+  /**
+   * Executes the n-gram optimizer, taking the gram size from the length of
+   * the first term's text. Does nothing if no term has been added yet. This
+   * should be called after all terms have been added with
+   * {@link #add(Term)} or {@link #add(Term, int)}.
+   *
+   * @see #optimizeForNgram(int)
+   */
+  public void optimizeForNgram() {
+    if (terms.isEmpty()) {
+      return;
+    }
+    optimizeForNgram(terms.get(0).text().length());
+  }
+
+  /**
+   * Executes the n-gram optimizer: in a phrase made of overlapping n-grams,
+   * keeps only every n-th term plus the last term (each at its original
+   * position) and removes the terms in between. This should be called after
+   * all terms have been added with {@link #add(Term)} or
+   * {@link #add(Term, int)}.
+   * <p>
+   * The call is ignored if the optimizer has already run or the slop is not
+   * 0. Otherwise the query is marked as optimized, and terms are removed only
+   * if n is at least 2, there are at least three terms, the positions are
+   * consecutive, every term text is n chars long and each term overlaps the
+   * next one by n - 1 chars. If any of these checks fails the terms and
+   * positions are left untouched, because dropping a term that its
+   * neighbours do not imply would change which documents match.
+   *
+   * @param n gram size (N of N-gram)
+   */
+  public void optimizeForNgram(int n) {
+    if (optimized || slop != 0) {
+      return;
+    }
+    optimized = true;
+
+    // n < 2 means the grams do not overlap; with fewer than three terms
+    // there is no term between the first and the last one to remove
+    if (n < 2 || terms.size() < 3) {
+      return;
+    }
+    if (!isOverlappingNgramSequence(n)) {
+      return;
+    }
+
+    // the last term is always kept so the end of the phrase stays covered
+    final int lastIndex = terms.size() - 1;
+    int index = 0;
+    Iterator<Integer> pi = positions.iterator();
+    for (Iterator<Term> ti = terms.iterator(); ti.hasNext() && pi.hasNext(); index++) {
+      pi.next();
+      ti.next();
+      if (index % n != 0 && index < lastIndex) {
+        pi.remove();
+        ti.remove();
+      }
+    }
+  }
+
+  /**
+   * Checks that the terms form one unbroken run of overlapping n-grams:
+   * consecutive positions, every term text n chars long, and the last
+   * n - 1 chars of each term equal to the first n - 1 chars of the next.
+   */
+  private boolean isOverlappingNgramSequence(int n) {
+    if (positions.size() != terms.size()) {
+      return false;
+    }
+    final int firstPosition = positions.get(0).intValue();
+    String previous = null;
+    for (int i = 0; i < terms.size(); i++) {
+      if (positions.get(i).intValue() != firstPosition + i) {
+        return false;
+      }
+      String text = terms.get(i).text();
+      if (text.length() != n) {
+        return false;
+      }
+      if (previous != null && !previous.regionMatches(1, text, 0, n - 1)) {
+        return false;
+      }
+      previous = text;
+    }
+    return true;
+  }
 
   @Override
   public Query rewrite(IndexReader reader) throws IOException {
     if (terms.size() == 1) {