Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java =================================================================== --- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java (revision 1347557) +++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java (working copy) @@ -70,6 +70,10 @@ "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically" }; + protected static final String[] phareWithRepeatedTerms = { + "tomasso apple's tree and mandarino's tree are beautiful" + }; + // test data for LUCENE-1448 bug protected static final String[] biMVValues = { "\nLucene/Solr does not require such additional hardware.", @@ -423,6 +427,14 @@ make1dmfIndex( longMVValues ); } + protected void makeIndexRepeatedTerms() throws Exception { + // 1111111111222222222233333333334444444444555555 + //01234567890123456789012345678901234567890123456789012345 + //tomasso apple's tree and mandarino's tree are beautiful + //(1) 2 (3) 4 5 (6) 7 (8) + make1dmfIndex(phareWithRepeatedTerms); + } + protected void makeIndexLongMVB() throws Exception { // "*" ... LF Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java =================================================================== --- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java (revision 1347557) +++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java (working copy) @@ -178,7 +178,29 @@ assertEquals( "searchengines(1.0)((102,116))", fpl.phraseList.get( 0 ).toString() ); assertEquals( "searchengines(1.0)((157,171))", fpl.phraseList.get( 1 ).toString() ); } + + public void testRepeatedTermsWithSlop() throws Exception { + makeIndexRepeatedTerms(); + + FieldQuery fq = new FieldQuery( pqF(1F,5,"tomasso", "tree" ,"beautiful"),true,true); + FieldTermStack stack = new FieldTermStack( reader, 0, F, fq); + FieldPhraseList fpl = new FieldPhraseList(stack, fq); + + assertEquals(1,fpl.phraseList.size()); + assertEquals("tomassotreetreebeautiful(1.0)((0,7)(16,20)(38,42)(47,56))",fpl.phraseList.get(0).toString()); + } + public void testReversedTermsWithSlop() throws Exception { + makeIndexRepeatedTerms(); + + FieldQuery fq = new FieldQuery( pqF(1F,3,"beautiful", "tree"),true,true); + FieldTermStack stack = new FieldTermStack( reader, 0, F, fq); + FieldPhraseList fpl = new FieldPhraseList(stack, fq); + + assertEquals(1,fpl.phraseList.size()); + assertEquals("treebeautiful(1.0)((38,42)(47,56))",fpl.phraseList.get(0).toString()); + } + public void test1PhraseLongMVB() throws Exception { makeIndexLongMVB(); Index: lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java =================================================================== --- lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (revision 1347557) +++ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (working copy) @@ -18,12 +18,15 @@ import java.io.IOException; import java.util.Collections; +import java.util.Comparator; +import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.Set; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; @@ -121,9 +124,51 @@ termList.add( new TermInfo( term, dpEnum.startOffset(), dpEnum.endOffset(), pos, weight ) ); } } - - // sort by position - Collections.sort(termList); + boolean isInverted = false; + //Check inverted sloop query + if(fieldQuery.hasSlop()) { + + Set termsText = new LinkedHashSet(); + int pos = -1; + int max = -1; + for (int i=0; i< termList.size(); i++) { + TermInfo term = termList.get(i); + if(term.position > max) { + max = term.position; + pos = i; + } + termsText.add(term.text); + } + if(pos == 0 && termList.size() > 1) { + isInverted = true; + } + Term[] orderedTerms = fieldQuery.getOrderedTerms(); + if(termsText.size() == orderedTerms.length) { + int i = 0; + for (String term : termsText) { + if(term.equals(orderedTerms[i].text())) { + i++; + } + else { + isInverted = false; + break; + } + } + fieldQuery.setInverted(isInverted); + } + } + //sort by position + if(isInverted) { + Collections.sort(termList, new Comparator() { + + public int compare(TermInfo o1, TermInfo o2) { + return o2.position - o1.position; + } + + }); + } else { + Collections.sort( termList ); + } } /** Index: lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java =================================================================== --- lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (revision 1347557) +++ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (working copy) @@ -22,6 +22,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; @@ -54,6 +55,14 @@ Map> termSetMap = new HashMap>(); int termOrPhraseNumber; // used for colored tag support + + private boolean isPhraseQuery; + + private boolean hasSlop; + + private Term[] orderedTerm = null; + + private boolean isInverted = false; // The maximum number of different matching terms accumulated from any one MultiTermQuery private static final int MAX_MTQ_TERMS = 1024; @@ -84,6 +93,25 @@ FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ) throws IOException { this (query, null, phraseHighlight, fieldMatch); } + + public Term[] getOrderedTerms() { + return this.orderedTerm; + } + public void setInverted(boolean isInverted) { + this.isInverted = isInverted; + } + + public boolean isInverted() { + return isInverted; + } + + boolean isPhraseQuery() { + return isPhraseQuery; + } + + boolean hasSlop() { + return hasSlop; + } void flatten( Query sourceQuery, IndexReader reader, Collection flatQueries ) throws IOException{ if( sourceQuery instanceof BooleanQuery ){ @@ -110,10 +138,14 @@ flatten(mtqTerms, reader, flatQueries); } else if( sourceQuery instanceof PhraseQuery ){ + this.isPhraseQuery = true; + this.hasSlop = ((PhraseQuery) sourceQuery).getSlop() != 0; if( !flatQueries.contains( sourceQuery ) ){ PhraseQuery pq = (PhraseQuery)sourceQuery; - if( pq.getTerms().length > 1 ) + if( pq.getTerms().length > 1 ) { flatQueries.add( pq ); + orderedTerm = pq.getTerms(); + } else if( pq.getTerms().length == 1 ){ flatQueries.add( new TermQuery( pq.getTerms()[0] ) ); } @@ -327,6 +359,7 @@ int slop; // valid if terminal == true and phraseHighlight == true float boost; // valid if terminal == true int termOrPhraseNumber; // valid if terminal == true + int totalTerms; FieldQuery fieldQuery; Map subMap = new HashMap(); @@ -355,10 +388,12 @@ else if( query instanceof PhraseQuery ){ PhraseQuery pq = (PhraseQuery)query; Term[] terms = pq.getTerms(); + totalTerms = terms.length; Map map = subMap; QueryPhraseMap qpm = null; for( Term term : terms ){ qpm = getOrNewMap( map, term.text() ); + qpm.totalTerms = totalTerms; map = qpm.subMap; } qpm.markTerminal( pq.getSlop(), pq.getBoost() ); @@ -419,10 +454,49 @@ int pos = phraseCandidate.get( 0 ).getPosition(); for( int i = 1; i < phraseCandidate.size(); i++ ){ int nextPos = phraseCandidate.get( i ).getPosition(); - if( Math.abs( nextPos - pos - 1 ) > slop ) return false; + if( Math.abs( nextPos - pos + getDiff() ) > slop ) return false; pos = nextPos; } + if(slop > 0) { + checkSlop(phraseCandidate); + } return true; } + + private void checkSlop(List phraseCandidate) { + int slop = calculateSlop(); + int firstPos = -1; + int firstCandidatePos = 0; + List toRemove = new LinkedList(); + int pos = firstPos = phraseCandidate.get(firstCandidatePos).getPosition(); + for (int i = 1; i < phraseCandidate.size(); i++) { + TermInfo current = phraseCandidate.get(i); + int nextPos = current.getPosition(); + if( Math.abs( nextPos - pos + getDiff() ) > slop) { + toRemove.add(current); + } + //check first and currentLast + if(!toRemove.contains(current) && Math.abs( nextPos - firstPos + getDiff() ) > slop) { + toRemove.add(phraseCandidate.get(firstCandidatePos)); + firstCandidatePos++; + } + pos = nextPos; + } + for (TermInfo wrongElement : toRemove) { + phraseCandidate.remove(wrongElement); + } + } + + private int getDiff() { + return (fieldQuery.isInverted) ? + 1 : - 1; + } + + private int calculateSlop() { + if(!fieldQuery.isInverted){ + return slop + (totalTerms -2); + } else { + return slop -2 +(totalTerms -2); + } + } } } Index: lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java =================================================================== --- lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (revision 1347557) +++ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (working copy) @@ -17,6 +17,8 @@ */ import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; import java.util.LinkedList; import java.util.List; @@ -30,7 +32,7 @@ public class FieldPhraseList { LinkedList phraseList = new LinkedList(); - + List visitedTerms = new LinkedList(); /** * create a FieldPhraseList that has no limit on the number of phrases to analyze * @@ -66,6 +68,7 @@ while( !fieldTermStack.isEmpty() && (phraseList.size() < phraseLimit) ) { phraseCandidate.clear(); + visitedTerms.clear(); TermInfo ti = fieldTermStack.pop(); currMap = fieldQuery.getFieldTermMap( field, ti.getText() ); @@ -75,14 +78,17 @@ // if found, search the longest phrase phraseCandidate.add( ti ); + addVisitedTerms(ti,fieldQuery); while( true ){ ti = fieldTermStack.pop(); nextMap = null; if( ti != null ) nextMap = currMap.getTermMap( ti.getText() ); - if( ti == null || nextMap == null ){ - if( ti != null ) + if( ti == null || (nextMap == null && !(fieldQuery.hasSlop() && visitedTerms.contains(ti.getText())))) { + if( ti != null ) { fieldTermStack.push( ti ); + visitedTerms.remove(ti.getText()); + } if( currMap.isValidTermOrPhrase( phraseCandidate ) ){ addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); } @@ -100,11 +106,20 @@ } else{ phraseCandidate.add( ti ); - currMap = nextMap; + if( !visitedTerms.contains(ti.getText())) { + addVisitedTerms(ti, fieldQuery); + currMap = nextMap; + } } } } } + + private void addVisitedTerms(TermInfo ti,FieldQuery query) { + if(query.isPhraseQuery() && query.hasSlop()) { + visitedTerms.add(ti.getText()); + } + } public void addIfNoOverlap( WeightedPhraseInfo wpi ){ for( WeightedPhraseInfo existWpi : getPhraseList() ){ @@ -161,6 +176,14 @@ } public WeightedPhraseInfo( LinkedList terms, float boost, int seqnum ){ + //sort terms necessary only for inverted slop query + Collections.sort(terms, new Comparator() { + + public int compare(TermInfo o1, TermInfo o2) { + // TODO Auto-generated method stub + return o1.getPosition() - o2.getPosition(); + } + }); this.boost = boost; this.seqnum = seqnum;