Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (revision 1137472) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (working copy) @@ -16,6 +16,7 @@ * limitations under the License. */ +import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; @@ -23,14 +24,18 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.regex.Pattern; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.RegexpQuery; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; /** @@ -48,6 +53,7 @@ // fieldMatch==true, Map<fieldName,setOfTermsInQueries> // fieldMatch==false, Map<null,setOfTermsInQueries> Map<String, Set<String>> termSetMap = new HashMap<String, Set<String>>(); + Map<String, List<Pattern>> termPatternsMap = new HashMap<String, List<Pattern>>(); int termOrPhraseNumber; // used for colored tag support @@ -85,7 +91,12 @@ flatten( query, flatQueries ); } } - else if( sourceQuery instanceof TermQuery ){ + else if( sourceQuery instanceof TermQuery || + sourceQuery instanceof WildcardQuery || + sourceQuery instanceof PrefixQuery || + sourceQuery instanceof RegexpQuery + ) + { if( !flatQueries.contains( sourceQuery ) ) flatQueries.add( sourceQuery ); } @@ -200,15 +211,55 @@ */ private String getKey( Query query ){ if( !fieldMatch ) return null; + String key = getQueryField (query); + if (key != null) + return key; + else + throw new RuntimeException( "query \"" + query.toString() + "\" 
must be flattened first." ); + } + + /** + * @param query a non-composite query + * @return some text representation of the query sufficient for + * {@link QueryPhraseMap#searchPhrase} to match terms with + */ + static String getQueryTermText (Query query) { + if( query instanceof TermQuery ) { + return ((TermQuery)query).getTerm().text(); + } + if (query instanceof WildcardQuery) { + String regex = ((WildcardQuery)query).getTerm().text(); + return regex.replace("*", ".*").replace('?', '.'); + } + if (query instanceof PrefixQuery) { + return ((PrefixQuery)query).getPrefix().text() + ".*"; + } + if (query instanceof RegexpQuery) { + String regex = query.toString(((RegexpQuery)query).getField()); + if (regex.contains("^")) + regex = regex.substring(0, regex.indexOf('^')); + return regex; + } + return null; + } + + static String getQueryField (Query query) { if( query instanceof TermQuery ) return ((TermQuery)query).getTerm().field(); + else if (query instanceof WildcardQuery) { + String regex = ((WildcardQuery)query).getTerm().field(); + return regex.replace("*", ".*").replace('?', '.'); + } + else if (query instanceof PrefixQuery) { + return ((PrefixQuery)query).getPrefix().field() + ".*"; + } + else if (query instanceof RegexpQuery) { + return ((RegexpQuery) query).getField(); + } else if ( query instanceof PhraseQuery ){ - PhraseQuery pq = (PhraseQuery)query; - Term[] terms = pq.getTerms(); - return terms[0].field(); + return ((PhraseQuery)query).getTerms()[0].field(); } - else - throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." 
); + return null; } /* @@ -235,19 +286,27 @@ */ void saveTerms( Collection<Query> flatQueries ){ for( Query query : flatQueries ){ - Set<String> termSet = getTermSet( query ); - if( query instanceof TermQuery ) - termSet.add( ((TermQuery)query).getTerm().text() ); - else if( query instanceof PhraseQuery ){ + Set<String> termSet = getFieldTermSet( query ); + if( query instanceof PhraseQuery ){ for( Term term : ((PhraseQuery)query).getTerms() ) termSet.add( term.text() ); } - else - throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); + else if (query instanceof TermQuery) { + termSet.add (((TermQuery)query).getTerm().text()); + } + else { + String queryTermText = getQueryTermText(query); + if (queryTermText != null) { + List<Pattern> fieldTermPatterns = getFieldTermPatterns (query); + fieldTermPatterns.add(Pattern.compile(queryTermText)); + } else { + throw new RuntimeException( "query \"" + query.toString() + "\" must be flattened first." ); + } + } } } - private Set<String> getTermSet( Query query ){ + private Set<String> getFieldTermSet( Query query ){ String key = getKey( query ); Set<String> set = termSetMap.get( key ); if( set == null ){ @@ -257,9 +316,41 @@ return set; } + private List<Pattern> getFieldTermPatterns (Query query) { + if (termPatternsMap == null) { + termPatternsMap = new HashMap<String, List<Pattern>>(); + return null; + } + String key = getKey(query); + List<Pattern> patterns = termPatternsMap.get(key); + if (patterns == null) { + patterns = new ArrayList<Pattern>(); + termPatternsMap.put(key, patterns); + } + return patterns; + } + Set<String> getTermSet( String field ){ return termSetMap.get( fieldMatch ? field : null ); } + + public boolean matchesTerm (String field, String term) { + Set<String> fieldTerms = getTermSet (field); + if (fieldTerms != null && fieldTerms.contains(term)) { + return true; + } + if (termPatternsMap != null) { + List<Pattern> fieldTermPatterns = termPatternsMap.get(fieldMatch ? 
field : null); + if (fieldTermPatterns != null) { + for (Pattern pattern : fieldTermPatterns) { + if (pattern.matcher(term).matches()) { + return true; + } + } + } + } + return false; + } /** * @@ -269,7 +360,7 @@ */ public QueryPhraseMap getFieldTermMap( String fieldName, String term ){ QueryPhraseMap rootMap = getRootMap( fieldName ); - return rootMap == null ? null : rootMap.subMap.get( term ); + return rootMap == null ? null : rootMap.getTermMap( term ); } /** @@ -298,8 +389,11 @@ int slop; // valid if terminal == true and phraseHighlight == true float boost; // valid if terminal == true int termOrPhraseNumber; // valid if terminal == true + // if terminal and term is to be interpreted as a regex + Pattern regex; FieldQuery fieldQuery; Map<String, QueryPhraseMap> subMap = new HashMap<String, QueryPhraseMap>(); + ArrayList<QueryPhraseMap> patternMaps; // submaps with regexes public QueryPhraseMap( FieldQuery fieldQuery ){ this.fieldQuery = fieldQuery; @@ -307,9 +401,18 @@ void addTerm( Term term, float boost ){ QueryPhraseMap map = getOrNewMap( subMap, term.text() ); - map.markTerminal( boost ); + map.markTerminal( 0, boost, null ); } + void addTermPattern( Pattern termPattern, float boost ){ + if (patternMaps == null) { + patternMaps = new ArrayList<QueryPhraseMap>(); + } + QueryPhraseMap map = new QueryPhraseMap(fieldQuery); + patternMaps.add(map); + map.markTerminal( 0, boost, termPattern ); + } + private QueryPhraseMap getOrNewMap( Map<String, QueryPhraseMap> subMap, String term ){ QueryPhraseMap map = subMap.get( term ); if( map == null ){ @@ -332,24 +435,36 @@ qpm = getOrNewMap( map, term.text() ); map = qpm.subMap; } - qpm.markTerminal( pq.getSlop(), pq.getBoost() ); + qpm.markTerminal( pq.getSlop(), pq.getBoost(), null ); } - else - throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." 
); + else { + String queryTermText = FieldQuery.getQueryTermText (query); + if (queryTermText != null) + addTermPattern (Pattern.compile(queryTermText), query.getBoost()); + else + throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); + } } public QueryPhraseMap getTermMap( String term ){ - return subMap.get( term ); + QueryPhraseMap map = subMap.get( term ); + if (map != null) + return map; + if (patternMaps != null) { + for (QueryPhraseMap m : patternMaps) { + if (m.regex.matcher(term).matches()) + // and if there are multiple matches? will it influence the score only? + return m; + } + } + return null; } - private void markTerminal( float boost ){ - markTerminal( 0, boost ); - } - - private void markTerminal( int slop, float boost ){ + private void markTerminal( int slop, float boost, Pattern regex ){ this.terminal = true; this.slop = slop; this.boost = boost; + this.regex = regex; this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber(); } @@ -372,7 +487,8 @@ public QueryPhraseMap searchPhrase( final List<TermInfo> phraseCandidate ){ QueryPhraseMap currMap = this; for( TermInfo ti : phraseCandidate ){ - currMap = currMap.subMap.get( ti.getText() ); + String termText = ti.getText(); + currMap = currMap.getTermMap( termText ); if( currMap == null ) return null; } return currMap.isValidTermOrPhrase( phraseCandidate ) ? 
currMap : null; Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (revision 1137472) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (working copy) @@ -81,16 +81,18 @@ Set<String> termSet = fieldQuery.getTermSet( fieldName ); // just return to make null snippet if un-matched fieldName specified when fieldMatch == true if( termSet == null ) return; + final CharsRef spare = new CharsRef(); for( BytesRef term : tpv.getTerms() ){ - if( !termSet.contains( term.utf8ToChars(spare).toString() ) ) continue; + String termString = term.utf8ToChars(spare).toString(); + if( !fieldQuery.matchesTerm(fieldName, termString) ) continue; int index = tpv.indexOf( term ); TermVectorOffsetInfo[] tvois = tpv.getOffsets( index ); if( tvois == null ) return; // just return to make null snippets int[] poss = tpv.getTermPositions( index ); if( poss == null ) return; // just return to make null snippets for( int i = 0; i < tvois.length; i++ ) - termList.add( new TermInfo( term.utf8ToChars(spare).toString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) ); + termList.add( new TermInfo( termString, tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) ); } // sort by position Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java (revision 1137472) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java (working copy) @@ -22,9 +22,11 @@ import java.util.Map; import java.util.Set; +import org.apache.lucene.index.Term; import 
org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.RegexpQuery; import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap; import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; @@ -834,4 +836,41 @@ phraseCandidate.add( new TermInfo( "c", 4, 5, 6 ) ); assertNull( fq.searchPhrase( F, phraseCandidate ) ); } + + public void testHighlightQuery() throws Exception { + Query query = paW.parse("head*tail"); + FieldQuery fq = new FieldQuery( query, true, true ); + QueryPhraseMap qpm = fq.getFieldTermMap(F, "headandtail"); + assertNotNull (qpm); + assertNotNull(qpm.regex); + assertNull (fq.getFieldTermMap(F, "head")); + List<TermInfo> phraseCandidate = new ArrayList<TermInfo>(); + phraseCandidate.add( new TermInfo( "headandtail", 0, 12, 0 ) ); + assertNotNull (fq.searchPhrase(F, phraseCandidate)); + } + + public void testPrefixQuery() throws Exception { + Query query = paW.parse("head*"); + FieldQuery fq = new FieldQuery( query, false, false); + QueryPhraseMap qpm = fq.getFieldTermMap(null, "headandtail"); + assertNotNull (qpm); + assertNotNull (qpm.regex); + assertNull (fq.getFieldTermMap(null, "tail")); + List<TermInfo> phraseCandidate = new ArrayList<TermInfo>(); + phraseCandidate.add( new TermInfo( "headandtail", 0, 12, 0 ) ); + assertNotNull (fq.searchPhrase(null, phraseCandidate)); + } + + public void testRegexpQuery() throws Exception { + Term term = new Term(F, "h[adenti]+l"); + Query query = new RegexpQuery (term); + FieldQuery fq = new FieldQuery( query, true, true); + QueryPhraseMap qpm = fq.getFieldTermMap(F, "headandtail"); + assertNotNull (qpm); + assertNotNull (qpm.regex); + assertNull (fq.getFieldTermMap(F, "tail")); + List<TermInfo> phraseCandidate = new ArrayList<TermInfo>(); + phraseCandidate.add( new TermInfo( "headandtail", 0, 12, 0 ) ); + assertNotNull (fq.searchPhrase(F, phraseCandidate)); + } } Index: 
lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java (revision 1137472) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java (working copy) @@ -158,4 +158,15 @@ assertEquals( "ee(90,92,63)", stack.pop().toString() ); assertEquals( "ed(91,93,64)", stack.pop().toString() ); } + + public void testWildcard() throws Exception { + makeIndexLongMV(); + FieldQuery fq = new FieldQuery( paW.parse("th*e"), true, true ); + FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); + assertEquals (4, stack.termList.size()); + assertEquals ("the(15,18,2)", stack.pop().toString()); + assertEquals ("these(133,138,20)", stack.pop().toString()); + assertEquals ("the(153,156,23)", stack.pop().toString()); + assertEquals ("the(195,198,31)", stack.pop().toString()); + } } Index: modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiQueryMaker.java =================================================================== --- modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiQueryMaker.java (revision 1137472) +++ modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiQueryMaker.java (working copy) @@ -84,7 +84,14 @@ new SpanFirstQuery(new SpanTermQuery(new Term(field, "ford")), 10), new SpanTermQuery(new Term(field, "credit")) }, 10, false), wcq, }; } - + /* +public static void main (String[] argv) { + List qs = new ArrayList(); + for (String q : STANDARD_QUERIES) + qs.add(q); + createQueries (qs, new StandardAnalyzer(Version.LUCENE_CURRENT)); +} +*/ /** * Parse the strings containing Lucene queries. 
* @@ -101,19 +108,20 @@ Object query = qs.get(i); Query q = null; if (query instanceof String) { - q = qp.parse((String) query); - + String qstring = (String) query; + q = qp.parse(qstring); + queries.add(q); + // Make a wildcard or prefix query + //qstring = qstring.replaceFirst("([A-Za-z])[a-z]{3}", "$1*"); + //q = qp.parse(qstring); + //queries.add(q); } else if (query instanceof Query) { q = (Query) query; - + queries.add(q); } else { System.err.println("Unsupported Query Type: " + query); } - if (q != null) { - queries.add(q); - } - } catch (Exception e) { e.printStackTrace(); } Index: solr/src/test/org/apache/solr/highlight/FastVectorHighlighterTest.java =================================================================== --- solr/src/test/org/apache/solr/highlight/FastVectorHighlighterTest.java (revision 1137472) +++ solr/src/test/org/apache/solr/highlight/FastVectorHighlighterTest.java (working copy) @@ -69,10 +69,14 @@ "id", "1")); assertU(commit()); assertU(optimize()); - assertQ("Basic summarization", - sumLRF.makeRequest("tv_text:vector"), - "//lst[@name='highlighting']/lst[@name='1']", - "//lst[@name='1']/arr[@name='tv_text']/str[.=' fast vector highlighter test ']" - ); + + String[] tests = new String[] { + "//lst[@name='highlighting']/lst[@name='1']", + "//lst[@name='1']/arr[@name='tv_text']/str[.=' fast vector highlighter test ']" + }; + assertQ("Basic summarization", sumLRF.makeRequest("tv_text:vector"), tests); + assertQ("Wildcard match", sumLRF.makeRequest("tv_text:v*r"), tests); + assertQ("Prefix match", sumLRF.makeRequest("tv_text:vec*"), tests); + assertQ("Regex match", sumLRF.makeRequest("tv_text:/v[a-z]+r/"), tests); } }