Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java (revision 1188460) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java (working copy) @@ -27,13 +27,13 @@ FieldQuery fq = new FieldQuery( tq( "a" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "a(1.0)((0,1))", fpl.phraseList.get( 0 ).toString() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "a(1.0)((0,1))", fpl.getPhraseList().get( 0 ).toString() ); fq = new FieldQuery( tq( "b" ), true, true ); stack = new FieldTermStack( reader, 0, F, fq ); fpl = new FieldPhraseList( stack, fq ); - assertEquals( 0, fpl.phraseList.size() ); + assertEquals( 0, fpl.getPhraseList().size() ); } public void test2TermsIndex() throws Exception { @@ -42,9 +42,9 @@ FieldQuery fq = new FieldQuery( tq( "a" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 2, fpl.phraseList.size() ); - assertEquals( "a(1.0)((0,1))", fpl.phraseList.get( 0 ).toString() ); - assertEquals( "a(1.0)((2,3))", fpl.phraseList.get( 1 ).toString() ); + assertEquals( 2, fpl.getPhraseList().size() ); + assertEquals( "a(1.0)((0,1))", fpl.getPhraseList().get( 0 ).toString() ); + assertEquals( "a(1.0)((2,3))", fpl.getPhraseList().get( 1 ).toString() ); } public void test1PhraseIndex() throws Exception { @@ -53,14 +53,14 @@ FieldQuery fq = new FieldQuery( pqF( "a", "b" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, 
fpl.phraseList.size() ); - assertEquals( "ab(1.0)((0,3))", fpl.phraseList.get( 0 ).toString() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "ab(1.0)((0,3))", fpl.getPhraseList().get( 0 ).toString() ); fq = new FieldQuery( tq( "b" ), true, true ); stack = new FieldTermStack( reader, 0, F, fq ); fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "b(1.0)((2,3))", fpl.phraseList.get( 0 ).toString() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "b(1.0)((2,3))", fpl.getPhraseList().get( 0 ).toString() ); } public void test1PhraseIndexB() throws Exception { @@ -72,8 +72,8 @@ FieldQuery fq = new FieldQuery( pqF( "ba", "ac" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "baac(1.0)((2,5))", fpl.phraseList.get( 0 ).toString() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "baac(1.0)((2,5))", fpl.getPhraseList().get( 0 ).toString() ); } public void test2ConcatTermsIndexB() throws Exception { @@ -85,9 +85,9 @@ FieldQuery fq = new FieldQuery( tq( "ab" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 2, fpl.phraseList.size() ); - assertEquals( "ab(1.0)((0,2))", fpl.phraseList.get( 0 ).toString() ); - assertEquals( "ab(1.0)((2,4))", fpl.phraseList.get( 1 ).toString() ); + assertEquals( 2, fpl.getPhraseList().size() ); + assertEquals( "ab(1.0)((0,2))", fpl.getPhraseList().get( 0 ).toString() ); + assertEquals( "ab(1.0)((2,4))", fpl.getPhraseList().get( 1 ).toString() ); } public void test2Terms1PhraseIndex() throws Exception { @@ -97,16 +97,16 @@ FieldQuery fq = new FieldQuery( pqF( "a", "b" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq 
); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "ab(1.0)((4,7))", fpl.phraseList.get( 0 ).toString() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "ab(1.0)((4,7))", fpl.getPhraseList().get( 0 ).toString() ); // phraseHighlight = false fq = new FieldQuery( pqF( "a", "b" ), false, true ); stack = new FieldTermStack( reader, 0, F, fq ); fpl = new FieldPhraseList( stack, fq ); - assertEquals( 2, fpl.phraseList.size() ); - assertEquals( "a(1.0)((2,3))", fpl.phraseList.get( 0 ).toString() ); - assertEquals( "ab(1.0)((4,7))", fpl.phraseList.get( 1 ).toString() ); + assertEquals( 2, fpl.getPhraseList().size() ); + assertEquals( "a(1.0)((2,3))", fpl.getPhraseList().get( 0 ).toString() ); + assertEquals( "ab(1.0)((4,7))", fpl.getPhraseList().get( 1 ).toString() ); } public void testPhraseSlop() throws Exception { @@ -115,10 +115,10 @@ FieldQuery fq = new FieldQuery( pqF( 2F, 1, "a", "c" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "ac(2.0)((4,5)(8,9))", fpl.phraseList.get( 0 ).toString() ); - assertEquals( 4, fpl.phraseList.get( 0 ).getStartOffset() ); - assertEquals( 9, fpl.phraseList.get( 0 ).getEndOffset() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "ac(2.0)((4,5)(8,9))", fpl.getPhraseList().get( 0 ).toString() ); + assertEquals( 4, fpl.getPhraseList().get( 0 ).getStartOffset() ); + assertEquals( 9, fpl.getPhraseList().get( 0 ).getEndOffset() ); } public void test2PhrasesOverlap() throws Exception { @@ -130,8 +130,8 @@ FieldQuery fq = new FieldQuery( query, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "abc(1.0)((2,7))", fpl.phraseList.get( 0 ).toString() ); + assertEquals( 1, fpl.getPhraseList().size() ); + 
assertEquals( "abc(1.0)((2,7))", fpl.getPhraseList().get( 0 ).toString() ); } public void test3TermsPhrase() throws Exception { @@ -140,8 +140,8 @@ FieldQuery fq = new FieldQuery( pqF( "a", "b", "c" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "abc(1.0)((6,11))", fpl.phraseList.get( 0 ).toString() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "abc(1.0)((6,11))", fpl.getPhraseList().get( 0 ).toString() ); } public void testSearchLongestPhrase() throws Exception { @@ -153,9 +153,9 @@ FieldQuery fq = new FieldQuery( query, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 2, fpl.phraseList.size() ); - assertEquals( "ab(1.0)((2,5))", fpl.phraseList.get( 0 ).toString() ); - assertEquals( "abc(1.0)((10,15))", fpl.phraseList.get( 1 ).toString() ); + assertEquals( 2, fpl.getPhraseList().size() ); + assertEquals( "ab(1.0)((2,5))", fpl.getPhraseList().get( 0 ).toString() ); + assertEquals( "abc(1.0)((10,15))", fpl.getPhraseList().get( 1 ).toString() ); } public void test1PhraseShortMV() throws Exception { @@ -164,8 +164,8 @@ FieldQuery fq = new FieldQuery( tq( "d" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "d(1.0)((9,10))", fpl.phraseList.get( 0 ).toString() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "d(1.0)((9,10))", fpl.getPhraseList().get( 0 ).toString() ); } public void test1PhraseLongMV() throws Exception { @@ -174,9 +174,9 @@ FieldQuery fq = new FieldQuery( pqF( "search", "engines" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - 
assertEquals( 2, fpl.phraseList.size() ); - assertEquals( "searchengines(1.0)((102,116))", fpl.phraseList.get( 0 ).toString() ); - assertEquals( "searchengines(1.0)((157,171))", fpl.phraseList.get( 1 ).toString() ); + assertEquals( 2, fpl.getPhraseList().size() ); + assertEquals( "searchengines(1.0)((102,116))", fpl.getPhraseList().get( 0 ).toString() ); + assertEquals( "searchengines(1.0)((157,171))", fpl.getPhraseList().get( 1 ).toString() ); } public void test1PhraseLongMVB() throws Exception { @@ -185,8 +185,8 @@ FieldQuery fq = new FieldQuery( pqF( "sp", "pe", "ee", "ed" ), true, true ); // "speed" -(2gram)-> "sp","pe","ee","ed" FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "sppeeeed(1.0)((88,93))", fpl.phraseList.get( 0 ).toString() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "sppeeeed(1.0)((88,93))", fpl.getPhraseList().get( 0 ).toString() ); } /* This test shows a big speedup from limiting the number of analyzed phrases in @@ -212,10 +212,10 @@ FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq, limit); if (limit < 0 || limit > 16000) - assertEquals( 16000, fpl.phraseList.size() ); + assertEquals( 16000, fpl.getPhraseList().size() ); else - assertEquals( limit, fpl.phraseList.size() ); - assertEquals( "a(1.0)((0,1))", fpl.phraseList.get( 0 ).toString() ); + assertEquals( limit, fpl.getPhraseList().size() ); + assertEquals( "a(1.0)((0,1))", fpl.getPhraseList().get( 0 ).toString() ); } */ } Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java (revision 1188460) +++ 
lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java (working copy) @@ -151,10 +151,10 @@ FieldQuery fq = new FieldQuery( pqF( "personal", "computer" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "personalcomputer(1.0)((3,5))", fpl.phraseList.get( 0 ).toString() ); - assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); - assertEquals( 5, fpl.phraseList.get( 0 ).getEndOffset() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "personalcomputer(1.0)((3,5))", fpl.getPhraseList().get( 0 ).toString() ); + assertEquals( 3, fpl.getPhraseList().get( 0 ).getStartOffset() ); + assertEquals( 5, fpl.getPhraseList().get( 0 ).getEndOffset() ); } public void testFieldPhraseListIndex1w2wSearch1partial() throws Exception { @@ -163,10 +163,10 @@ FieldQuery fq = new FieldQuery( tq( "computer" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "computer(1.0)((3,5))", fpl.phraseList.get( 0 ).toString() ); - assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); - assertEquals( 5, fpl.phraseList.get( 0 ).getEndOffset() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "computer(1.0)((3,5))", fpl.getPhraseList().get( 0 ).toString() ); + assertEquals( 3, fpl.getPhraseList().get( 0 ).getStartOffset() ); + assertEquals( 5, fpl.getPhraseList().get( 0 ).getEndOffset() ); } public void testFieldPhraseListIndex1w2wSearch1term1phrase() throws Exception { @@ -178,10 +178,10 @@ FieldQuery fq = new FieldQuery( bq, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertTrue( 
fpl.phraseList.get( 0 ).toString().indexOf( "(1.0)((3,5))" ) > 0 ); - assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); - assertEquals( 5, fpl.phraseList.get( 0 ).getEndOffset() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertTrue( fpl.getPhraseList().get( 0 ).toString().indexOf( "(1.0)((3,5))" ) > 0 ); + assertEquals( 3, fpl.getPhraseList().get( 0 ).getStartOffset() ); + assertEquals( 5, fpl.getPhraseList().get( 0 ).getEndOffset() ); } public void testFieldPhraseListIndex2w1wSearch1term() throws Exception { @@ -190,10 +190,10 @@ FieldQuery fq = new FieldQuery( tq( "pc" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "pc(1.0)((3,20))", fpl.phraseList.get( 0 ).toString() ); - assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); - assertEquals( 20, fpl.phraseList.get( 0 ).getEndOffset() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "pc(1.0)((3,20))", fpl.getPhraseList().get( 0 ).toString() ); + assertEquals( 3, fpl.getPhraseList().get( 0 ).getStartOffset() ); + assertEquals( 20, fpl.getPhraseList().get( 0 ).getEndOffset() ); } public void testFieldPhraseListIndex2w1wSearch1phrase() throws Exception { @@ -202,10 +202,10 @@ FieldQuery fq = new FieldQuery( pqF( "personal", "computer" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "personalcomputer(1.0)((3,20))", fpl.phraseList.get( 0 ).toString() ); - assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); - assertEquals( 20, fpl.phraseList.get( 0 ).getEndOffset() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "personalcomputer(1.0)((3,20))", fpl.getPhraseList().get( 0 ).toString() ); + assertEquals( 3, fpl.getPhraseList().get( 0 ).getStartOffset() ); 
+ assertEquals( 20, fpl.getPhraseList().get( 0 ).getEndOffset() ); } public void testFieldPhraseListIndex2w1wSearch1partial() throws Exception { @@ -214,10 +214,10 @@ FieldQuery fq = new FieldQuery( tq( "computer" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertEquals( "computer(1.0)((3,20))", fpl.phraseList.get( 0 ).toString() ); - assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); - assertEquals( 20, fpl.phraseList.get( 0 ).getEndOffset() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertEquals( "computer(1.0)((3,20))", fpl.getPhraseList().get( 0 ).toString() ); + assertEquals( 3, fpl.getPhraseList().get( 0 ).getStartOffset() ); + assertEquals( 20, fpl.getPhraseList().get( 0 ).getEndOffset() ); } public void testFieldPhraseListIndex2w1wSearch1term1phrase() throws Exception { @@ -229,10 +229,10 @@ FieldQuery fq = new FieldQuery( bq, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); - assertEquals( 1, fpl.phraseList.size() ); - assertTrue( fpl.phraseList.get( 0 ).toString().indexOf( "(1.0)((3,20))" ) > 0 ); - assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); - assertEquals( 20, fpl.phraseList.get( 0 ).getEndOffset() ); + assertEquals( 1, fpl.getPhraseList().size() ); + assertTrue( fpl.getPhraseList().get( 0 ).toString().indexOf( "(1.0)((3,20))" ) > 0 ); + assertEquals( 3, fpl.getPhraseList().get( 0 ).getStartOffset() ); + assertEquals( 20, fpl.getPhraseList().get( 0 ).getEndOffset() ); } private void makeIndex1w() throws Exception { Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java 
(revision 1188460) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java (working copy) @@ -45,7 +45,7 @@ } /** - * a constructor. Using {@link SimpleFragListBuilder} and {@link ScoreOrderFragmentsBuilder}. + * a constructor. Using {@link FragListBuilder} and {@link ScoreOrderFragmentsBuilder}. * * @param phraseHighlight true or false for phrase highlighting * @param fieldMatch true of false for field matching Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java (revision 0) @@ -0,0 +1,82 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; + +import org.apache.lucene.search.vectorhighlight.FieldFragList.FieldFragInfo.SubInfo; +import org.apache.lucene.search.vectorhighlight.FieldPhraseList.FieldPhraseInfo; +import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; + +/** + * A weighted implementation of {@link FieldFragList}. + */ +public class WeightedFieldFragList extends FieldFragList { + + /** + * a constructor. + * + * @param fragCharSize the length (number of chars) of a fragment + */ + public WeightedFieldFragList( int fragCharSize ) { + super( fragCharSize ); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.vectorhighlight.FieldFragList#add( int startOffset, int endOffset, List phraseInfoList ) + */ + @Override + public void add( int startOffset, int endOffset, List phraseInfoList ) { + + float score = 0; + + List subInfos = new ArrayList(); + + HashSet distinctTerms = new HashSet(); + + int length = 0; + + for( FieldPhraseInfo phraseInfo : phraseInfoList ){ + + subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffset(), phraseInfo.getSeqnum() ) ); + + Iterator it = phraseInfo.getTermsInfos().iterator(); + TermInfo ti; + + while ( it.hasNext() ) { + ti = ( TermInfo ) it.next(); + if ( distinctTerms.add( ti.getText() ) ) + // We take the Math.pow(ti.weight, 2) here to boost important (un-common) terms a little bit. + score += Math.pow( ti.getWeight(), 2 ) * phraseInfo.getBoost(); + length++; + } + } + + // We want that terms per fragment (length) is included into the weight. Otherwise a one-word-query + // would cause an equal weight for all fragments regardless of how much words they contain. + // To avoid that fragments containing a high number of words possibly "outrank" more relevant fragments + // we "bend" the length with a standard-normalization a little bit. 
+ score *= length * ( 1 / Math.sqrt( length ) ); + + getFragInfos().add( new FieldFragInfo( startOffset, endOffset, subInfos, score ) ); + } + +} \ No newline at end of file Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (revision 1188460) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (working copy) @@ -18,7 +18,9 @@ import java.io.IOException; import java.util.Collections; +import java.util.HashMap; import java.util.LinkedList; +import java.util.Map; import java.util.Set; import org.apache.lucene.index.IndexReader; @@ -85,7 +87,12 @@ catch( ClassCastException e ){ return; // just return to make null snippets } + + Map lookup = new HashMap(); + int numDocs = reader.numDocs() - reader.numDeletedDocs(); + float weight = 0; + final CharsRef spare = new CharsRef(); for( BytesRef term : tpv.getTerms() ){ if( !termSet.contains( term.utf8ToChars(spare).toString() ) ) continue; @@ -94,8 +101,17 @@ if( tvois == null ) return; // just return to make null snippets int[] poss = tpv.getTermPositions( index ); if( poss == null ) return; // just return to make null snippets + + // We don't want to retrieve docFreq every time. 
+ if ( lookup.containsKey( term ) ) + weight = lookup.get( term ).floatValue(); + else { + weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, term ) + 1 ) ) + 1.0 ); + lookup.put( term, new Float( weight ) ); + } + for( int i = 0; i < tvois.length; i++ ) - termList.add( new TermInfo( term.utf8ToChars(spare).toString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) ); + termList.add( new TermInfo( term.utf8ToChars(spare).toString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i], weight ) ); } // sort by position @@ -134,32 +150,42 @@ public static class TermInfo implements Comparable{ - final String text; - final int startOffset; - final int endOffset; - final int position; + private final String text; + private final int startOffset; + private final int endOffset; + private final int position; + private final float weight; - TermInfo( String text, int startOffset, int endOffset, int position ){ + public TermInfo( String text, int startOffset, int endOffset, int position, float weight ){ this.text = text; this.startOffset = startOffset; this.endOffset = endOffset; this.position = position; + this.weight = weight; } + // Since the test-case failed here's another constructor. 
+ public TermInfo( String text, int startOffset, int endOffset, int position ){ + this(text, startOffset, endOffset, position, 1); + } + public String getText(){ return text; } public int getStartOffset(){ return startOffset; } public int getEndOffset(){ return endOffset; } public int getPosition(){ return position; } + public float getWeight(){ return weight; } @Override public String toString(){ StringBuilder sb = new StringBuilder(); - sb.append( text ).append( '(' ).append(startOffset).append( ',' ).append( endOffset ).append( ',' ).append( position ).append( ')' ); + sb.append( text ).append( '^' ).append( weight ).append( '(' ).append(startOffset).append( ',' ).append( endOffset ).append( ',' ).append( position ).append( ')' ); return sb.toString(); } public int compareTo( TermInfo o ) { return ( this.position - o.position ); } + } + } Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java (revision 0) @@ -0,0 +1,32 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A weighted implementation of {@link FragListBuilder}. + */ +public class WeightedFragListBuilder extends BaseFragListBuilder { + + /* (non-Javadoc) + * @see org.apache.lucene.search.vectorhighlight.FragListBuilder#createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) + */ + public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ){ + return createFieldFragList( fieldPhraseList, new WeightedFieldFragList( fragCharSize ), fragCharSize ); + } + +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java (revision 1188460) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java (working copy) @@ -19,7 +19,7 @@ import java.util.List; -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; +import org.apache.lucene.search.vectorhighlight.FieldFragList.FieldFragInfo; /** * A simple implementation of FragmentsBuilder. @@ -56,7 +56,7 @@ * do nothing. return the source list. 
*/ @Override - public List getWeightedFragInfoList( List src ) { + public List getWeightedFragInfoList( List src ) { return src; } } Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java (revision 0) @@ -0,0 +1,92 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.lucene.search.vectorhighlight.FieldPhraseList.FieldPhraseInfo; + +/** + * A abstract implementation of {@link FragListBuilder}. 
+ */ +public abstract class BaseFragListBuilder implements FragListBuilder { + + public static final int MARGIN_DEFAULT = 6; + public static final int MIN_FRAG_CHAR_SIZE_FACTOR = 3; + + final int margin; + final int minFragCharSize; + + public BaseFragListBuilder( int margin ){ + if( margin < 0 ) + throw new IllegalArgumentException( "margin(" + margin + ") is too small. It must be 0 or higher." ); + + this.margin = margin; + this.minFragCharSize = Math.max( 1, margin * MIN_FRAG_CHAR_SIZE_FACTOR ); + } + + public BaseFragListBuilder(){ + this( MARGIN_DEFAULT ); + } + + protected FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize ){ + List wpil = new ArrayList(); + Iterator ite = fieldPhraseList.getPhraseList().iterator(); + FieldPhraseInfo phraseInfo = null; + int startOffset = 0; + boolean taken = false; + while( true ){ + if( !taken ){ + if( !ite.hasNext() ) break; + phraseInfo = ite.next(); + } + taken = false; + if( phraseInfo == null ) break; + + // if the phrase violates the border of previous fragment, discard it and try next phrase + if( phraseInfo.getStartOffset() < startOffset ) continue; + + wpil.clear(); + wpil.add( phraseInfo ); + int st = phraseInfo.getStartOffset() - margin < startOffset ? 
+ startOffset : phraseInfo.getStartOffset() - margin; + int en = st + fragCharSize; + if( phraseInfo.getEndOffset() > en ) + en = phraseInfo.getEndOffset(); + startOffset = en; + + while( true ){ + if( ite.hasNext() ){ + phraseInfo = ite.next(); + taken = true; + if( phraseInfo == null ) break; + } + else + break; + if( phraseInfo.getEndOffset() <= en ) + wpil.add( phraseInfo ); + else + break; + } + fieldFragList.add( st, en, wpil ); + } + return fieldFragList; + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html (revision 1188460) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html (working copy) @@ -27,7 +27,7 @@
  • support multi-term (includes wildcard, range, regexp, etc) queries
  • need Java 1.5
  • highlight fields need to be stored with Positions and Offsets
  • -
  • take into account query boost to score fragments
  • +
  • take into account query boost and/or IDF-weight to score fragments
  • support colored highlight tags
  • pluggable FragListBuilder
  • pluggable FragmentsBuilder
  • @@ -121,10 +121,12 @@ |"search library"|[(12,18),(26,33)]|w=1| +----------------+-----------------+---+ -

    The type of each entry is WeightedPhraseInfo that consists of -an array of terms offsets and weight. The weight (Fast Vector Highlighter uses query boost to -calculate the weight) will be taken into account when Fast Vector Highlighter creates -{@link org.apache.lucene.search.vectorhighlight.FieldFragList} in the next step.

    +

    The type of each entry is FieldPhraseInfo that consists of +an array of Toffs (term offsets) and an array of TermInfo (term information). + + + +

    Step 4.

    In Step 4, Fast Vector Highlighter creates FieldFragList by reference to FieldPhraseList. In this sample case, the following @@ -134,9 +136,42 @@ +---------------------------------+ |"Lucene"[(0,6)] | |"search library"[(12,18),(26,33)]| -|totalBoost=3 | +|score=3 | +---------------------------------+ + +

    +The calculation of the score of each fragment depends on the implementation of FragListBuilder and FieldFragList. +Currently there are basically two approaches available: +

      +
    • SimpleFragListBuilder: sum-of-boosts-approach. The score is calculated by summing the query-boosts per term. By default a term is boosted by 1.0.
    • +
    • WeightedFragListBuilder: sum-of-distinct-weights-approach. The score is calculated by summing the IDF-weights of distinct terms.
    • +
    +

    Comparison of the two approaches:

    + + + + + + + + + + + + + + + + + + + +
    + query = das alte testament +
    Terms in fragment | sum-of-distinct-weights | sum-of-boosts
    das alte testament | 5.339621 | 3.0
    das alte testament | 5.339621 | 3.0
    das testament alte | 5.339621 | 3.0
    das alte testament | 5.339621 | 3.0
    das testament | 2.9455688 | 2.0
    das alte | 2.4759595 | 2.0
    das das das das | 1.5015357 | 4.0
    das das das | 1.3003681 | 3.0
    das das | 1.061746 | 2.0
    alte | 1.0 | 1.0
    alte | 1.0 | 1.0
    das | 0.7507678 | 1.0
    das | 0.7507678 | 1.0
    das | 0.7507678 | 1.0
    das | 0.7507678 | 1.0
    das | 0.7507678 | 1.0
    + +

    Step 5.

    In Step 5, by using FieldFragList and the field stored data, Fast Vector Highlighter creates highlighted snippets!

    Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java (revision 1188460) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java (working copy) @@ -20,16 +20,18 @@ import java.util.ArrayList; import java.util.List; -import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo; -import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs; +import org.apache.lucene.search.vectorhighlight.FieldPhraseList.FieldPhraseInfo; +import org.apache.lucene.search.vectorhighlight.FieldPhraseList.FieldPhraseInfo.Toffs; /** * FieldFragList has a list of "frag info" that is used by FragmentsBuilder class * to create fragments (snippets). */ -public class FieldFragList { +public abstract class FieldFragList { - private List fragInfos = new ArrayList(); + private List fragInfos = new ArrayList(); + + protected int fragCharSize; /** * a constructor. 
@@ -37,52 +39,47 @@ * @param fragCharSize the length (number of chars) of a fragment */ public FieldFragList( int fragCharSize ){ + this.fragCharSize = fragCharSize; } /** - * convert the list of WeightedPhraseInfo to WeightedFragInfo, then add it to the fragInfos + * convert the list of FieldPhraseInfo to FieldFragInfo, then add it to the fragInfos * * @param startOffset start offset of the fragment * @param endOffset end offset of the fragment - * @param phraseInfoList list of WeightedPhraseInfo objects + * @param phraseInfoList list of FieldPhraseInfo objects */ - public void add( int startOffset, int endOffset, List phraseInfoList ){ - fragInfos.add( new WeightedFragInfo( startOffset, endOffset, phraseInfoList ) ); - } + public abstract void add( int startOffset, int endOffset, List phraseInfoList ); /** - * return the list of WeightedFragInfos. + * return the list of FieldFragInfos. * * @return fragInfos. - */ - public List getFragInfos() { + */ + public List getFragInfos() { return fragInfos; } - public static class WeightedFragInfo { + public static class FieldFragInfo { - List subInfos; - float totalBoost; - int startOffset; - int endOffset; + private List subInfos; + private float score; + private int startOffset; + private int endOffset; - public WeightedFragInfo( int startOffset, int endOffset, List phraseInfoList ){ + public FieldFragInfo( int startOffset, int endOffset, List subInfos, float score ){ this.startOffset = startOffset; this.endOffset = endOffset; - subInfos = new ArrayList(); - for( WeightedPhraseInfo phraseInfo : phraseInfoList ){ - SubInfo subInfo = new SubInfo( phraseInfo.text, phraseInfo.termsOffsets, phraseInfo.seqnum ); - subInfos.add( subInfo ); - totalBoost += phraseInfo.boost; - } + this.subInfos = subInfos; + this.score = score; } public List getSubInfos(){ return subInfos; } - public float getTotalBoost(){ - return totalBoost; + public float getScore(){ + return score; } public int getStartOffset(){ @@ -99,17 +96,22 @@ sb.append( 
"subInfos=(" ); for( SubInfo si : subInfos ) sb.append( si.toString() ); - sb.append( ")/" ).append( totalBoost ).append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' ); + sb.append( ")/" ).append( score ).append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' ); return sb.toString(); } public static class SubInfo { - final String text; // unnecessary member, just exists for debugging purpose - final List termsOffsets; // usually termsOffsets.size() == 1, - // but if position-gap > 1 and slop > 0 then size() could be greater than 1 - int seqnum; + + // unnecessary member, just exists for debugging purpose + private final String text; + + // usually termsOffsets.size() == 1, + // but if position-gap > 1 and slop > 0 then size() could be greater than 1 + private final List termsOffsets; + + private int seqnum; - SubInfo( String text, List termsOffsets, int seqnum ){ + public SubInfo( String text, List termsOffsets, int seqnum ){ this.text = text; this.termsOffsets = termsOffsets; this.seqnum = seqnum; @@ -123,6 +125,10 @@ return seqnum; } + public String getText(){ + return text; + } + @Override public String toString(){ StringBuilder sb = new StringBuilder(); @@ -134,4 +140,5 @@ } } } + } Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (revision 1188460) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (working copy) @@ -1,4 +1,5 @@ package org.apache.lucene.search.vectorhighlight; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -24,12 +25,12 @@ import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; /** - * FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder + * FieldPhraseList has a list of FieldPhraseInfo that is used by FragListBuilder * to create a FieldFragList object. */ public class FieldPhraseList { - LinkedList phraseList = new LinkedList(); + private LinkedList phraseList = new LinkedList(); /** * create a FieldPhraseList that has no limit on the number of phrases to analyze @@ -38,7 +39,7 @@ * @param fieldQuery FieldQuery object */ public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery){ - this (fieldTermStack, fieldQuery, Integer.MAX_VALUE); + this( fieldTermStack, fieldQuery, Integer.MAX_VALUE ); } /** @@ -54,8 +55,7 @@ LinkedList phraseCandidate = new LinkedList(); QueryPhraseMap currMap = null; QueryPhraseMap nextMap = null; - while( !fieldTermStack.isEmpty() && (phraseList.size() < phraseLimit) ) - { + while( !fieldTermStack.isEmpty() && (getPhraseList().size() < phraseLimit) ){ phraseCandidate.clear(); TermInfo ti = fieldTermStack.pop(); @@ -75,14 +75,14 @@ if( ti != null ) fieldTermStack.push( ti ); if( currMap.isValidTermOrPhrase( phraseCandidate ) ){ - addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); + addIfNoOverlap( new FieldPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); } else{ while( phraseCandidate.size() > 1 ){ fieldTermStack.push( phraseCandidate.removeLast() ); currMap = fieldQuery.searchPhrase( field, phraseCandidate ); if( currMap != null ){ - addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); + addIfNoOverlap( new FieldPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); break; } } @@ -96,30 +96,49 @@ } } } - - void addIfNoOverlap( WeightedPhraseInfo wpi 
){ - for( WeightedPhraseInfo existWpi : phraseList ){ - if( existWpi.isOffsetOverlap( wpi ) ) return; + + public void addIfNoOverlap( FieldPhraseInfo wpi ){ + for( FieldPhraseInfo existWpi : getPhraseList() ){ + if( existWpi.isOffsetOverlap( wpi ) ) { + existWpi.getTermsInfos().addAll( wpi.getTermsInfos() ); + return; + } } - phraseList.add( wpi ); + getPhraseList().add( wpi ); } - - public static class WeightedPhraseInfo { - String text; // unnecessary member, just exists for debugging purpose - List termsOffsets; // usually termsOffsets.size() == 1, - // but if position-gap > 1 and slop > 0 then size() could be greater than 1 - float boost; // query boost - int seqnum; + /** + * @return the phraseList + */ + public LinkedList getPhraseList() { + return phraseList; + } + + public static class FieldPhraseInfo { + + // unnecessary member, just exists for debugging purpose + private String text; - public WeightedPhraseInfo( LinkedList terms, float boost ){ + // usually termsOffsets.size() == 1, + // but if position-gap > 1 and slop > 0 then size() could be greater than 1 + private List termsOffsets; + + // Term-info + private List termsInfos; + + // query boost + private float boost; + private int seqnum; + + public FieldPhraseInfo( LinkedList terms, float boost ){ this( terms, boost, 0 ); } - public WeightedPhraseInfo( LinkedList terms, float boost, int number ){ + public FieldPhraseInfo( LinkedList terms, float boost, int number ){ this.boost = boost; this.seqnum = number; termsOffsets = new ArrayList( terms.size() ); + termsInfos = new ArrayList( terms ); TermInfo ti = terms.get( 0 ); termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) ); if( terms.size() == 1 ){ @@ -131,7 +150,7 @@ int pos = ti.getPosition(); for( int i = 1; i < terms.size(); i++ ){ ti = terms.get( i ); - sb.append( ti.getText() ); + sb.append( ti.getText() + " " ); if( ti.getPosition() - pos == 1 ){ Toffs to = termsOffsets.get( termsOffsets.size() - 1 ); to.setEndOffset( 
ti.getEndOffset() ); @@ -143,7 +162,23 @@ } text = sb.toString(); } + + public List getTermsInfos() { + return termsInfos; + } + + public float getBoost(){ + return boost; + } + public String getText(){ + return text; + } + + public List getTermsOffset(){ + return termsOffsets; + } + public int getStartOffset(){ return termsOffsets.get( 0 ).startOffset; } @@ -151,8 +186,12 @@ public int getEndOffset(){ return termsOffsets.get( termsOffsets.size() - 1 ).endOffset; } + + public int getSeqnum(){ + return seqnum; + } - public boolean isOffsetOverlap( WeightedPhraseInfo other ){ + public boolean isOffsetOverlap( FieldPhraseInfo other ){ int so = getStartOffset(); int eo = getEndOffset(); int oso = other.getStartOffset(); @@ -167,30 +206,40 @@ @Override public String toString(){ StringBuilder sb = new StringBuilder(); - sb.append( text ).append( '(' ).append( boost ).append( ")(" ); + sb.append( text.trim() ).append( '(' ).append( boost ).append( ")(" ); for( Toffs to : termsOffsets ){ sb.append( to ); } + sb.append( ")(" ); + for( TermInfo ti : termsInfos ){ + sb.append( ti ); + } sb.append( ')' ); return sb.toString(); } public static class Toffs { - int startOffset; - int endOffset; + + private int startOffset; + private int endOffset; + public Toffs( int startOffset, int endOffset ){ this.startOffset = startOffset; this.endOffset = endOffset; } + public void setEndOffset( int endOffset ){ this.endOffset = endOffset; } + public int getStartOffset(){ return startOffset; } + public int getEndOffset(){ return endOffset; } + @Override public String toString(){ StringBuilder sb = new StringBuilder(); @@ -198,5 +247,7 @@ return sb.toString(); } } + } + } Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SingleFragListBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SingleFragListBuilder.java (revision 1188460) +++ 
lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SingleFragListBuilder.java (working copy) @@ -21,11 +21,11 @@ import java.util.Iterator; import java.util.List; -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; -import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo; +import org.apache.lucene.search.vectorhighlight.FieldFragList.FieldFragInfo; +import org.apache.lucene.search.vectorhighlight.FieldPhraseList.FieldPhraseInfo; /** - * An implementation class of {@link FragListBuilder} that generates one {@link WeightedFragInfo} object. + * An implementation class of {@link FragListBuilder} that generates one {@link FieldFragInfo} object. * Typical use case of this class is that you can get an entire field contents * by using both of this class and {@link SimpleFragmentsBuilder}.
    *
    @@ -38,11 +38,11 @@
       public FieldFragList createFieldFragList(FieldPhraseList fieldPhraseList,
           int fragCharSize) {
     
    -    FieldFragList ffl = new FieldFragList( fragCharSize );
    +    FieldFragList ffl = new SimpleFieldFragList( fragCharSize );
     
    -    List wpil = new ArrayList();
    -    Iterator ite = fieldPhraseList.phraseList.iterator();
    -    WeightedPhraseInfo phraseInfo = null;
    +    List wpil = new ArrayList();
    +    Iterator ite = fieldPhraseList.getPhraseList().iterator();
    +    FieldPhraseInfo phraseInfo = null;
         while( true ){
           if( !ite.hasNext() ) break;
           phraseInfo = ite.next();
    Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java
    ===================================================================
    --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java	(revision 1188460)
    +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java	(working copy)
    @@ -21,7 +21,7 @@
     import java.util.Comparator;
     import java.util.List;
     
    -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
    +import org.apache.lucene.search.vectorhighlight.FieldFragList.FieldFragInfo;
     
     /**
      * An implementation of FragmentsBuilder that outputs score-order fragments.
    @@ -57,20 +57,20 @@
        * Sort by score the list of WeightedFragInfo
        */
       @Override
    -  public List getWeightedFragInfoList( List src ) {
    +  public List getWeightedFragInfoList( List src ) {
         Collections.sort( src, new ScoreComparator() );
         return src;
       }
    +  
    +  public static class ScoreComparator implements Comparator {
     
    -  public static class ScoreComparator implements Comparator {
    -
    -    public int compare( WeightedFragInfo o1, WeightedFragInfo o2 ) {
    -      if( o1.totalBoost > o2.totalBoost ) return -1;
    -      else if( o1.totalBoost < o2.totalBoost ) return 1;
    +    public int compare( FieldFragInfo o1, FieldFragInfo o2 ) {
    +      if( o1.getScore() > o2.getScore() ) return -1;
    +      else if( o1.getScore() < o2.getScore() ) return 1;
           // if same score then check startOffset
           else{
    -        if( o1.startOffset < o2.startOffset ) return -1;
    -        else if( o1.startOffset > o2.startOffset ) return 1;
    +        if( o1.getStartOffset() < o2.getStartOffset() ) return -1;
    +        else if( o1.getStartOffset() > o2.getStartOffset() ) return 1;
           }
           return 0;
         }
    Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
    ===================================================================
    --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java	(revision 1188460)
    +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java	(working copy)
    @@ -29,9 +29,9 @@
     import org.apache.lucene.index.StoredFieldVisitor;
     import org.apache.lucene.search.highlight.DefaultEncoder;
     import org.apache.lucene.search.highlight.Encoder;
    -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
    -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
    -import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
    +import org.apache.lucene.search.vectorhighlight.FieldFragList.FieldFragInfo.SubInfo;
    +import org.apache.lucene.search.vectorhighlight.FieldFragList.FieldFragInfo;
    +import org.apache.lucene.search.vectorhighlight.FieldPhraseList.FieldPhraseInfo.Toffs;
     import org.apache.lucene.store.IndexInput;
     
     public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
    @@ -74,7 +74,7 @@
         throw new IllegalArgumentException( "type of preTags/postTags must be a String or String[]" );
       }
       
    -  public abstract List getWeightedFragInfoList( List src );
    +  public abstract List getWeightedFragInfoList( List src );
     
       private static final Encoder NULL_ENCODER = new DefaultEncoder();
       
    @@ -106,7 +106,7 @@
         if( maxNumFragments < 0 )
           throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." );
     
    -    List fragInfos = getWeightedFragInfoList( fieldFragList.getFragInfos() );
    +    List fragInfos = getWeightedFragInfoList( fieldFragList.getFragInfos() );
         
         List fragments = new ArrayList( maxNumFragments );
         Field[] values = getFields( reader, docId, fieldName );
    @@ -114,7 +114,7 @@
         StringBuilder buffer = new StringBuilder();
         int[] nextValueIndex = { 0 };
         for( int n = 0; n < maxNumFragments && n < fragInfos.size(); n++ ){
    -      WeightedFragInfo fragInfo = fragInfos.get( n );
    +      FieldFragInfo fragInfo = fragInfos.get( n );
           fragments.add( makeFragment( buffer, nextValueIndex, values, fragInfo, preTags, postTags, encoder ) );
         }
         return fragments.toArray( new String[fragments.size()] );
    @@ -143,7 +143,7 @@
         return fields.toArray(new Field[fields.size()]);
       }
     
    -  protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo,
    +  protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, FieldFragInfo fragInfo,
           String[] preTags, String[] postTags, Encoder encoder ){
         StringBuilder fragment = new StringBuilder();
         final int s = fragInfo.getStartOffset();
    Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java
    ===================================================================
    --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java	(revision 0)
    +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java	(revision 0)
    @@ -0,0 +1,53 @@
    +package org.apache.lucene.search.vectorhighlight;
    +
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +import java.util.ArrayList;
    +import java.util.List;
    +
    +import org.apache.lucene.search.vectorhighlight.FieldFragList.FieldFragInfo.SubInfo;
    +import org.apache.lucene.search.vectorhighlight.FieldPhraseList.FieldPhraseInfo;
    +
    +/**
    + * A simple implementation of {@link FieldFragList}.
    + */
    +public class SimpleFieldFragList extends FieldFragList {
    +
    +  /**
    +   * a constructor.
    +   * 
    +   * @param fragCharSize the length (number of chars) of a fragment
    +   */
    +  public SimpleFieldFragList( int fragCharSize ) {
    +    super( fragCharSize );
    +  }
    +
    +  /* (non-Javadoc)
    +   * @see org.apache.lucene.search.vectorhighlight.FieldFragList#add( int startOffset, int endOffset, List phraseInfoList )
    +   */
    +  @Override
    +  public void add( int startOffset, int endOffset, List phraseInfoList ) {
    +    float score = 0;
    +    List subInfos = new ArrayList();
    +    for( FieldPhraseInfo phraseInfo : phraseInfoList ){
    +      subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffset(), phraseInfo.getSeqnum() ) );
    +      score += phraseInfo.getBoost();
    +    }
    +    getFragInfos().add( new FieldFragInfo( startOffset, endOffset, subInfos, score ) );
    +  }
    +}
    Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java
    ===================================================================
    --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java	(revision 1188460)
    +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java	(working copy)
    @@ -17,83 +17,15 @@
      * limitations under the License.
      */
     
    -import java.util.ArrayList;
    -import java.util.Iterator;
    -import java.util.List;
    -
    -import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
    -
     /**
      * A simple implementation of {@link FragListBuilder}.
      */
    -public class SimpleFragListBuilder implements FragListBuilder {
    +public class SimpleFragListBuilder extends BaseFragListBuilder {
       
    -  public static final int MARGIN_DEFAULT = 6;
    -  public static final int MIN_FRAG_CHAR_SIZE_FACTOR = 3;
    -
    -  final int margin;
    -  final int minFragCharSize;
    -
    -  public SimpleFragListBuilder( int margin ){
    -    if( margin < 0 )
    -      throw new IllegalArgumentException( "margin(" + margin + ") is too small. It must be 0 or higher." );
    -
    -    this.margin = margin;
    -    this.minFragCharSize = Math.max( 1, margin * MIN_FRAG_CHAR_SIZE_FACTOR );
    +  /* (non-Javadoc)
    +   * @see org.apache.lucene.search.vectorhighlight.FragListBuilder#createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize)
    +   */ 
    +  public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ){
    +    return createFieldFragList( fieldPhraseList, new SimpleFieldFragList( fragCharSize ), fragCharSize );
       }
    -
    -  public SimpleFragListBuilder(){
    -    this( MARGIN_DEFAULT );
    -  }
    -
    -  public FieldFragList createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) {
    -    if( fragCharSize < minFragCharSize )
    -      throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " +
    -          minFragCharSize + " or higher." );
    -
    -    FieldFragList ffl = new FieldFragList( fragCharSize );
    -
    -    List wpil = new ArrayList();
    -    Iterator ite = fieldPhraseList.phraseList.iterator();
    -    WeightedPhraseInfo phraseInfo = null;
    -    int startOffset = 0;
    -    boolean taken = false;
    -    while( true ){
    -      if( !taken ){
    -        if( !ite.hasNext() ) break;
    -        phraseInfo = ite.next();
    -      }
    -      taken = false;
    -      if( phraseInfo == null ) break;
    -
    -      // if the phrase violates the border of previous fragment, discard it and try next phrase
    -      if( phraseInfo.getStartOffset() < startOffset ) continue;
    -
    -      wpil.clear();
    -      wpil.add( phraseInfo );
    -      int st = phraseInfo.getStartOffset() - margin < startOffset ?
    -          startOffset : phraseInfo.getStartOffset() - margin;
    -      int en = st + fragCharSize;
    -      if( phraseInfo.getEndOffset() > en )
    -        en = phraseInfo.getEndOffset();
    -      startOffset = en;
    -
    -      while( true ){
    -        if( ite.hasNext() ){
    -          phraseInfo = ite.next();
    -          taken = true;
    -          if( phraseInfo == null ) break;
    -        }
    -        else
    -          break;
    -        if( phraseInfo.getEndOffset() <= en )
    -          wpil.add( phraseInfo );
    -        else
    -          break;
    -      }
    -      ffl.add( st, en, wpil );
    -    }
    -    return ffl;
    -  }
    -
     }