Index: contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/WhitespaceFragmentsBuilder.java
===================================================================
--- contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/WhitespaceFragmentsBuilder.java (revision 0)
+++ contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/WhitespaceFragmentsBuilder.java (revision 0)
@@ -0,0 +1,128 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.List;
+
+import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
+import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
+import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
+
+/**
+ * An implementation of FragmentsBuilder that looks for whitespace or the beginning or end of the
+ * source text for fragment boundaries, to avoid truncating words at the edges.
+ * <p>
+ * The whitespace character found at a boundary is kept as part of the fragment, and the offsets
+ * of the {@link WeightedFragInfo} being rendered are updated in place to the widened boundaries.
+ */
+public class WhitespaceFragmentsBuilder extends BaseFragmentsBuilder {
+
+  /**
+   * a constructor.
+   */
+  public WhitespaceFragmentsBuilder(){
+    super();
+  }
+
+  /**
+   * a constructor.
+   *
+   * @param preTags array of pre-tags for markup terms.
+   * @param postTags array of post-tags for markup terms.
+   */
+  public WhitespaceFragmentsBuilder( String[] preTags, String[] postTags ){
+    super( preTags, postTags );
+  }
+
+  /**
+   * do nothing. return the source list.
+   *
+   * @param src the list of fragment infos.
+   * @return the list that was passed in, unchanged.
+   */
+  public List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src ){
+    return src;
+  }
+
+  /**
+   * Builds the marked-up text of one fragment: the fragment source with every term offset range
+   * wrapped in the pre-tag and post-tag selected by the seqnum of the SubInfo it belongs to.
+   *
+   * @param buffer the field text loaded so far; more values are appended to it on demand.
+   * @param index index[0] is the position in values of the next value to append to buffer.
+   * @param values the values of the field.
+   * @param fragInfo the fragment to render; its offsets are widened to whitespace boundaries.
+   * @return the fragment text with the tags inserted.
+   */
+  protected String makeFragment( StringBuilder buffer, int[] index, String[] values, WeightedFragInfo fragInfo ){
+    // Fetch the source first: getFragmentSource() may move fragInfo.startOffset, and the term
+    // offsets below have to be translated relative to the adjusted start.
+    final String src = getFragmentSource( buffer, index, values, fragInfo );
+    final int s = fragInfo.startOffset;
+    final StringBuilder fragment = new StringBuilder();
+    int srcIndex = 0;
+    for( SubInfo subInfo : fragInfo.subInfos ){
+      for( Toffs to : subInfo.termsOffsets ){
+        fragment.append( src.substring( srcIndex, to.startOffset - s ) ).append( getPreTag( subInfo.seqnum ) )
+          .append( src.substring( to.startOffset - s, to.endOffset - s ) ).append( getPostTag( subInfo.seqnum ) );
+        srcIndex = to.endOffset - s;
+      }
+    }
+    fragment.append( src.substring( srcIndex ) );
+    return fragment.toString();
+  }
+
+  /**
+   * Returns the source text of a fragment after widening it to whitespace boundaries.
+   * <p>
+   * The start offset is moved backwards until it points at a whitespace character or at the
+   * beginning of the text, and the end offset is moved forwards until the character before it
+   * is whitespace or the end of the loaded text is reached. The adjusted offsets are written
+   * back to fragInfo.
+   *
+   * @param buffer the field text loaded so far; values are appended (with a single space in
+   *        front of each non-empty value after the first) until it covers the fragment.
+   * @param index index[0] is the position in values of the next value to append to buffer.
+   * @param values the values of the field.
+   * @param fragInfo the fragment whose text is wanted; its offsets are updated in place.
+   * @return the text between the adjusted offsets, cut at the end of the loaded text.
+   */
+  protected String getFragmentSource( StringBuilder buffer, int[] index, String[] values, WeightedFragInfo fragInfo ){
+    while( buffer.length() < fragInfo.endOffset && index[0] < values.length ){
+      if( index[0] > 0 && values[index[0]].length() > 0 ){
+        buffer.append( ' ' );
+      }
+      buffer.append( values[index[0]++] );
+    }
+    final int length = buffer.length();
+
+    // The range checks keep charAt() inside the buffer when the loaded text ends exactly at
+    // the start offset, or when the end offset is zero.
+    int start = fragInfo.startOffset;
+    while( start > 0 && start < length && !Character.isWhitespace( buffer.charAt( start ) ) ){
+      start--;
+    }
+    int end = fragInfo.endOffset;
+    while( end > 0 && end < length && !Character.isWhitespace( buffer.charAt( end - 1 ) ) ){
+      end++;
+    }
+    fragInfo.startOffset = start;
+    fragInfo.endOffset = end;
+    return buffer.substring( start, Math.min( end, length ) );
+  }
+}
Index: contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/WhitespaceFragmentsBuilderTest.java
===================================================================
--- contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/WhitespaceFragmentsBuilderTest.java (revision 0)
+++ contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/WhitespaceFragmentsBuilderTest.java (revision 0)
@@ -0,0 +1,130 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.search.Query;
+
+/**
+ * Tests for {@link WhitespaceFragmentsBuilder}.
+ */
+public class WhitespaceFragmentsBuilderTest extends AbstractTestCase {
+
+  public void test1TermIndex() throws Exception {
+    final FieldFragList fragList = ffl( "a", "a" );
+    WhitespaceFragmentsBuilder builder = new WhitespaceFragmentsBuilder();
+    assertEquals( "a", builder.createFragment( reader, 0, F, fragList ) );
+
+    // same frag list again, this time with explicit pre/post tags
+    builder = new WhitespaceFragmentsBuilder( new String[]{ "[" }, new String[]{ "]" } );
+    assertEquals( "[a]", builder.createFragment( reader, 0, F, fragList ) );
+  }
+
+  public void test2Frags() throws Exception {
+    final FieldFragList fragList = ffl( "a", "a b b b b b b b b b b b a b a b" );
+    final String[] frags = new WhitespaceFragmentsBuilder().createFragments( reader, 0, F, fragList, 3 );
+    // only 2 snippets are expected although 3 were requested
+    assertEquals( 2, frags.length );
+    assertEquals( "a b b b b b b b b b ", frags[0] );
+    assertEquals( " b b a b a b", frags[1] );
+  }
+
+  public void test3Frags() throws Exception {
+    final FieldFragList fragList = ffl( "a c", "a b b b b b b b b b b b a b a b b b b b c a a b b" );
+    final String[] frags = new WhitespaceFragmentsBuilder().createFragments( reader, 0, F, fragList, 3 );
+    assertEquals( 3, frags.length );
+    assertEquals( "a b b b b b b b b b ", frags[0] );
+    assertEquals( " b b a b a b b b b b ", frags[1] );
+    assertEquals( " c a a b b", frags[2] );
+  }
+
+  /**
+   * Builds the index from indexValue with make1d1fIndex(), parses queryValue with paW and
+   * returns the frag list created for the parsed query with a size argument of 20.
+   */
+  private FieldFragList ffl( String queryValue, String indexValue ) throws Exception {
+    make1d1fIndex( indexValue );
+    final Query parsed = paW.parse( queryValue );
+    return buildFragList( new FieldQuery( parsed, true, true ), 20 );
+  }
+
+  /**
+   * Runs the FieldTermStack, FieldPhraseList and SimpleFragListBuilder steps for document 0 of
+   * field F in the current reader.
+   *
+   * @param fieldQuery the query handed to FieldTermStack and FieldPhraseList.
+   * @param fragCharSize the second argument handed to createFieldFragList().
+   * @return the resulting frag list.
+   */
+  private FieldFragList buildFragList( FieldQuery fieldQuery, int fragCharSize ) throws Exception {
+    final FieldTermStack termStack = new FieldTermStack( reader, 0, F, fieldQuery );
+    final FieldPhraseList phraseList = new FieldPhraseList( termStack, fieldQuery );
+    return new SimpleFragListBuilder().createFieldFragList( phraseList, fragCharSize );
+  }
+
+  public void test1PhraseShortMV() throws Exception {
+    makeIndexShortMV();
+
+    final FieldFragList fragList = buildFragList( new FieldQuery( tq( "d" ), true, true ), 100 );
+    assertEquals( "a b c d e", new WhitespaceFragmentsBuilder().createFragment( reader, 0, F, fragList ) );
+  }
+
+  public void test1PhraseLongMV() throws Exception {
+    makeIndexLongMV();
+
+    final FieldFragList fragList = buildFragList( new FieldQuery( pqF( "search", "engines" ), true, true ), 100 );
+    assertEquals( " most search engines use only one of these methods. Even the search engines that says they can use the ",
+        new WhitespaceFragmentsBuilder().createFragment( reader, 0, F, fragList ) );
+  }
+
+  public void test1PhraseLongMVB() throws Exception {
+    makeIndexLongMVB();
+
+    // "speed" -(2gram)-> "sp","pe","ee","ed"
+    final FieldFragList fragList = buildFragList( new FieldQuery( pqF( "sp", "pe", "ee", "ed" ), true, true ), 100 );
+    assertEquals( " processing speed, the", new WhitespaceFragmentsBuilder().createFragment( reader, 0, F, fragList ) );
+  }
+
+  public void testUnstoredField() throws Exception {
+    makeUnstoredIndex();
+
+    final FieldFragList fragList = buildFragList( new FieldQuery( tq( "aaa" ), true, true ), 100 );
+    assertNull( new WhitespaceFragmentsBuilder().createFragment( reader, 0, F, fragList ) );
+  }
+
+  /**
+   * Indexes one document whose field F is analyzed with term vector positions and offsets but
+   * is not stored (Store.NO), then assigns a reader opened on dir to the reader field.
+   */
+  protected void makeUnstoredIndex() throws Exception {
+    final IndexWriter indexWriter = new IndexWriter( dir, analyzerW, true, MaxFieldLength.LIMITED );
+    final Document unstored = new Document();
+    unstored.add( new Field( F, "aaa", Store.NO, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
+    indexWriter.addDocument( unstored );
+    indexWriter.close();
+    reader = IndexReader.open( dir );
+  }
+}