Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java (revision 1165954) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java (working copy) @@ -35,9 +35,9 @@ String[] f = sofb.createFragments( reader, 0, F, ffl, 3 ); assertEquals( 3, f.length ); // check score order - assertEquals( "c a a b b ", f[0] ); - assertEquals( "b b a b a b b b b b ", f[1] ); - assertEquals( "a b b b b b b b b b ", f[2] ); + assertEquals( "c a a b b", f[0] ); + assertEquals( "b b a b a b b b b b c", f[1] ); + assertEquals( "a b b b b b b b b b b", f[2] ); } private FieldFragList ffl(Query query, String indexValue ) throws Exception { Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java (revision 1165954) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java (working copy) @@ -37,11 +37,11 @@ public void test1TermIndex() throws Exception { FieldFragList ffl = ffl(new TermQuery(new Term(F, "a")), "a" ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); - assertEquals( "a ", sfb.createFragment( reader, 0, F, ffl ) ); + assertEquals( "a", sfb.createFragment( reader, 0, F, ffl ) ); // change tags sfb = new SimpleFragmentsBuilder( new String[]{ "[" }, new String[]{ "]" } ); - assertEquals( "[a] ", sfb.createFragment( reader, 0, F, ffl ) ); + assertEquals( "[a]", sfb.createFragment( reader, 0, F, ffl ) ); } public void test2Frags() throws Exception { @@ -50,8 +50,8 @@ String[] f = 
sfb.createFragments( reader, 0, F, ffl, 3 ); // 3 snippets requested, but should be 2 assertEquals( 2, f.length ); - assertEquals( "a b b b b b b b b b ", f[0] ); - assertEquals( "b b a b a b ", f[1] ); + assertEquals( "a b b b b b b b b b b", f[0] ); + assertEquals( "b b a b a b", f[1] ); } public void test3Frags() throws Exception { @@ -63,8 +63,8 @@ SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); String[] f = sfb.createFragments( reader, 0, F, ffl, 3 ); assertEquals( 3, f.length ); - assertEquals( "a b b b b b b b b b ", f[0] ); - assertEquals( "b b a b a b b b b b ", f[1] ); + assertEquals( "a b b b b b b b b b b", f[0] ); + assertEquals( "b b a b a b b b b b c", f[1] ); assertEquals( "c a a b b ", f[2] ); } @@ -73,7 +73,7 @@ SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); String[] preTags = { "[" }; String[] postTags = { "]" }; - assertEquals( "<h1> [a] </h1> ", + assertEquals( "<h1> [a] </h1>", sfb.createFragment( reader, 0, F, ffl, preTags, postTags, new SimpleHTMLEncoder() ) ); } @@ -106,7 +106,7 @@ SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); - assertEquals( " most search engines use only one of these methods. Even the search engines that says they can use t", + assertEquals( "The most search engines use only one of these methods. 
Even the search engines that says they can use the", sfb.createFragment( reader, 0, F, ffl ) ); } @@ -119,7 +119,7 @@ SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); - assertEquals( "ssing speed, the ", sfb.createFragment( reader, 0, F, ffl ) ); + assertEquals( "processing speed, the", sfb.createFragment( reader, 0, F, ffl ) ); } public void testUnstoredField() throws Exception { Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java (revision 0) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java (revision 0) @@ -0,0 +1,38 @@ +package org.apache.lucene.search.vectorhighlight; + +import org.apache.lucene.util.LuceneTestCase; + +public class SimpleBoundaryScannerTest extends LuceneTestCase { + static final String TEXT = + "Apache Lucene(TM) is a high-performance, full-featured\ntext search engine library written entirely in Java."; + + public void testFindStartOffset() throws Exception { + StringBuilder text = new StringBuilder(TEXT); + BoundaryScanner scanner = new SimpleBoundaryScanner(); + + // test out of range + int start = TEXT.length() + 1; + assertEquals(start, scanner.findStartOffset(text, start)); + start = 0; + assertEquals(start, scanner.findStartOffset(text, start)); + + start = TEXT.indexOf("formance"); + int expected = TEXT.indexOf("high-performance"); + assertEquals(expected, scanner.findStartOffset(text, start)); + } + + public void testFindEndOffset() throws Exception { + StringBuilder text = new StringBuilder(TEXT); + BoundaryScanner scanner = new SimpleBoundaryScanner(); + + // test out of range + int start = TEXT.length() + 1; 
+ assertEquals(start, scanner.findEndOffset(text, start)); + start = -1; + assertEquals(start, scanner.findEndOffset(text, start)); + + start = TEXT.indexOf("full-"); + int expected = TEXT.indexOf("\ntext"); + assertEquals(expected, scanner.findEndOffset(text, start)); + } +} Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java (revision 0) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java (revision 0) @@ -0,0 +1,74 @@ +package org.apache.lucene.search.vectorhighlight; + +import java.text.BreakIterator; +import java.util.Locale; + +import org.apache.lucene.util.LuceneTestCase; + +public class BreakIteratorBoundaryScannerTest extends LuceneTestCase { + static final String TEXT = + "Apache Lucene(TM) is a high-performance, full-featured text search engine library written entirely in Java." + + "\nIt is a technology suitable for nearly any application that requires\n" + + "full-text search, especially cross-platform. 
\nApache Lucene is an open source project available for free download."; + + public void testOutOfRange() throws Exception { + StringBuilder text = new StringBuilder(TEXT); + BreakIterator bi = BreakIterator.getWordInstance(Locale.ENGLISH); + BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi); + + int start = TEXT.length() + 1; + assertEquals(start, scanner.findStartOffset(text, start)); + assertEquals(start, scanner.findEndOffset(text, start)); + start = 0; + assertEquals(start, scanner.findStartOffset(text, start)); + start = -1; + assertEquals(start, scanner.findEndOffset(text, start)); + } + + public void testWordBoundary() throws Exception { + StringBuilder text = new StringBuilder(TEXT); + BreakIterator bi = BreakIterator.getWordInstance(Locale.ENGLISH); + BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi); + + int start = TEXT.indexOf("formance"); + int expected = TEXT.indexOf("high-performance"); + testFindStartOffset(text, start, expected, scanner); + + expected = TEXT.indexOf(", full"); + testFindEndOffset(text, start, expected, scanner); + } + + public void testSentenceBoundary() throws Exception { + StringBuilder text = new StringBuilder(TEXT); + BreakIterator bi = BreakIterator.getSentenceInstance(); + BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi); + + int start = TEXT.indexOf("any application"); + int expected = TEXT.indexOf("It is a"); + testFindStartOffset(text, start, expected, scanner); + + expected = TEXT.indexOf("Apache Lucene is an open source"); + testFindEndOffset(text, start, expected, scanner); + } + + public void testLineBoundary() throws Exception { + StringBuilder text = new StringBuilder(TEXT); + BreakIterator bi = BreakIterator.getLineInstance(); + BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi); + + int start = TEXT.indexOf("any application"); + int expected = TEXT.indexOf("nearly"); + testFindStartOffset(text, start, expected, scanner); + + expected = TEXT.indexOf("application 
that requires"); + testFindEndOffset(text, start, expected, scanner); + } + + private void testFindStartOffset(StringBuilder text, int start, int expected, BoundaryScanner scanner) throws Exception { + assertEquals(expected, scanner.findStartOffset(text, start)); + } + + private void testFindEndOffset(StringBuilder text, int start, int expected, BoundaryScanner scanner) throws Exception { + assertEquals(expected, scanner.findEndOffset(text, start)); + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java (revision 0) @@ -0,0 +1,81 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +public class SimpleBoundaryScanner implements BoundaryScanner { + + public static final int DEFAULT_MAX_SCAN = 20; + public static final Character[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t', '\n'}; + + protected int maxScan; + protected Set boundaryChars; + + public SimpleBoundaryScanner(){ + this( DEFAULT_MAX_SCAN, DEFAULT_BOUNDARY_CHARS ); + } + + public SimpleBoundaryScanner( int maxScan ){ + this( maxScan, DEFAULT_BOUNDARY_CHARS ); + } + + public SimpleBoundaryScanner( Character[] boundaryChars ){ + this( DEFAULT_MAX_SCAN, boundaryChars ); + } + + public SimpleBoundaryScanner( int maxScan, Character[] boundaryChars ){ + this.maxScan = maxScan; + this.boundaryChars = new HashSet(); + this.boundaryChars.addAll(Arrays.asList(boundaryChars)); + } + + public SimpleBoundaryScanner( int maxScan, Set boundaryChars ){ + this.maxScan = maxScan; + this.boundaryChars = boundaryChars; + } + + public int findStartOffset(StringBuilder buffer, int start) { + // avoid illegal start offset + if( start > buffer.length() || start < 1 ) return start; + int offset, count = maxScan; + for( offset = start; offset > 0 && count > 0; count-- ){ + // found? + if( boundaryChars.contains( buffer.charAt( offset - 1 ) ) ) return offset; + offset--; + } + // not found + return start; + } + + public int findEndOffset(StringBuilder buffer, int start) { + // avoid illegal start offset + if( start > buffer.length() || start < 0 ) return start; + int offset, count = maxScan; + //for( offset = start; offset <= buffer.length() && count > 0; count-- ){ + for( offset = start; offset < buffer.length() && count > 0; count-- ){ + // found? 
+ if( boundaryChars.contains( buffer.charAt( offset ) ) ) return offset; + offset++; + } + // not found + return start; + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java (revision 0) @@ -0,0 +1,48 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.BreakIterator; + +/** + * A {@link BoundaryScanner} implementation that uses {@link BreakIterator} to find + * boundaries in the text. The kind of boundary (word, sentence, line, ...) is determined by the {@link BreakIterator} passed to the constructor. 
+ */ +public class BreakIteratorBoundaryScanner implements BoundaryScanner { + + final BreakIterator bi; + + public BreakIteratorBoundaryScanner(BreakIterator bi){ + this.bi = bi; + } + + public int findStartOffset(StringBuilder buffer, int start) { + // avoid illegal start offset + if( start > buffer.length() || start < 1 ) return start; + bi.setText(buffer.substring(0, start)); + bi.last(); + return bi.previous(); + } + + public int findEndOffset(StringBuilder buffer, int start) { + // avoid illegal start offset + if( start > buffer.length() || start < 0 ) return start; + bi.setText(buffer.substring(start)); + return bi.next() + start; + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java (revision 1165954) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java (working copy) @@ -48,14 +48,24 @@ }; public static final String[] COLORED_POST_TAGS = { "" }; private char multiValuedSeparator = ' '; + private final BoundaryScanner boundaryScanner; protected BaseFragmentsBuilder(){ this( new String[]{ "" }, new String[]{ "" } ); } protected BaseFragmentsBuilder( String[] preTags, String[] postTags ){ + this(preTags, postTags, new SimpleBoundaryScanner()); + } + + protected BaseFragmentsBuilder(BoundaryScanner boundaryScanner){ + this( new String[]{ "" }, new String[]{ "" }, boundaryScanner ); + } + + protected BaseFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner boundaryScanner ){ this.preTags = preTags; this.postTags = postTags; + this.boundaryScanner = boundaryScanner; } static Object checkTagsArgument( Object tags ){ @@ -135,28 +145,36 @@ protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo, String[] 
preTags, String[] postTags, Encoder encoder ){ - final int s = fragInfo.startOffset; - return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s, - preTags, postTags, encoder ); - } - - private String makeFragment( WeightedFragInfo fragInfo, String src, int s, - String[] preTags, String[] postTags, Encoder encoder ){ StringBuilder fragment = new StringBuilder(); + final int s = fragInfo.getStartOffset(); + int[] modifiedStartOffset = { s }; + String src = getFragmentSourceMSO( buffer, index, values, s, fragInfo.getEndOffset(), modifiedStartOffset ); int srcIndex = 0; - for( SubInfo subInfo : fragInfo.subInfos ){ - for( Toffs to : subInfo.termsOffsets ){ + for( SubInfo subInfo : fragInfo.getSubInfos() ){ + for( Toffs to : subInfo.getTermsOffsets() ){ fragment - .append( encoder.encodeText( src.substring( srcIndex, to.startOffset - s ) ) ) - .append( getPreTag( preTags, subInfo.seqnum ) ) - .append( encoder.encodeText( src.substring( to.startOffset - s, to.endOffset - s ) ) ) - .append( getPostTag( postTags, subInfo.seqnum ) ); - srcIndex = to.endOffset - s; + .append( encoder.encodeText( src.substring( srcIndex, to.getStartOffset() - modifiedStartOffset[0] ) ) ) + .append( getPreTag( preTags, subInfo.getSeqnum() ) ) + .append( encoder.encodeText( src.substring( to.getStartOffset() - modifiedStartOffset[0], to.getEndOffset() - modifiedStartOffset[0] ) ) ) + .append( getPostTag( postTags, subInfo.getSeqnum() ) ); + srcIndex = to.getEndOffset() - modifiedStartOffset[0]; } } fragment.append( encoder.encodeText( src.substring( srcIndex ) ) ); return fragment.toString(); } + + protected String getFragmentSourceMSO( StringBuilder buffer, int[] index, Field[] values, + int startOffset, int endOffset, int[] modifiedStartOffset ){ + while( buffer.length() < endOffset && index[0] < values.length ){ + if( index[0] > 0 && values[index[0]].tokenized() && values[index[0]].stringValue().length() > 0 ) + buffer.append( 
getMultiValuedSeparator() ); + buffer.append( values[index[0]++].stringValue() ); + } + int eo = buffer.length() < endOffset ? buffer.length() : boundaryScanner.findEndOffset( buffer, endOffset ); + modifiedStartOffset[0] = boundaryScanner.findStartOffset( buffer, startOffset ); + return buffer.substring( modifiedStartOffset[0], eo ); + } protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values, int startOffset, int endOffset ){ Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java (revision 0) @@ -0,0 +1,40 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Finds the start and end offsets of a fragment within a text buffer. + */ +public interface BoundaryScanner { + + /** + * Scan backward to find start offset. 
+ * @param buffer scanned object + * @param start offset at which the backward scan begins + * @return the found start offset + */ + public int findStartOffset( StringBuilder buffer, int start ); + + /** + * Scan forward to find end offset. + * @param buffer scanned object + * @param start offset at which the forward scan begins + * @return the found end offset + */ + public int findEndOffset( StringBuilder buffer, int start ); +}