Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java
===================================================================
--- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java (revision 1166043)
+++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java (working copy)
@@ -35,9 +35,9 @@
String[] f = sofb.createFragments( reader, 0, F, ffl, 3 );
assertEquals( 3, f.length );
// check score order
- assertEquals( "c a a b b ", f[0] );
- assertEquals( "b b a b a b b b b b ", f[1] );
- assertEquals( "a b b b b b b b b b ", f[2] );
+ assertEquals( "c a a b b", f[0] );
+ assertEquals( "b b a b a b b b b b c", f[1] );
+ assertEquals( "a b b b b b b b b b b", f[2] );
}
private FieldFragList ffl(Query query, String indexValue ) throws Exception {
Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java
===================================================================
--- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java (revision 1166043)
+++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java (working copy)
@@ -37,11 +37,11 @@
public void test1TermIndex() throws Exception {
FieldFragList ffl = ffl(new TermQuery(new Term(F, "a")), "a" );
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
- assertEquals( "a ", sfb.createFragment( reader, 0, F, ffl ) );
+ assertEquals( "a", sfb.createFragment( reader, 0, F, ffl ) );
// change tags
sfb = new SimpleFragmentsBuilder( new String[]{ "[" }, new String[]{ "]" } );
- assertEquals( "[a] ", sfb.createFragment( reader, 0, F, ffl ) );
+ assertEquals( "[a]", sfb.createFragment( reader, 0, F, ffl ) );
}
public void test2Frags() throws Exception {
@@ -50,8 +50,8 @@
String[] f = sfb.createFragments( reader, 0, F, ffl, 3 );
// 3 snippets requested, but should be 2
assertEquals( 2, f.length );
- assertEquals( "a b b b b b b b b b ", f[0] );
- assertEquals( "b b a b a b ", f[1] );
+ assertEquals( "a b b b b b b b b b b", f[0] );
+ assertEquals( "b b a b a b", f[1] );
}
public void test3Frags() throws Exception {
@@ -63,8 +63,8 @@
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
String[] f = sfb.createFragments( reader, 0, F, ffl, 3 );
assertEquals( 3, f.length );
- assertEquals( "a b b b b b b b b b ", f[0] );
- assertEquals( "b b a b a b b b b b ", f[1] );
+ assertEquals( "a b b b b b b b b b b", f[0] );
+ assertEquals( "b b a b a b b b b b c", f[1] );
assertEquals( "c a a b b ", f[2] );
}
@@ -73,7 +73,7 @@
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
String[] preTags = { "[" };
String[] postTags = { "]" };
- assertEquals( "<h1> [a] </h1> ",
+ assertEquals( "<h1> [a] </h1>",
sfb.createFragment( reader, 0, F, ffl, preTags, postTags, new SimpleHTMLEncoder() ) );
}
@@ -106,7 +106,7 @@
SimpleFragListBuilder sflb = new SimpleFragListBuilder();
FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
- assertEquals( " most search engines use only one of these methods. Even the search engines that says they can use t",
+ assertEquals( "The most search engines use only one of these methods. Even the search engines that says they can use the",
sfb.createFragment( reader, 0, F, ffl ) );
}
@@ -119,7 +119,7 @@
SimpleFragListBuilder sflb = new SimpleFragListBuilder();
FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
- assertEquals( "ssing speed, the ", sfb.createFragment( reader, 0, F, ffl ) );
+ assertEquals( "processing speed, the", sfb.createFragment( reader, 0, F, ffl ) );
}
public void testUnstoredField() throws Exception {
Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java
===================================================================
--- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java (revision 0)
+++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java (revision 0)
@@ -0,0 +1,38 @@
+package org.apache.lucene.search.vectorhighlight;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class SimpleBoundaryScannerTest extends LuceneTestCase {
+ static final String TEXT =
+ "Apache Lucene(TM) is a high-performance, full-featured\ntext search engine library written entirely in Java.";
+
+ public void testFindStartOffset() throws Exception {
+ StringBuilder text = new StringBuilder(TEXT);
+ BoundaryScanner scanner = new SimpleBoundaryScanner();
+
+ // test out of range
+ int start = TEXT.length() + 1;
+ assertEquals(start, scanner.findStartOffset(text, start));
+ start = 0;
+ assertEquals(start, scanner.findStartOffset(text, start));
+
+ start = TEXT.indexOf("formance");
+ int expected = TEXT.indexOf("high-performance");
+ assertEquals(expected, scanner.findStartOffset(text, start));
+ }
+
+ public void testFindEndOffset() throws Exception {
+ StringBuilder text = new StringBuilder(TEXT);
+ BoundaryScanner scanner = new SimpleBoundaryScanner();
+
+ // test out of range
+ int start = TEXT.length() + 1;
+ assertEquals(start, scanner.findEndOffset(text, start));
+ start = -1;
+ assertEquals(start, scanner.findEndOffset(text, start));
+
+ start = TEXT.indexOf("full-");
+ int expected = TEXT.indexOf("\ntext");
+ assertEquals(expected, scanner.findEndOffset(text, start));
+ }
+}
Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java
===================================================================
--- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java (revision 0)
+++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java (revision 0)
@@ -0,0 +1,70 @@
+package org.apache.lucene.search.vectorhighlight;
+
+import java.util.Locale;
+
+import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner.Type;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class BreakIteratorBoundaryScannerTest extends LuceneTestCase {
+ static final String TEXT =
+ "Apache Lucene(TM) is a high-performance, full-featured text search engine library written entirely in Java." +
+ "\nIt is a technology suitable for nearly any application that requires\n" +
+ "full-text search, especially cross-platform. \nApache Lucene is an open source project available for free download.";
+
+ public void testOutOfRange() throws Exception {
+ StringBuilder text = new StringBuilder(TEXT);
+ BoundaryScanner scanner = new BreakIteratorBoundaryScanner(Locale.ENGLISH);
+
+ int start = TEXT.length() + 1;
+ assertEquals(start, scanner.findStartOffset(text, start));
+ assertEquals(start, scanner.findEndOffset(text, start));
+ start = 0;
+ assertEquals(start, scanner.findStartOffset(text, start));
+ start = -1;
+ assertEquals(start, scanner.findEndOffset(text, start));
+ }
+
+ public void testWordBoundary() throws Exception {
+ StringBuilder text = new StringBuilder(TEXT);
+ BoundaryScanner scanner = new BreakIteratorBoundaryScanner(Type.WORD, null);
+
+ int start = TEXT.indexOf("formance");
+ int expected = TEXT.indexOf("high-performance");
+ testFindStartOffset(text, start, expected, scanner);
+
+ expected = TEXT.indexOf(", full");
+ testFindEndOffset(text, start, expected, scanner);
+ }
+
+ public void testSentenceBoundary() throws Exception {
+ StringBuilder text = new StringBuilder(TEXT);
+ BoundaryScanner scanner = new BreakIteratorBoundaryScanner(Type.SENTENCE, null);
+
+ int start = TEXT.indexOf("any application");
+ int expected = TEXT.indexOf("It is a");
+ testFindStartOffset(text, start, expected, scanner);
+
+ expected = TEXT.indexOf("Apache Lucene is an open source");
+ testFindEndOffset(text, start, expected, scanner);
+ }
+
+ public void testLineBoundary() throws Exception {
+ StringBuilder text = new StringBuilder(TEXT);
+ BoundaryScanner scanner = new BreakIteratorBoundaryScanner(Type.LINE, null);
+
+ int start = TEXT.indexOf("any application");
+ int expected = TEXT.indexOf("nearly");
+ testFindStartOffset(text, start, expected, scanner);
+
+ expected = TEXT.indexOf("application that requires");
+ testFindEndOffset(text, start, expected, scanner);
+ }
+
+ private void testFindStartOffset(StringBuilder text, int start, int expected, BoundaryScanner scanner) throws Exception {
+ assertEquals(expected, scanner.findStartOffset(text, start));
+ }
+
+ private void testFindEndOffset(StringBuilder text, int start, int expected, BoundaryScanner scanner) throws Exception {
+ assertEquals(expected, scanner.findEndOffset(text, start));
+ }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java (revision 0)
@@ -0,0 +1,81 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+public class SimpleBoundaryScanner implements BoundaryScanner {
+
+ public static final int DEFAULT_MAX_SCAN = 20;
+ public static final Character[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t', '\n'};
+
+ protected int maxScan;
+ protected Set boundaryChars;
+
+ public SimpleBoundaryScanner(){
+ this( DEFAULT_MAX_SCAN, DEFAULT_BOUNDARY_CHARS );
+ }
+
+ public SimpleBoundaryScanner( int maxScan ){
+ this( maxScan, DEFAULT_BOUNDARY_CHARS );
+ }
+
+ public SimpleBoundaryScanner( Character[] boundaryChars ){
+ this( DEFAULT_MAX_SCAN, boundaryChars );
+ }
+
+ public SimpleBoundaryScanner( int maxScan, Character[] boundaryChars ){
+ this.maxScan = maxScan;
+ this.boundaryChars = new HashSet();
+ this.boundaryChars.addAll(Arrays.asList(boundaryChars));
+ }
+
+ public SimpleBoundaryScanner( int maxScan, Set boundaryChars ){
+ this.maxScan = maxScan;
+ this.boundaryChars = boundaryChars;
+ }
+
+ public int findStartOffset(StringBuilder buffer, int start) {
+ // avoid illegal start offset
+ if( start > buffer.length() || start < 1 ) return start;
+ int offset, count = maxScan;
+ for( offset = start; offset > 0 && count > 0; count-- ){
+ // found?
+ if( boundaryChars.contains( buffer.charAt( offset - 1 ) ) ) return offset;
+ offset--;
+ }
+ // not found
+ return start;
+ }
+
+ public int findEndOffset(StringBuilder buffer, int start) {
+ // avoid illegal start offset
+ if( start > buffer.length() || start < 0 ) return start;
+ int offset, count = maxScan;
+ //for( offset = start; offset <= buffer.length() && count > 0; count-- ){
+ for( offset = start; offset < buffer.length() && count > 0; count-- ){
+ // found?
+ if( boundaryChars.contains( buffer.charAt( offset ) ) ) return offset;
+ offset++;
+ }
+ // not found
+ return start;
+ }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java (revision 0)
@@ -0,0 +1,77 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.BreakIterator;
+import java.util.Locale;
+
+/**
+ * A {@link BoundaryScanner} implementation that uses {@link BreakIterator} to find
+ * boundaries in the text. Boundary {@link Type} can be specified ({@link Type#SENTENCE} is the default).
+ */
+public class BreakIteratorBoundaryScanner implements BoundaryScanner {
+
+ BreakIterator bi;
+
+ public BreakIteratorBoundaryScanner(Locale locale){
+ this(Type.SENTENCE, locale);
+ }
+
+ public BreakIteratorBoundaryScanner(Type type, Locale locale){
+ switch (type) {
+ case CHARACTER:
+ bi = locale == null ?
+ BreakIterator.getCharacterInstance() : BreakIterator.getCharacterInstance(locale);
+ break;
+
+ case WORD:
+ bi = locale == null ?
+ BreakIterator.getWordInstance() : BreakIterator.getWordInstance(locale);
+ break;
+
+ case SENTENCE:
+ bi = locale == null ?
+ BreakIterator.getSentenceInstance() : BreakIterator.getSentenceInstance(locale);
+ break;
+
+ case LINE:
+ bi = locale == null ?
+ BreakIterator.getLineInstance() : BreakIterator.getLineInstance(locale);
+ break;
+ }
+ }
+
+ public int findStartOffset(StringBuilder buffer, int start) {
+ // avoid illegal start offset
+ if( start > buffer.length() || start < 1 ) return start;
+ bi.setText(buffer.substring(0, start));
+ bi.last();
+ return bi.previous();
+ }
+
+ public int findEndOffset(StringBuilder buffer, int start) {
+ // avoid illegal start offset
+ if( start > buffer.length() || start < 0 ) return start;
+ bi.setText(buffer.substring(start));
+ return bi.next() + start;
+ }
+
+ public static enum Type {
+ CHARACTER, WORD, SENTENCE, LINE
+ }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java (revision 1166043)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java (working copy)
@@ -48,14 +48,24 @@
};
public static final String[] COLORED_POST_TAGS = { "" };
private char multiValuedSeparator = ' ';
+ private final BoundaryScanner boundaryScanner;
protected BaseFragmentsBuilder(){
this( new String[]{ "" }, new String[]{ "" } );
}
protected BaseFragmentsBuilder( String[] preTags, String[] postTags ){
+ this(preTags, postTags, new SimpleBoundaryScanner());
+ }
+
+ protected BaseFragmentsBuilder(BoundaryScanner boundaryScanner){
+ this( new String[]{ "" }, new String[]{ "" }, boundaryScanner );
+ }
+
+ protected BaseFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner boundaryScanner ){
this.preTags = preTags;
this.postTags = postTags;
+ this.boundaryScanner = boundaryScanner;
}
static Object checkTagsArgument( Object tags ){
@@ -135,28 +145,36 @@
protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo,
String[] preTags, String[] postTags, Encoder encoder ){
- final int s = fragInfo.startOffset;
- return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s,
- preTags, postTags, encoder );
- }
-
- private String makeFragment( WeightedFragInfo fragInfo, String src, int s,
- String[] preTags, String[] postTags, Encoder encoder ){
StringBuilder fragment = new StringBuilder();
+ final int s = fragInfo.getStartOffset();
+ int[] modifiedStartOffset = { s };
+ String src = getFragmentSourceMSO( buffer, index, values, s, fragInfo.getEndOffset(), modifiedStartOffset );
int srcIndex = 0;
- for( SubInfo subInfo : fragInfo.subInfos ){
- for( Toffs to : subInfo.termsOffsets ){
+ for( SubInfo subInfo : fragInfo.getSubInfos() ){
+ for( Toffs to : subInfo.getTermsOffsets() ){
fragment
- .append( encoder.encodeText( src.substring( srcIndex, to.startOffset - s ) ) )
- .append( getPreTag( preTags, subInfo.seqnum ) )
- .append( encoder.encodeText( src.substring( to.startOffset - s, to.endOffset - s ) ) )
- .append( getPostTag( postTags, subInfo.seqnum ) );
- srcIndex = to.endOffset - s;
+ .append( encoder.encodeText( src.substring( srcIndex, to.getStartOffset() - modifiedStartOffset[0] ) ) )
+ .append( getPreTag( preTags, subInfo.getSeqnum() ) )
+ .append( encoder.encodeText( src.substring( to.getStartOffset() - modifiedStartOffset[0], to.getEndOffset() - modifiedStartOffset[0] ) ) )
+ .append( getPostTag( postTags, subInfo.getSeqnum() ) );
+ srcIndex = to.getEndOffset() - modifiedStartOffset[0];
}
}
fragment.append( encoder.encodeText( src.substring( srcIndex ) ) );
return fragment.toString();
}
+
+ protected String getFragmentSourceMSO( StringBuilder buffer, int[] index, Field[] values,
+ int startOffset, int endOffset, int[] modifiedStartOffset ){
+ while( buffer.length() < endOffset && index[0] < values.length ){
+ if( index[0] > 0 && values[index[0]].tokenized() && values[index[0]].stringValue().length() > 0 )
+ buffer.append( getMultiValuedSeparator() );
+ buffer.append( values[index[0]++].stringValue() );
+ }
+ int eo = buffer.length() < endOffset ? buffer.length() : boundaryScanner.findEndOffset( buffer, endOffset );
+ modifiedStartOffset[0] = boundaryScanner.findStartOffset( buffer, startOffset );
+ return buffer.substring( modifiedStartOffset[0], eo );
+ }
protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values,
int startOffset, int endOffset ){
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java (revision 0)
@@ -0,0 +1,40 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *
+ */
+public interface BoundaryScanner {
+
+ /**
+ * Scan backward to find end offset.
+ * @param buffer scanned object
+ * @param start start offset to begin
+ * @return the found start offset
+ */
+ public int findStartOffset( StringBuilder buffer, int start );
+
+ /**
+ * Scan forward to find start offset.
+ * @param buffer scanned object
+ * @param start start offset to begin
+ * @return the found end offset
+ */
+ public int findEndOffset( StringBuilder buffer, int start );
+}