Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java	(revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java	(revision 0)
@@ -0,0 +1,106 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * A {@link BoundaryScanner} that scans at most {@code maxScan} characters
+ * from the given offset looking for one of a set of boundary characters.
+ */
+public class SimpleBoundaryScanner implements BoundaryScanner {
+
+  /** Default limit on the number of characters examined per scan. */
+  public static final int DEFAULT_MAX_SCAN = 20;
+
+  /**
+   * Default boundary characters.
+   * NOTE(review): the original comment here read "include double-width space",
+   * but no space character is listed -- confirm the intended set.
+   */
+  public static final Character[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', '(', '[', '{', '\t', '\n'};
+
+  protected int maxScan;
+  protected Set<Character> boundaryChars;
+
+  /** Creates a scanner with the default scan limit and boundary characters. */
+  public SimpleBoundaryScanner(){
+    this( DEFAULT_MAX_SCAN, DEFAULT_BOUNDARY_CHARS );
+  }
+
+  /** Creates a scanner with the given scan limit and the default boundary characters. */
+  public SimpleBoundaryScanner( int maxScan ){
+    this( maxScan, DEFAULT_BOUNDARY_CHARS );
+  }
+
+  /** Creates a scanner with the default scan limit and the given boundary characters. */
+  public SimpleBoundaryScanner( Character[] boundaryChars ){
+    this( DEFAULT_MAX_SCAN, boundaryChars );
+  }
+
+  /** Creates a scanner that copies the given boundary characters into its own set. */
+  public SimpleBoundaryScanner( int maxScan, Character[] boundaryChars ){
+    this.maxScan = maxScan;
+    this.boundaryChars = new HashSet<Character>( Arrays.asList( boundaryChars ) );
+  }
+
+  /** Creates a scanner that uses the given set directly (it is not copied). */
+  public SimpleBoundaryScanner( int maxScan, Set<Character> boundaryChars ){
+    this.maxScan = maxScan;
+    this.boundaryChars = boundaryChars;
+  }
+
+  /**
+   * Scans backward from {@code start}, examining at most {@code maxScan} characters.
+   * @return the offset just after the nearest preceding boundary character,
+   *         or {@code start} if none is found or {@code start} is out of range
+   */
+  public int findStartOffset(StringBuilder buffer, int start) {
+    // avoid illegal start offset
+    if( start > buffer.length() || start < 1 ) return start;
+    int offset, count = maxScan;
+    for( offset = start; offset > 0 && count > 0; count-- ){
+      // found?
+      if( boundaryChars.contains( buffer.charAt( offset - 1 ) ) ) return offset;
+      offset--;
+    }
+    // not found
+    return start;
+  }
+
+  /**
+   * Scans forward from {@code start}, examining at most {@code maxScan} characters.
+   * @return the offset of the nearest following boundary character,
+   *         or {@code start} if none is found or {@code start} is out of range
+   */
+  public int findEndOffset(StringBuilder buffer, int start) {
+    // avoid illegal start offset
+    if( start > buffer.length() || start < 0 ) return start;
+    int offset, count = maxScan;
+    // strictly less than length(): charAt( length() ) would throw
+    for( offset = start; offset < buffer.length() && count > 0; count-- ){
+      // found?
+      if( boundaryChars.contains( buffer.charAt( offset ) ) ) return offset;
+      offset++;
+    }
+    // not found
+    return start;
+  }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java	(revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java	(revision 0)
@@ -0,0 +1,100 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.BreakIterator;
+import java.util.Locale;
+
+/**
+ * A {@link BoundaryScanner} implementation that uses {@link BreakIterator} to find
+ * boundaries in the text. Boundary {@link Type} can be specified ({@link Type#SENTENCE} is the default).
+ */
+public class BreakIteratorBoundaryScanner implements BoundaryScanner {
+
+  private BreakIterator bi;
+
+  /**
+   * Creates a scanner that breaks on sentences.
+   * @param locale locale for the iterator, or {@code null} for the default locale
+   */
+  public BreakIteratorBoundaryScanner(Locale locale){
+    this(Type.SENTENCE, locale);
+  }
+
+  /**
+   * Creates a scanner for the given boundary type.
+   * @param type kind of boundary to look for
+   * @param locale locale for the iterator, or {@code null} for the default locale
+   */
+  public BreakIteratorBoundaryScanner(Type type, Locale locale){
+    switch (type) {
+    case CHARACTER:
+      bi = locale == null ?
+          BreakIterator.getCharacterInstance() : BreakIterator.getCharacterInstance(locale);
+      break;
+
+    case WORD:
+      bi = locale == null ?
+          BreakIterator.getWordInstance() : BreakIterator.getWordInstance(locale);
+      break;
+
+    case SENTENCE:
+      bi = locale == null ?
+          BreakIterator.getSentenceInstance() : BreakIterator.getSentenceInstance(locale);
+      break;
+
+    case LINE:
+      bi = locale == null ?
+          BreakIterator.getLineInstance() : BreakIterator.getLineInstance(locale);
+      break;
+    }
+  }
+
+  /**
+   * Finds the last break strictly before {@code start}.
+   * @return the break offset, or {@code start} if {@code start} is out of range
+   */
+  public int findStartOffset(StringBuilder buffer, int start) {
+    // avoid illegal start offset
+    if( start > buffer.length() || start < 1 ) return start;
+    bi.setText(buffer.substring(0, start));
+    // setText() positions the iterator at the first boundary, where previous()
+    // would return DONE; move to the end of the prefix before stepping back
+    bi.last();
+    return bi.previous();
+  }
+
+  /**
+   * Finds the first break strictly after {@code start}.
+   * @return the break offset in {@code buffer}, or {@code start} if there is none
+   *         or {@code start} is out of range
+   */
+  public int findEndOffset(StringBuilder buffer, int start) {
+    // avoid illegal start offset
+    if( start > buffer.length() || start < 0 ) return start;
+    bi.setText(buffer.substring(start));
+    final int next = bi.next();
+    // next is relative to the substring; shift it back into buffer coordinates
+    return next == BreakIterator.DONE ? start : start + next;
+  }
+
+  /** Kinds of boundary supported by this scanner. */
+  public static enum Type {
+    CHARACTER, WORD, SENTENCE, LINE
+  }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java	(revision 1165456)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java	(working copy)
@@ -48,14 +48,24 @@
   };
   public static final String[] COLORED_POST_TAGS = { "" };
   private char multiValuedSeparator = ' ';
+  private final BoundaryScanner boundaryScanner;
 
   protected BaseFragmentsBuilder(){
     this( new String[]{ "" }, new String[]{ "" } );
   }
 
   protected BaseFragmentsBuilder( String[] preTags, String[] postTags ){
+    this(preTags, postTags, new SimpleBoundaryScanner());
+  }
+
+  protected BaseFragmentsBuilder(BoundaryScanner boundaryScanner){
+    this( new String[]{ "" }, new String[]{ "" }, boundaryScanner );
+  }
+
+  protected BaseFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner boundaryScanner ){
     this.preTags = preTags;
     this.postTags = postTags;
+    this.boundaryScanner = boundaryScanner;
   }
 
   static Object checkTagsArgument( Object tags ){
@@ -135,28 +145,42 @@
 
   protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo,
       String[] preTags, String[] postTags, Encoder encoder ){
-    final int s = fragInfo.startOffset;
-    return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s,
-        preTags, postTags, encoder );
-  }
-
-  private String makeFragment( WeightedFragInfo fragInfo, String src, int s,
-      String[] preTags, String[] postTags, Encoder encoder ){
     StringBuilder fragment = new StringBuilder();
+    final int s = fragInfo.getStartOffset();
+    int[] modifiedStartOffset = { s };
+    String src = getFragmentSourceMSO( buffer, index, values, s, fragInfo.getEndOffset(), modifiedStartOffset );
     int srcIndex = 0;
-    for( SubInfo subInfo : fragInfo.subInfos ){
-      for( Toffs to : subInfo.termsOffsets ){
+    for( SubInfo subInfo : fragInfo.getSubInfos() ){
+      for( Toffs to : subInfo.getTermsOffsets() ){
         fragment
-          .append( encoder.encodeText( src.substring( srcIndex, to.startOffset - s ) ) )
-          .append( getPreTag( preTags, subInfo.seqnum ) )
-          .append( encoder.encodeText( src.substring( to.startOffset - s, to.endOffset - s ) ) )
-          .append( getPostTag( postTags, subInfo.seqnum ) );
-        srcIndex = to.endOffset - s;
+          .append( encoder.encodeText( src.substring( srcIndex, to.getStartOffset() - modifiedStartOffset[0] ) ) )
+          .append( getPreTag( preTags, subInfo.getSeqnum() ) )
+          .append( encoder.encodeText( src.substring( to.getStartOffset() - modifiedStartOffset[0], to.getEndOffset() - modifiedStartOffset[0] ) ) )
+          .append( getPostTag( postTags, subInfo.getSeqnum() ) );
+        srcIndex = to.getEndOffset() - modifiedStartOffset[0];
       }
     }
     fragment.append( encoder.encodeText( src.substring( srcIndex ) ) );
     return fragment.toString();
   }
+
+  /**
+   * Appends field values to {@code buffer} until it reaches {@code endOffset} or the
+   * values run out, then returns the text between the start and end offsets as
+   * adjusted by the {@link BoundaryScanner}. The adjusted start offset is reported
+   * to the caller through {@code modifiedStartOffset[0]}.
+   */
+  protected String getFragmentSourceMSO( StringBuilder buffer, int[] index, Field[] values,
+      int startOffset, int endOffset, int[] modifiedStartOffset ){
+    while( buffer.length() < endOffset && index[0] < values.length ){
+      if( index[0] > 0 && values[index[0]].tokenized() && values[index[0]].stringValue().length() > 0 )
+        buffer.append( getMultiValuedSeparator() );
+      buffer.append( values[index[0]++].stringValue() );
+    }
+    int eo = buffer.length() < endOffset ? buffer.length() : boundaryScanner.findEndOffset( buffer, endOffset );
+    modifiedStartOffset[0] = boundaryScanner.findStartOffset( buffer, startOffset );
+    return buffer.substring( modifiedStartOffset[0], eo );
+  }
 
   protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values,
       int startOffset, int endOffset ){
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java	(revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java	(revision 0)
@@ -0,0 +1,41 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Adjusts fragment start and end offsets to nearby boundaries
+ * in the text being highlighted.
+ */
+public interface BoundaryScanner {
+
+  /**
+   * Scan backward to find start offset.
+   * @param buffer scanned object
+   * @param start offset to begin
+   * @return the found start offset
+   */
+  public int findStartOffset( StringBuilder buffer, int start );
+
+  /**
+   * Scan forward to find end offset.
+   * @param buffer scanned object
+   * @param start offset to begin
+   * @return the found end offset
+   */
+  public int findEndOffset( StringBuilder buffer, int start );
+}