Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (revision 1150180) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (working copy) @@ -221,8 +221,10 @@ textFragmenter.start(text, tokenStream); TokenGroup tokenGroup=new TokenGroup(tokenStream); - - for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze); + boolean next = tokenStream.incrementToken(); + // start the first fragment at the beginning of the first token + lastEndOffset = offsetAtt.startOffset(); + for (; next && (offsetAtt.startOffset()< maxDocCharsToAnalyze); next = tokenStream.incrementToken()) { if( (offsetAtt.endOffset()>text.length()) Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosCollector.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosCollector.java (revision 1150180) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosCollector.java (working copy) @@ -56,17 +56,17 @@ } } - private boolean addDoc (int doc) { + private ScorePosDoc addDoc (int doc) { if (count <= 0 || docs[count-1].doc != doc) { ScorePosDoc spdoc = new ScorePosDoc (doc); docs[count++] = spdoc; - return true; + return spdoc; } - return false; + return null; } public boolean acceptsDocsOutOfOrder() { - return false; + return true; } public void setScorer(Scorer scorer) throws IOException { @@ -91,12 +91,15 @@ @Override public boolean needsPositions() { return true; } + + @Override + public boolean needsPayloads() { return true; } @Override public void collectLeafPosition(Scorer scorer, PositionInterval interval, - int docID) { - addDoc(docID); - docs[count - 
1].storePosition(interval); + int docID) throws IOException { + addDoc(docID); + docs[count - 1].storePosition(interval); } @Override Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosHighlighter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosHighlighter.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosHighlighter.java (revision 0) @@ -0,0 +1,57 @@ +package org.apache.lucene.search.poshighlight; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.highlight.Encoder; +import org.apache.lucene.search.highlight.Formatter; +import org.apache.lucene.search.highlight.Highlighter; + +public class PosHighlighter extends Highlighter { + + public PosHighlighter(Formatter formatter, Encoder encoder, PosScorer fragmentScorer) { + super(formatter, encoder, fragmentScorer); + } + + public PosHighlighter(Formatter formatter, PosScorer fragmentScorer) { + super(formatter, fragmentScorer); + } + + public PosHighlighter (PosScorer fragmentScorer) { + super(fragmentScorer); + } + + public PosHighlighter () { + super (new PosScorer()); + } + + public static Query rewriteQuery (Query q) { + // TODO: walk the query tree looking for MTQ's, setting rewrite method, and cloning as needed. + // Q: is there a good way to walk the query tree in a generic way? It seems to require a lot of + // instanceof magic... 
+ if (q instanceof MultiTermQuery) { + ((MultiTermQuery)q).setRewriteMethod (MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); + } + return q; + } + + public static TokenStream getPosTokenStream (Query q, final int docid, String termVectorField, IndexSearcher searcher, String text) throws IOException { + PosCollector collector = new PosCollector(1); + q = rewriteQuery (q); + searcher.search(q, new SingleDocFilter(docid), collector); + ScorePosDoc doc = collector.docs[0]; + if (doc == null) + return null; + + if (termVectorField != null) { + doc.getPositionMap().getTermVectorOffsets(searcher.getIndexReader(), doc.doc, termVectorField); + } + + TokenStream tstream = new PosTokenStream(text, doc.getPositionMap(), 25); + return tstream; + } + +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosOffset.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosOffset.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosOffset.java (revision 0) @@ -0,0 +1,55 @@ +package org.apache.lucene.search.poshighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Stores a position interval and its bounding offsets. + * @lucene.experimental + */ +class PosOffset { + + public int getStartPosition() { + return startPosition; + } + + public int getEndPosition() { + return endPosition; + } + + public int getStartOffset() { + return startOffset; + } + + public int getEndOffset() { + return endOffset; + } + + int startPosition; + int endPosition; + int startOffset; + int endOffset; + + public PosOffset(int startPosition, int endPosition, int startOffset, + int endOffset) { + this.startPosition = startPosition; + this.endPosition = endPosition; + this.startOffset = startOffset; + this.endOffset = endOffset; + } + +} \ No newline at end of file Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosOffsetMap.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosOffsetMap.java (revision 1149428) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosOffsetMap.java (working copy) @@ -18,44 +18,76 @@ */ import java.io.IOException; +import java.util.Iterator; +import java.util.SortedMap; +import java.util.TreeMap; -import org.apache.lucene.search.positions.PositionIntervalIterator; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermVectorMapper; +import org.apache.lucene.index.TermVectorOffsetInfo; +import org.apache.lucene.util.BytesRef; /** - * Present an array of PositionIntervals as an Iterator. + * Maps positions to offsets. Provides an Iterator sorted by start position. Also provides + * direct lookup of offsets given start position. 
The mapping may be built up using {@link #put(int, int, int, int)} + * or retrieved from term vectors in the index using {@link #getTermVectorOffsets(IndexReader, int, String)}. + * * @lucene.experimental */ -public class PositionIntervalArrayIterator extends PositionIntervalIterator { +public class PosOffsetMap { - private int next = 0; - private int count; - private PositionInterval[] positions; - - public PositionIntervalArrayIterator (PositionInterval[] positions, int count) { - super(null); - this.positions = positions; - this.count = count; + private SortedMap posOffsetMap; + + public PosOffsetMap() { + this.posOffsetMap = new TreeMap(); } - - @Override - public PositionInterval next() { - if (next >= count) - return null; - return positions[next++]; + + public void put(int startPosition, int endPosition, int startOffset, int endOffset) { + PosOffset po = new PosOffset(startPosition, endPosition, startOffset, endOffset); + posOffsetMap.put(startPosition, po); } - @Override - public PositionIntervalIterator[] subs(boolean inOrder) { - return EMPTY; + public boolean containsKey(int begin) { + return posOffsetMap.containsKey(begin); } - @Override - public void collect() { + public PosOffset getPosOffset (int pos) { + return posOffsetMap.get(pos); + } + + public Iterator iterator() { + return posOffsetMap.values().iterator(); } - @Override - public int advanceTo(int docId) throws IOException { - return 0; + public void getTermVectorOffsets (IndexReader reader, int docid, String fieldName) throws IOException { + reader.getTermFreqVector(docid, fieldName, new OffsetMapper()); } + private class OffsetMapper extends TermVectorMapper { + + public void setExpectations(String field, int numTerms, + boolean storeOffsets, boolean storePositions) { + } + + public void map(BytesRef term, int frequency, + TermVectorOffsetInfo[] offsets, int[] positions) { + for (int i = 0; i < positions.length; i++) { + int pos = positions[i]; + PosOffset po = posOffsetMap.get(pos); + if (po 
!= null) { + po.startOffset = offsets[i].getStartOffset(); + po.endOffset = offsets[i].getEndOffset(); + } + } + } + + } + + public TokenStream getPosTokenStream(IndexReader reader, int docid, + String fieldName, int slop) throws IOException { + return new PosTokenStream(reader.document(docid).getFieldable(fieldName).stringValue(), + PosOffsetMap.this, slop); + } + } \ No newline at end of file Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosScorer.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosScorer.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosScorer.java (revision 0) @@ -0,0 +1,61 @@ +package org.apache.lucene.search.poshighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.search.highlight.TextFragment; + +/** + * Simple highlighting scorer for testing; returns scores from the underlying + * TokenStream; the fragment's score is the total of its tokens' scores. 
+ * + * A more elaborate scorer might give a higher score to fragments containing more distinct tokens. + * + * @lucene.experimental + */ +public class PosScorer implements org.apache.lucene.search.highlight.Scorer { + private ScoreAttribute scoreAtt; + private float fragmentScore; + + @Override + public TokenStream init(TokenStream tokenStream) throws IOException { + scoreAtt = tokenStream.addAttribute(ScoreAttribute.class); + return tokenStream; + } + + @Override + public void startFragment(TextFragment newFragment) { + fragmentScore = 0; + } + + @Override + public float getTokenScore() { + if (scoreAtt.score() > 0) { + fragmentScore += scoreAtt.score(); + return scoreAtt.score(); + } + return 0; + } + + @Override + public float getFragmentScore() { + return fragmentScore; + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java (revision 1149428) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java (working copy) @@ -1,75 +0,0 @@ -package org.apache.lucene.search.poshighlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.search.positions.PositionIntervalIterator; -import org.apache.lucene.search.positions.PositionIntervalIterator.PositionInterval; - -/** - * A TokenStream constructed from a stream of positions and their offsets. - * The document is segmented into tokens at the start and end offset of each interval. The intervals - * are assumed to be non-overlapping. - * - * TODO: abstract the dependency on the current PositionOffsetMapper impl; - * allow for implementations of position->offset maps that don't rely on term vectors. - * - * @lucene.experimental - */ -public class PosTokenStream extends TokenStream { - - //this tokenizer generates four attributes: - // term, offset, positionIncrement? and type? 
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - //private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); - private final String text; - private final PositionIntervalIterator positions; - - // the index of the current position interval - private PositionInterval pos = null; - private final PositionOffsetMapper pom; - - public PosTokenStream (String text, PositionIntervalIterator positions, PositionOffsetMapper pom) { - this.text = text; - this.positions = positions; - this.pom = pom; - } - - @Override - public final boolean incrementToken() throws IOException { - pos = positions.next(); - if (pos == null){ - return false; - } - int b, e; - b = pom.getStartOffset(pos.begin); - e = pom.getEndOffset(pos.end); - termAtt.append(text, b, e); - offsetAtt.setOffset(b, e); - posIncrAtt.setPositionIncrement(1); - return true; - } - -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java (revision 1149428) @@ -0,0 +1,154 @@ +package org.apache.lucene.search.poshighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * A TokenStream constructed from a stream of position intervals and their offsets. + * The document is segmented into tokens at the start and end offset of each interval. The intervals + * are assumed to be non-overlapping. Additional non-scoring tokens are generated wrapping before + * and after each interval. The slop parameter controls the size of these surrounding tokens. + * If the intervals are within slop of each other, some of the wrapping tokens will be omitted. + * + * The purpose is to enable a fragmenting highlighter to include text surrounding matching terms without the need to analyze the entire document. 
+ * + * @lucene.experimental + */ +public final class PosTokenStream extends TokenStream { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final ScoreAttribute scoreAtt = addAttribute(ScoreAttribute.class); + private final String text; + private final Iterator poIter; + private final int slop; + private State state; + private PosOffset po = null; + private int lastEndOffset=0; + + public PosTokenStream (String text, PosOffsetMap positions) { + this (text, positions, 25); + } + + public PosTokenStream (String text, PosOffsetMap positions, int slop) { + this.slop = slop; + this.text = text; + this.poIter = positions.iterator(); + this.state = State.Head; + } + + /** + * Goes through these states: + *
<ol> + * <li>Head: text &lt; first match start - slop</li> + * <li>Garbage: text in between matches outside of slop</li> + * <li>Preamble: slop chars before a match</li> + * <li>Matches: tokens from position interval iterator</li> + * <li>Postscript: slop chars after the last match</li> + * <li>Tail: text &gt; last match end + slop</li> + * <li>Done</li> + * </ol>
+ */ + + enum State { + Head, Garbage, Preamble, Matches, Postscript, Tail, Done + } + + // TODO: break slop at whitespace at (frag size - match size) / 2 + @Override + public final boolean incrementToken() throws IOException { + + clearAttributes(); + + int b, e; + switch(state) { + case Head: + if (nextPos() == null) { + return false; + } + case Garbage: + state = State.Preamble; + b = lastEndOffset; + e = po.startOffset - slop; + if (e > 0) { + scoreAtt.setScore(0); + break; + } // else fall through ... + case Preamble: + state = State.Matches; + if (po.startOffset > 0) { + b = Math.max(0, po.startOffset - slop); + lastEndOffset = e = po.startOffset; + scoreAtt.setScore(0); + break; + } // else fall through ... + case Matches: + if (po.startOffset < lastEndOffset + slop) { + b = po.startOffset; + lastEndOffset = e = po.endOffset; + scoreAtt.setScore(1); + if (nextPos()== null) { + state = State.Postscript; + } + } else { + b = lastEndOffset; + lastEndOffset = e = Math.min(b + slop, po.startOffset - slop); + scoreAtt.setScore(0); + state = State.Garbage; + } + break; + case Postscript: + state = State.Tail; + if (lastEndOffset >= text.length()) + return false; + b = lastEndOffset; + e = Math.min (lastEndOffset + slop, text.length()); + lastEndOffset = e; + scoreAtt.setScore(0); + break; + case Tail: + state = State.Done; + if (lastEndOffset >= text.length()) + return false; + b = lastEndOffset; + e = text.length(); + scoreAtt.setScore(0); + break; + case Done: + default: // compiler obeisance + return false; + } + termAtt.setEmpty(); + termAtt.append(text, b, e); + offsetAtt.setOffset(b, e); + return true; + } + + private PosOffset nextPos () throws IOException { + if (!poIter.hasNext()) + return null; + po = poIter.next(); + return po; + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionIntervalArrayIterator.java =================================================================== --- 
lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionIntervalArrayIterator.java (revision 1150180) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionIntervalArrayIterator.java (working copy) @@ -1,61 +0,0 @@ -package org.apache.lucene.search.poshighlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.search.positions.PositionIntervalIterator; - -/** - * Present an array of PositionIntervals as an Iterator. 
- * @lucene.experimental - */ -public class PositionIntervalArrayIterator extends PositionIntervalIterator { - - private int next = 0; - private int count; - private PositionInterval[] positions; - - public PositionIntervalArrayIterator (PositionInterval[] positions, int count) { - super(null); - this.positions = positions; - this.count = count; - } - - @Override - public PositionInterval next() { - if (next >= count) - return null; - return positions[next++]; - } - - @Override - public PositionIntervalIterator[] subs(boolean inOrder) { - return EMPTY; - } - - @Override - public void collect() { - } - - @Override - public int advanceTo(int docId) throws IOException { - return 0; - } - -} \ No newline at end of file Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionOffsetMapper.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionOffsetMapper.java (revision 1150180) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionOffsetMapper.java (working copy) @@ -1,73 +0,0 @@ -package org.apache.lucene.search.poshighlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.index.TermVectorMapper; -import org.apache.lucene.index.TermVectorOffsetInfo; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.BytesRef; - -/** - * Create a map of position->offsets using term vectors. TODO: In highlighting, we don't really need the - * entire map; make a sparse map including only required positions. - * - * @lucene.experimental - */ - -public class PositionOffsetMapper extends TermVectorMapper { - private int maxPos = 0; - private static final int BUF_SIZE = 128; - int startOffset[] = new int[BUF_SIZE], endOffset[] = new int[BUF_SIZE]; - - public void setExpectations(String field, int numTerms, - boolean storeOffsets, boolean storePositions) { - } - - public void map(BytesRef term, int frequency, - TermVectorOffsetInfo[] offsets, int[] positions) - { - for (int i = 0; i < positions.length; i++) { - int pos = positions[i]; - if (pos >= startOffset.length) { - grow (pos + BUF_SIZE); - maxPos = pos; - } else if (pos > maxPos) { - maxPos = pos; - } - startOffset[pos] = offsets[i].getStartOffset(); - endOffset[pos] = offsets[i].getEndOffset(); - } - } - - private void grow (int size) { - startOffset = ArrayUtil.grow (startOffset, size); - endOffset = ArrayUtil.grow (endOffset, size); - } - - public int getStartOffset(int pos) { - return startOffset[pos]; - } - - public int getEndOffset(int pos) { - return endOffset[pos]; - } - - public int getMaxPosition() { - return maxPos; - } -} \ No newline at end of file Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScoreAttribute.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScoreAttribute.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScoreAttribute.java (revision 
0) @@ -0,0 +1,36 @@ +package org.apache.lucene.search.poshighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Attribute; + +/** + * A Token's score. The default value is 0. + * + * @lucene.experimental + */ + +public interface ScoreAttribute extends Attribute { + + /** Returns this Token's score. Defaults to 0. */ + public float score(); + + /** Set the score. + @see #score() */ + public void setScore(float score); +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScoreAttributeImpl.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScoreAttributeImpl.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScoreAttributeImpl.java (revision 0) @@ -0,0 +1,83 @@ +package org.apache.lucene.search.poshighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.AttributeImpl; + +/** + * A Token's score. The default value is 0. + * + * @lucene.experimental + */ + +public class ScoreAttributeImpl extends AttributeImpl implements ScoreAttribute { + + private float score = 0; + + public ScoreAttributeImpl () { + this (0); + } + + public ScoreAttributeImpl(float score) { + this.score = score; + } + + /** @return this Token's score. Defaults to 0. */ + @Override + public float score() { + return score; + } + + /** Set the score. 
+ @see #score() */ + @Override + public void setScore(float score) { + this.score = score; + } + + @Override + public void clear() { + score = 0; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof ScoreAttributeImpl) { + final ScoreAttributeImpl o = (ScoreAttributeImpl) other; + return (this.score == o.score); + } + + return false; + } + + @Override + public int hashCode() { + return Float.valueOf(score).hashCode(); + } + + @Override + public void copyTo(AttributeImpl target) { + ScoreAttribute t = (ScoreAttribute) target; + t.setScore(score); + } + +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScorePosDoc.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScorePosDoc.java (revision 1150180) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScorePosDoc.java (working copy) @@ -17,50 +17,50 @@ * limitations under the License. 
*/ -import java.util.Comparator; +import java.io.IOException; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.positions.PositionIntervalIterator.PositionInterval; -import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.BytesRef; -/** Used to accumulate position intervals while scoring +/** Used to accumulate position intervals while scoring + * * @lucene.experimental */ public class ScorePosDoc extends ScoreDoc { - public int posCount = 0; - public PositionInterval[] positions; + private PosOffsetMap positionMap; + private BytesRef bytes = new BytesRef(10); public ScorePosDoc(int doc) { super(doc, 0); - positions = new PositionInterval[32]; + this.positionMap = new PosOffsetMap(); } - public void storePosition (PositionInterval pos) { - ensureStorage(); - positions[posCount++] = (PositionInterval) pos.clone(); + public PosOffsetMap getPositionMap () { + return positionMap; } - private void ensureStorage () { - if (posCount >= positions.length) { - PositionInterval temp[] = new PositionInterval[positions.length * 2]; - System.arraycopy(positions, 0, temp, 0, positions.length); - positions = temp; + public void storePosition (PositionInterval interval) throws IOException { + if (positionMap.containsKey(interval.begin)) + // This test is needed b/c sometimes (ConjunctionPosIterator) intervals are reported twice; + // once in call to advanceTo() (see PosCollector.collect()) + // and then again while iterating over remaining positions explicitly. + // And we need to avoid reprocessing since the second time through, + // the payload(s) will already have been consumed... 
+ return; + if (interval.payloadAvailable()) { + interval.nextPayload(bytes); + ByteArrayDataInput dataInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length); + + int startOffset = dataInput.readVInt(); + int tokenSize = dataInput.readVInt(); + + positionMap.put(interval.begin, interval.end, startOffset, startOffset + tokenSize); + } else { + positionMap.put(interval.begin, interval.end, 0, 0); } } - public PositionInterval[] sortedPositions() { - ArrayUtil.mergeSort(positions, 0, posCount, new Comparator() { - public int compare(PositionInterval o1, PositionInterval o2) { - return - o1.begin < o2.begin ? -1 : - (o1.begin > o2.begin ? 1 : - (o1.end < o2.end ? -1 : - (o1.end > o2.end ? 1 : - 0))); - } - - }); - return positions; - } } Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/SimpleFragmenter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/SimpleFragmenter.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/SimpleFragmenter.java (revision 0) @@ -0,0 +1,62 @@ +package org.apache.lucene.search.poshighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.search.highlight.Fragmenter; + +/** + * Fragments text in fixed size chunks. Like {@link org.apache.lucene.search.highlight.SimpleFragmenter}, + * but not confused by very large chunks. + * + * @lucene.experimental + */ +public class SimpleFragmenter implements Fragmenter { + private static final int DEFAULT_FRAGMENT_SIZE = 100; + private OffsetAttribute offsetAtt; + private int fragmentSize; + private int lastFragStart; + + public SimpleFragmenter() { + this(DEFAULT_FRAGMENT_SIZE); + } + + /** + * Creates a fragmenter that uses the given fragment size. + * @param fragmentSize size in number of characters of each fragment + */ + public SimpleFragmenter(int fragmentSize) { + this.fragmentSize = fragmentSize; + } + + @Override + public void start(String originalText, TokenStream stream) { + offsetAtt = stream.addAttribute(OffsetAttribute.class); + lastFragStart = 0; + } + + @Override + public boolean isNewFragment () { + boolean isNewFrag = (offsetAtt.endOffset() - lastFragStart > fragmentSize); + if (isNewFrag) { + lastFragStart = offsetAtt.startOffset(); + } + return isNewFrag; + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/SingleDocFilter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/SingleDocFilter.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/SingleDocFilter.java (revision 0) @@ -0,0 +1,67 @@ +package org.apache.lucene.search.poshighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Filter; + +/** + * Filter that matches a single document. + * The iterator it hands out reports the configured doc id once and then NO_MORE_DOCS; the context argument of getDocIdSet is not consulted. + * @lucene.experimental + * + */ +public class SingleDocFilter extends Filter { + private int doc; + + public SingleDocFilter (int doc) { + this.doc = doc; + } + + @Override + public DocIdSet getDocIdSet(AtomicReaderContext context) { + return new DocIdSet () { + public DocIdSetIterator iterator() { + return new DocIdSetIterator () { + int curr = -1; + public int docID() { + return curr; + } + public int nextDoc() throws IOException { + if (curr < 0) + curr = doc; + else + curr = NO_MORE_DOCS; + return curr; + } + @Override + public int advance(int target) throws IOException { + while (nextDoc() < target) + ; + return curr; + } + }; + }; + }; + }; + +} Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/OffsetPayloadReader.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/OffsetPayloadReader.java (revision 0) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/OffsetPayloadReader.java (revision 0) @@ -0,0 +1,63 @@ +package
org.apache.lucene.search.poshighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.index.Payload; +import org.apache.lucene.store.ByteArrayDataInput; + +/** + * Reads offsets from a payload. + * Each payload is decoded as two VInts, a start offset and a token length, and the token's offsets are set from them; the payload is dereferenced without a null check. + * @lucene.experimental + */ + +public class OffsetPayloadReader extends TokenFilter { + + private OffsetAttribute offsetAtt; + private PayloadAttribute payloadAtt; + private ByteArrayDataInput dataInput = new ByteArrayDataInput (); + + protected OffsetPayloadReader(TokenStream input) { + super(input); + offsetAtt = addAttribute(OffsetAttribute.class); + payloadAtt = addAttribute(PayloadAttribute.class); + } + + @Override + public boolean incrementToken() throws IOException { + if (! 
input.incrementToken()) + return false; + + Payload payload = payloadAtt.getPayload(); + dataInput.reset(payload.getData(), payload.getOffset(), payload.length()); + + int startOffset = dataInput.readVInt(); + int tokenSize = dataInput.readVInt(); + + offsetAtt.setOffset(startOffset, startOffset + tokenSize); + + return true; + } + +} Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/OffsetPayloadWriter.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/OffsetPayloadWriter.java (revision 0) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/OffsetPayloadWriter.java (revision 0) @@ -0,0 +1,68 @@ +package org.apache.lucene.search.poshighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.index.Payload; +import org.apache.lucene.store.ByteArrayDataOutput; + +/** + * Writes offsets to a payload. + * Each token's payload is set to two VInts: its start offset, then its length (end offset minus start offset). + * @lucene.experimental + */ + +public class OffsetPayloadWriter extends TokenFilter { + + private OffsetAttribute offsetAtt; + private PayloadAttribute payloadAtt; + private byte[] buf = new byte[10]; + private ByteArrayDataOutput dataOutput = new ByteArrayDataOutput(); + private Payload payload = new Payload (); + + public OffsetPayloadWriter(TokenStream input) { + super(input); + offsetAtt = addAttribute(OffsetAttribute.class); + payloadAtt = addAttribute(PayloadAttribute.class); + } + + @Override + public boolean incrementToken() throws IOException { + if (! input.incrementToken()) + return false; + + dataOutput.reset(buf); + + int startOffset = offsetAtt.startOffset(); + dataOutput.writeVInt(startOffset); + + int tokenSize = offsetAtt.endOffset() - startOffset; + dataOutput.writeVInt(tokenSize); + + payload.setData (buf, 0, dataOutput.getPosition()); + payloadAtt.setPayload(payload); + + return true; + } + +} Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/PosHighlighterTest.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/PosHighlighterTest.java (revision 1150180) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/PosHighlighterTest.java (working copy) @@ -1,11 +1,28 @@ package org.apache.lucene.search.poshighlight; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + import java.io.IOException; +import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; @@ -20,25 +37,20 @@ import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; -import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; -import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.TextFragment; import org.apache.lucene.search.positions.PositionFilterQuery; import org.apache.lucene.search.positions.TestBlockPositionsIterator.BlockPositionIteratorFilter; import org.apache.lucene.store.Directory; -import 
org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.LuceneTestCase; - +import org.apache.lucene.util.Version; +import org.junit.Ignore; /** - * TODO: - * Phrase and Span Queries - * positions callback API + * @lucene.experimental */ public class PosHighlighterTest extends LuceneTestCase { @@ -46,6 +58,7 @@ protected Analyzer analyzer; protected Directory dir; protected IndexSearcher searcher; + protected boolean useOffsetPayloads; private static final String PORRIDGE_VERSE = "Pease porridge hot! Pease porridge cold! Pease porridge in the pot nine days old! Some like it hot, some" @@ -54,8 +67,9 @@ @Override public void setUp() throws Exception { super.setUp(); - analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false); + analyzer = new WhitespaceAnalyzer(Version.LUCENE_40); dir = newDirectory(); + useOffsetPayloads = true; } @Override @@ -82,6 +96,13 @@ for( String value: values ) { Document doc = new Document(); Field f = new Field (F, value, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS); + TokenStream tokens = analyzer.tokenStream(F, new StringReader (value)); + if (useOffsetPayloads) { + OffsetPayloadWriter tokensWithOffsets = new OffsetPayloadWriter(tokens); + f.setTokenStream(tokensWithOffsets); + } else { + f.setTokenStream(tokens); + } doc.add (f); writer.addDocument( doc ); } @@ -94,57 +115,43 @@ return doSearch(q, 100); } - private class ConstantScorer implements org.apache.lucene.search.highlight.Scorer { - - @Override - public TokenStream init(TokenStream tokenStream) throws IOException { - return tokenStream; - } - - @Override - public void startFragment(TextFragment newFragment) { - } - - @Override - public float getTokenScore() { - return 1; - } - - @Override - public float getFragmentScore() { - return 1; - } - } - private String[] doSearch(Query q, int maxFragSize) throws IOException, InvalidTokenOffsetsException { return doSearch (q, maxFragSize, 0); } - private String[] doSearch(Query q, int 
maxFragSize, int docIndex) throws IOException, InvalidTokenOffsetsException { - // ConstantScorer is a fragment Scorer, not a search result (document) Scorer - Highlighter highlighter = new Highlighter (new ConstantScorer()); + + private String[] doSearch(Query q, int maxFragSize, int docid) throws IOException, InvalidTokenOffsetsException { + // PosScorer is a fragment Scorer, not a search result (document) Scorer + PosHighlighter highlighter = new PosHighlighter (); highlighter.setTextFragmenter(new SimpleFragmenter(maxFragSize)); - PosCollector collector = new PosCollector(10); - if (q instanceof MultiTermQuery) { - ((MultiTermQuery)q).setRewriteMethod (MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); - } - searcher.search(q, collector); - ScorePosDoc doc = collector.docs[docIndex]; - if (doc == null) + + String text = searcher.getIndexReader().document(docid).getFieldable(F).stringValue(); + TokenStream tstream = PosHighlighter.getPosTokenStream (q, docid, useOffsetPayloads ? null : F, searcher, text); + if (tstream == null) return null; - String text = searcher.getIndexReader().document(doc.doc).getFieldable(F).stringValue(); - PositionOffsetMapper pom = new PositionOffsetMapper (); - // FIXME: test error cases: for non-stored fields, and fields w/no term vectors - searcher.getIndexReader().getTermFreqVector(doc.doc, F, pom); - - TextFragment[] fragTexts = highlighter.getBestTextFragments(new PosTokenStream - (text, new PositionIntervalArrayIterator(doc.sortedPositions(), doc.posCount), pom), - text, false, 10); + TextFragment[] fragTexts = highlighter.getBestTextFragments (tstream, text, false, 10); String[] frags = new String[fragTexts.length]; for (int i = 0; i < frags.length; i++) frags[i] = fragTexts[i].toString(); return frags; } + /* + private String[] doSearchHighlighter(Query q, int maxFragSize, int docid) throws IOException, InvalidTokenOffsetsException { + // PosScorer is a fragment Scorer, not a search result (document) Scorer + Highlighter 
highlighter = new Highlighter (new QueryScorer(q)); + highlighter.setTextFragmenter(new SimpleFragmenter(maxFragSize)); + + String text = searcher.getIndexReader().document(docid).getFieldable(F).stringValue(); + TokenStream tstream = analyzer.reusableTokenStream(F, new StringReader(text)); + tstream.reset(); + TextFragment[] fragTexts = highlighter.getBestTextFragments (tstream, text, false, 10); + String[] frags = new String[fragTexts.length]; + for (int i = 0; i < frags.length; i++) + frags[i] = fragTexts[i].toString(); + return frags; + } + */ + public void testTerm () throws Exception { insertDocs(analyzer, "This is a test test"); String frags[] = doSearch (new TermQuery(new Term(F, "test"))); @@ -152,9 +159,9 @@ } public void testSeveralSnippets () throws Exception { - String input = "this is some long text. It has the word long in many places. In fact, it has long on some different fragments. " + + String input = "this is some long text. It has the word long in many places. " + "Let us see what happens to long in this case."; - String gold = "this is some long text. It has the word long in many places. In fact, it has long on some different fragments. " + + String gold = "this is some long text. It has the word long in many places. " + "Let us see what happens to long in this case."; insertDocs(analyzer, input); String frags[] = doSearch (new TermQuery(new Term(F, "long")), input.length()); @@ -231,10 +238,8 @@ // make sure we highlight the phrase, and not the terms outside the phrase assertEquals ("is it that this is a test, is it", frags[0]); } - - /* - * Failing ... PhraseQuery scorer needs positions()? 
- */ + + @Ignore("PhraseQuery doesn't yet have positions") public void testPhraseOriginal() throws Exception { insertDocs(analyzer, "This is a test"); PhraseQuery pq = new PhraseQuery(); @@ -260,8 +265,38 @@ insertDocs(analyzer, "This is a test"); String frags[] = doSearch (new WildcardQuery(new Term(F, "t*t"))); assertEquals ("This is a test", frags[0]); + + // generate too many clauses from rewrite + StringBuilder buf = new StringBuilder(); + for (int i = 0; i < 2000; i++) { + buf.append('A'); + buf.append(Integer.toString(i)); + buf.append("Z "); + } + insertDocs(analyzer, buf.toString()); + Exception ex = null; + try { + // NB: 'traditional' HL throws the same exception: + // frags = doSearchHighlighter(new WildcardQuery(new Term(F, "A*Z")), 50, 0); + frags = doSearch(new WildcardQuery(new Term(F, "A*Z")), 50, 0); + } catch (BooleanQuery.TooManyClauses e) { + ex = e; + } + assertNotNull (ex); } + @Ignore("rewriting complex queries not implemented yet") + public void testCompositeWildcard () throws Exception { + insertDocs(analyzer, "This is a test"); + + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause (new WildcardQuery(new Term(F, "T*s")), Occur.MUST)); + bq.add(new BooleanClause (new WildcardQuery(new Term(F, "t*t")), Occur.MUST)); + + String frags[] = doSearch (bq); + assertEquals ("This is a test", frags[0]); + } + public void testMultipleDocumentsAnd() throws Exception { insertDocs(analyzer, "This document has no matches", @@ -270,9 +305,9 @@ BooleanQuery bq = new BooleanQuery(); bq.add(new BooleanClause (new TermQuery(new Term(F, "Pease")), Occur.MUST)); bq.add(new BooleanClause (new TermQuery(new Term(F, "porridge")), Occur.MUST)); - String frags[] = doSearch (bq, 50, 0); + String frags[] = doSearch (bq, 50, 1); assertEquals ("Pease porridge hot! Pease porridge cold! 
Pease", frags[0]); - frags = doSearch (bq, 50, 1); + frags = doSearch (bq, 50, 2); assertEquals ("This document has some Pease porridge in it", frags[0]); } @@ -288,10 +323,48 @@ BooleanQuery bq = new BooleanQuery(); bq.add(new BooleanClause (new TermQuery(new Term(F, "Pease")), Occur.SHOULD)); bq.add(new BooleanClause (new TermQuery(new Term(F, "porridge")), Occur.SHOULD)); - String frags[] = doSearch (bq, 50, 0); + String frags[] = doSearch (bq, 50, 1); assertEquals ("Pease porridge hot! Pease porridge cold! Pease", frags[0]); - frags = doSearch (bq, 50, 1); + frags = doSearch (bq, 50, 2); assertEquals ("This document has some Pease porridge in it", frags[0]); } + + public void testLongishDocument() throws Exception { + StringBuilder buf = new StringBuilder (); + for (int i = 0 ; i < 1000; i++) + buf.append ("dummy "); + buf.append ("This document has some Pease porridge in it"); + insertDocs(analyzer, buf.toString()); + + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause (new TermQuery(new Term(F, "Pease")), Occur.SHOULD)); + bq.add(new BooleanClause (new TermQuery(new Term(F, "porridge")), Occur.SHOULD)); + String frags[] = doSearch (bq, 50, 0); + assertEquals ("y This document has some Pease porridge in it", frags[0]); + insertDocs(analyzer, buf.toString()); + for (int i = 0 ; i < 1000; i++) + buf.append (" dummy"); + insertDocs(analyzer, buf.toString()); + frags = doSearch (bq, 50, 0); + assertEquals ("y This document has some Pease porridge", frags[0]); + frags = doSearch (bq, 75, 0); + assertEquals ("y This document has some Pease porridge in it dummy dummy dummy ", frags[0]); + } + + public void testFragmentation() throws Exception { + StringBuilder buf = new StringBuilder (); + for (int i = 0 ; i < 1000; i++) { + buf.append (Integer.toString(i)); + buf.append(' '); + } + insertDocs(analyzer, buf.toString()); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause (new TermQuery(new Term(F, "10")), Occur.SHOULD)); + bq.add(new 
BooleanClause (new TermQuery(new Term(F, "500")), Occur.SHOULD)); + String[] frags = doSearch (bq, 75, 0); + assertEquals ("0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ", frags[0]); + assertEquals (" 494 495 496 497 498 499 500 501 502 503 504 505 506 ", frags[1]); + } + } Index: lucene/src/java/org/apache/lucene/search/IndexSearcher.java =================================================================== --- lucene/src/java/org/apache/lucene/search/IndexSearcher.java (revision 1150180) +++ lucene/src/java/org/apache/lucene/search/IndexSearcher.java (working copy) @@ -564,7 +564,7 @@ assert filter != null; - Scorer scorer = weight.scorer(context, ScorerContext.def()); + Scorer scorer = weight.scorer(context, ScorerContext.def().needsPositions(collector.needsPositions()).needsPayloads(collector.needsPayloads())); if (scorer == null) { return; } Index: lucene/src/java/org/apache/lucene/search/TermScorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/TermScorer.java (revision 1150180) +++ lucene/src/java/org/apache/lucene/search/TermScorer.java (working copy) @@ -211,7 +211,7 @@ } @Override - public void collect() { + public void collect() throws IOException { collector.collectLeafPosition(scorer, interval, docID); } Index: lucene/src/java/org/apache/lucene/search/positions/BlockPositionIterator.java =================================================================== --- lucene/src/java/org/apache/lucene/search/positions/BlockPositionIterator.java (revision 1150180) +++ lucene/src/java/org/apache/lucene/search/positions/BlockPositionIterator.java (working copy) @@ -119,7 +119,7 @@ } @Override - public void collect() { + public void collect() throws IOException { collector.collectComposite(scorer, interval, currentDoc); for (PositionIntervalIterator iter : iterators) { iter.collect(); Index: lucene/src/java/org/apache/lucene/search/positions/ConjunctionPositionIterator.java 
=================================================================== --- lucene/src/java/org/apache/lucene/search/positions/ConjunctionPositionIterator.java (revision 1150180) +++ lucene/src/java/org/apache/lucene/search/positions/ConjunctionPositionIterator.java (working copy) @@ -83,7 +83,7 @@ } @Override - public void collect() { + public void collect() throws IOException { collector.collectComposite(scorer, queue.queueInterval, currentDoc); for (PositionIntervalIterator iter : iterators) { iter.collect(); Index: lucene/src/java/org/apache/lucene/search/positions/DisjunctionPositionIterator.java =================================================================== --- lucene/src/java/org/apache/lucene/search/positions/DisjunctionPositionIterator.java (revision 1150180) +++ lucene/src/java/org/apache/lucene/search/positions/DisjunctionPositionIterator.java (working copy) @@ -68,7 +68,7 @@ } @Override - public void collect() { + public void collect() throws IOException { collector.collectComposite(scorer, queue.queueInterval, currentDoc); iterators[queue.top().index].collect(); } Index: lucene/src/java/org/apache/lucene/search/positions/IntervalQueueOr.java =================================================================== --- lucene/src/java/org/apache/lucene/search/positions/IntervalQueueOr.java (revision 1150180) +++ lucene/src/java/org/apache/lucene/search/positions/IntervalQueueOr.java (working copy) @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +import org.apache.lucene.search.positions.PositionIntervalIterator.PositionCollector; import org.apache.lucene.search.positions.PositionIntervalIterator.PositionInterval; /** * Index: lucene/src/java/org/apache/lucene/search/positions/OrderedConjunctionPositionIterator.java =================================================================== --- lucene/src/java/org/apache/lucene/search/positions/OrderedConjunctionPositionIterator.java (revision 1150180) +++ lucene/src/java/org/apache/lucene/search/positions/OrderedConjunctionPositionIterator.java (working copy) @@ -90,7 +90,7 @@ } @Override - public void collect() { + public void collect() throws IOException { collector.collectComposite(scorer, interval, currentDoc); for (PositionIntervalIterator iter : iterators) { iter.collect(); Index: lucene/src/java/org/apache/lucene/search/positions/PositionFilterQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/positions/PositionFilterQuery.java (revision 1150180) +++ lucene/src/java/org/apache/lucene/search/positions/PositionFilterQuery.java (working copy) @@ -195,7 +195,7 @@ } @Override - public void collect() { + public void collect() throws IOException { other.collect(); } Index: lucene/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java =================================================================== --- lucene/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java (revision 1149428) +++ lucene/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java (working copy) @@ -1,135 +0,0 @@ -package org.apache.lucene.search.positions; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -import java.io.IOException; - -import org.apache.lucene.search.Scorer; -import org.apache.lucene.util.BytesRef; - -/** - * - * @lucene.experimental - */ // nocommit - javadoc -public abstract class PositionIntervalIterator { - - public static final PositionIntervalIterator[] EMPTY = new PositionIntervalIterator[0]; - public static final int NO_MORE_DOCS = Integer.MAX_VALUE; - public static final PositionCollector EMPTY_COLLECTOR = new PositionCollector() { - - @Override - public void collectLeafPosition(Scorer scorer, PositionInterval interval, - int docID) { - } - - @Override - public void collectComposite(Scorer scorer, PositionInterval interval, - int docID) { - } - - }; - - protected int currentDoc = -1; - protected final Scorer scorer; - protected PositionCollector collector = EMPTY_COLLECTOR; - - public PositionIntervalIterator(Scorer scorer) { - this.scorer = scorer; - } - - public abstract int advanceTo(int docId) throws IOException; - - public abstract PositionInterval next() throws IOException; - - public void setPositionCollector(PositionCollector collector) { - if (collector == null) { - throw new IllegalArgumentException("PositionCollector must not be null"); - } - this.collector = collector; - PositionIntervalIterator[] subs = subs(false); - for (PositionIntervalIterator positionIntervalIterator : subs) { - positionIntervalIterator.setPositionCollector(collector); - } - 
} - - - public abstract void collect(); - - public abstract PositionIntervalIterator[] subs(boolean inOrder); - - public int docID() { - return currentDoc; - } - - public Scorer getScorer() { - return scorer; - } - - public static interface PositionIntervalFilter { - public abstract PositionIntervalIterator filter( - PositionIntervalIterator iter); - } - - public static class PositionInterval implements Cloneable { - - public int begin; - public int end; - - public PositionInterval(int begin, int end) { - this.begin = begin; - this.end = end; - } - - public PositionInterval() { - this(0, 0); - } - - public boolean nextPayload(BytesRef ref) throws IOException { - return false; - } - - public boolean payloadAvailable() { - return false; - } - - public void reset() { - begin = end = -1; - } - - @Override - public Object clone() { - try { - return super.clone(); - } catch (CloneNotSupportedException e) { - throw new RuntimeException(); // should not happen - } - } - - @Override - public String toString() { - return "PositionInterval [begin=" + begin + ", end=" + end + "]"; - } - - } - - public static interface PositionCollector { - public void collectLeafPosition(Scorer scorer, PositionInterval interval, int docID); - public void collectComposite(Scorer scorer, PositionInterval interval, int docID); - - } - -} Index: lucene/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java =================================================================== --- lucene/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java (revision 0) +++ lucene/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java (revision 1149428) @@ -0,0 +1,135 @@ +package org.apache.lucene.search.positions; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; + +import org.apache.lucene.search.Scorer; +import org.apache.lucene.util.BytesRef; + +/** + * Base class for iterators that produce {@link PositionInterval}s for a {@link Scorer} and can report them to a {@link PositionCollector} via {@link #collect()}. + * @lucene.experimental + */ // nocommit - javadoc +public abstract class PositionIntervalIterator { + + public static final PositionIntervalIterator[] EMPTY = new PositionIntervalIterator[0]; + public static final int NO_MORE_DOCS = Integer.MAX_VALUE; + public static final PositionCollector EMPTY_COLLECTOR = new PositionCollector() { + + @Override + public void collectLeafPosition(Scorer scorer, PositionInterval interval, + int docID) { + } + + @Override + public void collectComposite(Scorer scorer, PositionInterval interval, + int docID) { + } + + }; + + protected int currentDoc = -1; + protected final Scorer scorer; + protected PositionCollector collector = EMPTY_COLLECTOR; + + public PositionIntervalIterator(Scorer scorer) { + this.scorer = scorer; + } + + public abstract int advanceTo(int docId) throws IOException; + + public abstract PositionInterval next() throws IOException; + + public void setPositionCollector(PositionCollector collector) { + if (collector == null) { + throw new IllegalArgumentException("PositionCollector must not be null"); + } + this.collector = collector; + PositionIntervalIterator[] subs = subs(false); + for (PositionIntervalIterator positionIntervalIterator : subs) { + positionIntervalIterator.setPositionCollector(collector); + } +
} + + + public abstract void collect() throws IOException; + + public abstract PositionIntervalIterator[] subs(boolean inOrder); + + public int docID() { + return currentDoc; + } + + public Scorer getScorer() { + return scorer; + } + + public static interface PositionIntervalFilter { + public abstract PositionIntervalIterator filter( + PositionIntervalIterator iter); + } + + public static class PositionInterval implements Cloneable { + + public int begin; + public int end; + + public PositionInterval(int begin, int end) { + this.begin = begin; + this.end = end; + } + + public PositionInterval() { + this(0, 0); + } + + public boolean nextPayload(BytesRef ref) throws IOException { + return false; + } + + public boolean payloadAvailable() { + return false; + } + + public void reset() { + begin = end = -1; + } + + @Override + public Object clone() { + try { + return super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); // should not happen: PositionInterval implements Cloneable + } + } + + @Override + public String toString() { + return "PositionInterval [begin=" + begin + ", end=" + end + "]"; + } + + } + + public static interface PositionCollector { + public void collectLeafPosition(Scorer scorer, PositionInterval interval, int docID) throws IOException; + public void collectComposite(Scorer scorer, PositionInterval interval, int docID) throws IOException; + + } + +} Index: lucene/src/java/org/apache/lucene/search/positions/RangePositionsIterator.java =================================================================== --- lucene/src/java/org/apache/lucene/search/positions/RangePositionsIterator.java (revision 1150180) +++ lucene/src/java/org/apache/lucene/search/positions/RangePositionsIterator.java (working copy) @@ -64,7 +64,7 @@ } @Override - public void collect() { + public void collect() throws IOException { collector.collectComposite(null, interval, iterator.docID()); iterator.collect(); } Index:
lucene/src/java/org/apache/lucene/search/positions/WithinPositionIterator.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/positions/WithinPositionIterator.java (revision 1150180)
+++ lucene/src/java/org/apache/lucene/search/positions/WithinPositionIterator.java (working copy)
@@ -57,7 +57,7 @@
   }
 
   @Override
-  public void collect() {
+  public void collect() throws IOException {
     collector.collectComposite(null, interval, iterator.docID());
     iterator.collect();
   }