Index: java/org/apache/lucene/search/DisjunctionDISI.java
===================================================================
--- java/org/apache/lucene/search/DisjunctionDISI.java (revision 0)
+++ java/org/apache/lucene/search/DisjunctionDISI.java (revision 0)
@@ -0,0 +1,203 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.List;
+import java.util.Iterator;
+import java.io.IOException;
+
+import org.apache.lucene.util.DISIQueue;
+
+/**
+ * Iterates over the documents on which at least a minimum number of
+ * sub-iterators agree; with a minimum of one this is their plain union.
+ */
+class DisjunctionDISI extends DocIdSetIterator {
+  /** How many sub-iterators were passed to the constructor. */
+  private final int nrScorers;
+
+  /** The sub-iterators; every element is cast to DocIdSetIterator when used. */
+  protected final List subScorers;
+
+  /** How many sub-iterators must be on a document for it to match. */
+  private final int minimumNrMatchers;
+
+  /**
+   * Heap of the sub-iterators that still have documents, smallest doc() on top.
+   * <br>Created lazily by the first next() or skipTo() call.
+   * <br>A sub-iterator is dropped as soon as it reports no further document.
+   * <br>Once fewer than <code>minimumNrMatchers</code> sub-iterators remain,
+   * next() and skipTo() return false.
+   * <p>
+   * After a successful next() or skipTo(), <code>nrMatchers</code> holds the
+   * number of sub-iterators that were on the match, and every sub-iterator
+   * left in the heap is positioned after it.
+   */
+  private DISIQueue disiQueue = null;
+
+  /** Document number of the current match; initially -1. */
+  private int currentDoc = -1;
+
+  /** Number of sub-iterators that provide the current match; initially -1. */
+  protected int nrMatchers = -1;
+
+  /**
+   * Creates a disjunction over the given sub-iterators.
+   * @param subScorers A collection of at least two DocIdSetIterators.
+   * @param minimumNrMatchers The positive number of sub-iterators that must be
+   * on a document for it to be reported.
+   * <br>When this is bigger than the number of sub-iterators, no matches
+   * will be produced.
+   * <br>When it equals the number of sub-iterators, a conjunction is
+   * presumably the more efficient choice (note carried over from the original).
+   * @throws IllegalArgumentException if <code>minimumNrMatchers</code> is not
+   * positive, or if fewer than two sub-iterators are given.
+   * @throws IOException not thrown by this constructor's own code.
+   */
+  public DisjunctionDISI( List subScorers, int minimumNrMatchers) throws IOException {
+    final int count = subScorers.size();
+    if (minimumNrMatchers <= 0) {
+      throw new IllegalArgumentException("Minimum nr of matchers must be positive");
+    }
+    if (count <= 1) {
+      throw new IllegalArgumentException("There must be at least 2 subScorers");
+    }
+
+    this.nrScorers = count;
+    this.subScorers = subScorers;
+    this.minimumNrMatchers = minimumNrMatchers;
+  }
+
+  /**
+   * Creates a disjunction that reports every document at least one
+   * sub-iterator is on (the minimum number of matchers is one).
+   * @throws IOException not thrown by this constructor's own code.
+   */
+  public DisjunctionDISI(List subScorers) throws IOException {
+    this(subScorers, 1);
+  }
+
+  /**
+   * Builds <code>disiQueue</code>: every sub-iterator is advanced once and
+   * queued only if that first next() succeeds.
+   */
+  private void initDISIQueue() throws IOException {
+    disiQueue = new DISIQueue(nrScorers);
+    for (Iterator it = subScorers.iterator(); it.hasNext(); ) {
+      DocIdSetIterator disi = (DocIdSetIterator) it.next();
+      if (disi.next()) { // the queue orders by doc(), so position it first
+        disiQueue.insert(disi);
+      }
+    }
+  }
+
+  /**
+   * Moves to the next matching document.
+   * @return true iff there is such a document.
+   */
+  public boolean next() throws IOException {
+    if (disiQueue == null) {
+      initDISIQueue();
+    }
+    if (disiQueue.size() < minimumNrMatchers) {
+      return false;
+    }
+    return advanceAfterCurrent();
+  }
+
+  /**
+   * Takes the document at the top of <code>disiQueue</code> as candidate and
+   * advances every sub-iterator that is on it, counting them in
+   * <code>nrMatchers</code>. Repeats with the next candidate until one has
+   * at least <code>minimumNrMatchers</code> sub-iterators on it.
+   * <br>Callers enter with at least <code>minimumNrMatchers</code>
+   * sub-iterators in the queue. At least the top one will be advanced.
+   * @return true iff there is a match.
+   * <br>In that case <code>currentDoc</code> and <code>nrMatchers</code>
+   * describe it.
+   *
+   * @todo Investigate whether it is possible to use skipTo() when
+   * the minimum number of matchers is bigger than one, ie. try and use the
+   * character of a conjunction for the minimum number of matchers.
+   * <br>For this, a DocIdSetIterator array with minimumNrMatchers elements might
+   * hold sub-iterators at currentDoc that are temporarily popped from the queue.
+   */
+  protected boolean advanceAfterCurrent() throws IOException {
+    while (true) { // one pass per candidate document
+      currentDoc = disiQueue.topDoc();
+      nrMatchers = 1;
+      while (true) { // move everything that sits on currentDoc past it
+        boolean topHasDoc = disiQueue.topNextAndAdjustElsePop();
+        if (!topHasDoc && disiQueue.size() == 0) {
+          break; // queue drained; currentDoc may still be the last match
+        }
+        if (disiQueue.topDoc() != currentDoc) {
+          break; // every remaining sub-iterator is after currentDoc
+        }
+        nrMatchers++;
+      }
+
+      if (nrMatchers >= minimumNrMatchers) {
+        return true;
+      }
+      if (disiQueue.size() < minimumNrMatchers) {
+        return false;
+      }
+    }
+  }
+
+  /** Returns the current document number; initially -1. */
+  public int doc() {
+    return currentDoc;
+  }
+
+  /**
+   * Returns the number of sub-iterators matching the current document.
+   * Initially invalid, until {@link #next()} is called the first time.
+   */
+  public int nrMatchers() {
+    return nrMatchers;
+  }
+
+  /**
+   * Skips to a match whose document number is greater than or equal to
+   * <code>target</code>, using skipTo() on the sub-iterators.
+   * <br>If enough sub-iterators remain and <code>target</code> is not beyond
+   * the current document, this returns true without moving.
+   * @param target The target document number.
+   * @return true iff there is such a match.
+   */
+  public boolean skipTo(int target) throws IOException {
+    if (disiQueue == null) {
+      initDISIQueue();
+    }
+    if (disiQueue.size() < minimumNrMatchers) {
+      return false;
+    }
+    if (target <= currentDoc) {
+      return true;
+    }
+    while (disiQueue.topDoc() < target) {
+      boolean topHasDoc = disiQueue.topSkipToAndAdjustElsePop(target);
+      if (!topHasDoc && disiQueue.size() < minimumNrMatchers) {
+        return false;
+      }
+    }
+    return advanceAfterCurrent();
+  }
+}
Index: java/org/apache/lucene/util/DISIQueue.java
===================================================================
--- java/org/apache/lucene/util/DISIQueue.java (revision 0)
+++ java/org/apache/lucene/util/DISIQueue.java (revision 0)
@@ -0,0 +1,195 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Derived from org.apache.lucene.util.PriorityQueue of March 2005 */
+
+import java.io.IOException;
+
+import org.apache.lucene.search.DocIdSetIterator;
+
+/** A DISIQueue maintains a partial ordering of its DISIs such that the
+  least DISI can always be found in constant time. Put()'s and pop()'s
+  require log(size) time. The ordering is by <code>DISI.doc()</code>.
+ */
+public class DISIQueue {
+  private final DocIdSetIterator[] heap; // binary heap rooted at heap[1]; heap[0] stays unused
+  private final int maxSize;
+  private int size;
+
+  private DocIdSetIterator topDISI; // same as heap[1], only for speed
+
+  /** Create a DISIQueue with a maximum size. */
+  public DISIQueue(int maxSize) {
+    // assert maxSize >= 0;
+    size = 0;
+    int heapSize = maxSize + 1;
+    heap = new DocIdSetIterator[heapSize];
+    this.maxSize = maxSize;
+    topDISI = null; // empty queue; reading heap[1] here would fail for maxSize == 0
+  }
+
+  /**
+   * Adds a DocIdSetIterator to a DISIQueue in log(size) time.
+   * If one tries to add more DISIs than maxSize
+   * a RuntimeException (ArrayIndexOutOfBound) is thrown.
+   */
+  public final void put(DocIdSetIterator disi) {
+    size++;
+    heap[size] = disi;
+    upHeap();
+  }
+
+  /**
+   * Adds a DocIdSetIterator to the DISIQueue in log(size) time if either the
+   * DISIQueue is not full, or disi.doc() is not less than the doc() of the top.
+   * @param disi the iterator to add; its doc() is read to place it in the heap.
+   * @return true if disi is added, false otherwise.
+   */
+  public boolean insert(DocIdSetIterator disi){
+    if (size < maxSize) {
+      put(disi);
+      return true;
+    } else {
+      // full: only an iterator that is not before the current top may replace it
+      if ((size > 0) && (! (disi.doc() < topDISI.doc()))) { // heap[1] is top()
+        heap[1] = disi;
+        downHeap();
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
+  /** Returns the least DocIdSetIterator of the DISIQueue in constant time.
+   * Should not be used when the queue is empty.
+   */
+  public final DocIdSetIterator top() {
+    // assert size > 0;
+    return topDISI;
+  }
+
+  /** Returns document number of the least DocIdSetIterator of the DISIQueue
+   * in constant time.
+   * Should not be used when the queue is empty.
+   */
+  public final int topDoc() {
+    // assert size > 0;
+    return topDISI.doc();
+  }
+
+  /** Calls next() on the top; re-orders the heap if it returns true, else removes the top.
+   * @return the result of that next() call. */
+  public final boolean topNextAndAdjustElsePop() throws IOException {
+    return checkAdjustElsePop( topDISI.next());
+  }
+
+  /** Calls skipTo(target) on the top; re-orders the heap if it returns true, else removes the top.
+   * @return the result of that skipTo() call. */
+  public final boolean topSkipToAndAdjustElsePop(int target) throws IOException {
+    return checkAdjustElsePop( topDISI.skipTo(target));
+  }
+
+  /** Re-orders the heap when cond is true, removes the top when it is false. */
+  private boolean checkAdjustElsePop(boolean cond) {
+    if (cond) {
+      downHeap();    // top moved to another doc: sift it down
+    } else {
+      popNoResult(); // top is exhausted: drop it, which also re-orders the heap
+    }
+    return cond;
+  }
+
+  /** Removes and returns the least DISI of the DISIQueue in log(size)
+   * time.
+   * Should not be used when the queue is empty.
+   */
+  public final DocIdSetIterator pop() {
+    // assert size > 0;
+    DocIdSetIterator result = topDISI;
+    popNoResult();
+    return result;
+  }
+
+  /** Removes the least DISI of the DISIQueue in log(size) time.
+   * Should not be used when the queue is empty.
+   */
+  private final void popNoResult() {
+    heap[1] = heap[size]; // move last to first
+    heap[size] = null;
+    size--;
+    downHeap(); // adjust heap
+  }
+
+  /** Returns the number of DISIs currently stored in the DISIQueue. */
+  public final int size() {
+    return size;
+  }
+
+  /** Removes all entries from the DISIQueue. */
+  public final void clear() {
+    for (int i = 0; i <= size; i++) {
+      heap[i] = null;
+    }
+    size = 0;
+    topDISI = null; // the cached top must not outlive the entry it mirrors
+  }
+
+  private final void upHeap() {
+    int i = size;
+    DocIdSetIterator node = heap[i]; // save bottom node
+    int j = i >>> 1;
+    while ((j > 0) && (node.doc() < heap[j].doc())) {
+      heap[i] = heap[j]; // shift parents down
+      i = j;
+      j = j >>> 1;
+    }
+    heap[i] = node; // install saved node
+    topDISI = heap[1];
+  }
+
+  /** Restores heap order after the doc() of the top changed or the top was
+   * replaced: sifts heap[1] down below every child with a smaller doc() and
+   * refreshes the cached top. Log(size) in the worst case.
+   */
+  private final void downHeap() {
+    int i = 1;
+    int j = 2; //i << 1; // find smaller child
+    int k = 3; //j + 1;
+
+    final DocIdSetIterator node = heap[i]; // save top node
+
+    if ((k <= size) && (heap[k].doc() < heap[j].doc())) {
+      j = k;
+    }
+
+    while ((j <= size) && (heap[j].doc() < node.doc())) {
+      heap[i] = heap[j]; // shift up child
+      i = j;
+      j = i << 1;
+      k = j + 1;
+      if (k <= size && (heap[k].doc() < heap[j].doc())) {
+        j = k;
+      }
+    }
+
+    heap[i] = node; // install saved node
+    topDISI = heap[1];
+  }
+}
Index: test/org/apache/lucene/search/TestDisjunctionDISI.java
===================================================================
--- test/org/apache/lucene/search/TestDisjunctionDISI.java (revision 0)
+++ test/org/apache/lucene/search/TestDisjunctionDISI.java (revision 0)
@@ -0,0 +1,118 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+import java.util.Random;
+
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.DocIdBitSet;
+/** Checks DisjunctionDISI on random bit sets against BitSet.or() and BitSet.and(). */
+public class TestDisjunctionDISI extends TestCase {
+  /** Main for running test case by itself. */
+  public static void main(String args[]) {
+    TestRunner.run(new TestSuite(TestDisjunctionDISI.class));
+  }
+
+  static Random rand = new Random();
+  static final int MAX_BIT_SET_SIZE = 10;
+
+  public void testDisjunction() throws IOException{
+    for(int iter = 0; iter < 1000 ; iter++){
+      BitSet bs1 = generateBitSet(rand.nextInt(MAX_BIT_SET_SIZE));
+      BitSet bs2 = generateBitSet(rand.nextInt(MAX_BIT_SET_SIZE));
+
+      //Test minimum Nr should match == 1
+      BitSet result = new BitSet();
+      result.or(bs1); result.or(bs2);
+
+      List disis = new ArrayList();
+      disis.add((new DocIdBitSet(bs1)).iterator());
+      disis.add((new DocIdBitSet(bs2)).iterator());
+
+      DisjunctionDISI disjunction = new DisjunctionDISI(disis,1);
+
+      assertTrue(disjunction.doc()==-1);
+      assertTrue(validateViaNext(result, disjunction));
+
+      disis.clear();//no restart()...
+      disis.add((new DocIdBitSet(bs1)).iterator());
+      disis.add((new DocIdBitSet(bs2)).iterator());
+      disjunction = new DisjunctionDISI(disis,1);//no restart()...
+      assertTrue(disjunction.doc()==-1);
+      assertTrue(validateViaSkipTo(result, disjunction));
+
+
+      //Test minimumNrShouldMatch(2);
+      result = new BitSet();
+      result.or(bs1); result.and(bs2);
+
+      disis.clear();//no restart()...
+      disis.add((new DocIdBitSet(bs1)).iterator());
+      disis.add((new DocIdBitSet(bs2)).iterator());
+      disjunction = new DisjunctionDISI(disis,2);
+      assertTrue(disjunction.doc()==-1);
+      assertTrue(validateViaNext(result, disjunction));
+
+      disis.clear();//no restart()...
+      disis.add((new DocIdBitSet(bs1)).iterator());
+      disis.add((new DocIdBitSet(bs2)).iterator());
+      disjunction = new DisjunctionDISI(disis,2);
+      assertTrue(disjunction.doc()==-1);
+      assertTrue(validateViaSkipTo(result, disjunction));
+
+    }
+  }
+
+  /** Returns a BitSet holding a random selection of the bits below maxSize. */
+  private BitSet generateBitSet(int maxSize){
+    BitSet bs = new BitSet();
+    for(int i =0; i < maxSize; i++){
+      if(rand.nextBoolean()) bs.set(i); // NOTE(review): loop body was lost in the source text; reconstructed, confirm
+    }
+    return bs;
+  }
+/** True iff next() yields exactly the set bits of bs, in order, and then ends. */
+private boolean validateViaNext(BitSet bs, DocIdSetIterator disi) throws IOException{
+
+  for (int i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i+1)) {
+    if(!disi.next()) return false;
+    if(disi.doc() !=i ) return false;
+  }
+
+  return !disi.next(); // a match beyond the expected set is a failure too
+}
+/** True iff skipTo() lands on every set bit of bs, in order, and nothing follows. */
+private boolean validateViaSkipTo(BitSet bs, DocIdSetIterator disi) throws IOException{
+
+  for (int i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i+1)) {
+    if(!disi.skipTo(i)) return false;
+    if(disi.doc() !=i ) return false;
+  }
+
+  return !disi.next(); // a match beyond the expected set is a failure too
+}
+
+}