Index: src/java/org/apache/lucene/search/FieldComparator.java =================================================================== --- src/java/org/apache/lucene/search/FieldComparator.java (revision 763737) +++ src/java/org/apache/lucene/search/FieldComparator.java (working copy) @@ -20,6 +20,7 @@ import java.io.IOException; import java.text.Collator; import java.util.Locale; +import java.util.Arrays; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.ExtendedFieldCache.DoubleParser; @@ -335,10 +336,18 @@ private LongParser parser; private long bottom; - LongComparator(int numHits, String field, FieldCache.Parser parser) { + LongComparator(int numHits, boolean reversed, String field, FieldCache.Parser parser) { values = new long[numHits]; this.field = field; this.parser = (LongParser) parser; + final long v; + if (reversed) { + v = Long.MIN_VALUE; + } else { + v = Long.MAX_VALUE; + } + Arrays.fill(values, v); + bottom = v; } public int compare(int slot1, int slot2) { @@ -585,6 +594,14 @@ this.sortPos = sortPos; this.reversed = reversed; this.field = field; + if (reversed) { + Arrays.fill(ords, -1); + bottomOrd = -1; + } else { + Arrays.fill(ords, Integer.MAX_VALUE); + bottomOrd = Integer.MAX_VALUE; + } + //bottomSlot = 0; } public int compare(int slot1, int slot2) { Index: src/java/org/apache/lucene/search/SortField.java =================================================================== --- src/java/org/apache/lucene/search/SortField.java (revision 763737) +++ src/java/org/apache/lucene/search/SortField.java (working copy) @@ -444,7 +444,8 @@ * @param reversed True if the SortField is reversed * @return {@link FieldComparator} to use when sorting */ - protected FieldComparator getComparator(final IndexReader[] subReaders, final int numHits, final int sortPos, final boolean reversed) throws IOException { + // nocommit + public FieldComparator getComparator(final IndexReader[] subReaders, final int numHits, final int sortPos, final boolean 
reversed) throws IOException { if (locale != null) { // TODO: it'd be nice to allow FieldCache.getStringIndex @@ -467,7 +468,7 @@ return new FieldComparator.FloatComparator(numHits, field, parser); case SortField.LONG: - return new FieldComparator.LongComparator(numHits, field, parser); + return new FieldComparator.LongComparator(numHits, reversed, field, parser); case SortField.DOUBLE: return new FieldComparator.DoubleComparator(numHits, field, parser); Index: src/java/org/apache/lucene/search/IndexSearcher.java =================================================================== --- src/java/org/apache/lucene/search/IndexSearcher.java (revision 763737) +++ src/java/org/apache/lucene/search/IndexSearcher.java (working copy) @@ -43,6 +43,15 @@ private IndexReader[] sortedSubReaders; private int[] sortedStarts; + // nocommit + public IndexReader[] getSortedSubReaders() { + return sortedSubReaders; + } + // nocommit + public int[] getDocBases() { + return sortedStarts; + } + /** Creates a searcher searching the index in the named directory. 
* @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error @@ -95,7 +104,8 @@ static private final IndexReader[] indexReaderZeroArray = new IndexReader[0]; protected void sortSubReaders(boolean docsInOrder) { - + // nocommit + docsInOrder = true; List subReadersList = new ArrayList(); gatherSubReaders(subReadersList, reader); sortedSubReaders = (IndexReader[]) subReadersList.toArray(indexReaderZeroArray); Index: src/java/org/apache/lucene/index/SegmentReader.java =================================================================== --- src/java/org/apache/lucene/index/SegmentReader.java (revision 763737) +++ src/java/org/apache/lucene/index/SegmentReader.java (working copy) @@ -41,7 +41,8 @@ /** * @version $Id$ */ -class SegmentReader extends DirectoryIndexReader { +// nocommit +public class SegmentReader extends DirectoryIndexReader { private String segment; private SegmentInfo si; private int readBufferSize; @@ -569,6 +570,10 @@ } } } + + public BitVector getDeletedDocs() { + return deletedDocs; + } private void loadDeletedDocs() throws IOException { // NOTE: the bitvector is stored using the regular directory, not cfs Index: src/java/org/apache/lucene/index/SegmentTermDocs.java =================================================================== --- src/java/org/apache/lucene/index/SegmentTermDocs.java (revision 763737) +++ src/java/org/apache/lucene/index/SegmentTermDocs.java (working copy) @@ -21,7 +21,8 @@ import org.apache.lucene.util.BitVector; import org.apache.lucene.store.IndexInput; -class SegmentTermDocs implements TermDocs { +// nocommit +public class SegmentTermDocs implements TermDocs { protected SegmentReader parent; protected IndexInput freqStream; protected int count; @@ -93,6 +94,15 @@ } } + // nocommit + public IndexInput getFreqStream() { + return freqStream; + } + // nocommit + public int getTermFreq() { + return df; + } + public void close() throws IOException { freqStream.close(); if 
(skipListReader != null) Index: contrib/benchmark/gen.py =================================================================== --- contrib/benchmark/gen.py (revision 0) +++ contrib/benchmark/gen.py (revision 0) @@ -0,0 +1,961 @@ +import sys +import os + +# TODO +# - support sparse filters (as iterator, boolean query clause?) +# - get reversed working +# - cleanup +# - merge in all stop criteria, eg ++count > aLimit +# - for scoredoc we don't need to re-add docBase -- just init doc=0 +# - hmm: will we "flood" the instruction cache when many threads running different specialized code? +# - for real integration +# - need to "know" that filter will give us an openbitset, somehow? +# - tweak order of ifs +# - make numHit bind @ runtime +# - allow deletes/no per segment +# - TODO: figure out if side-by-side arrays are slower, or +# pointer-to-obj is slower, for the queue +# - score doc & fields +# - assert topN > 0 +# - downHeap could be optimized -- that initial "if k < 10" +# - for TopScoreDoc collection, try to re-coalesce multiple if statements (it got slower!!)
+ +class Writer: + + def __init__(self): + self.indent = 0 + self.l = [] + self.vars = {} + + def __call__(self, s): + if s.find('}') != -1: + self.indent -= 1 + c = 0 + while c < len(s)-1 and s[c] == ' ': + c += 1 + self.l.append(' '*(2*self.indent-c) + s) + if s.find('{') != -1: + self.indent += 1 + + def __str__(self): + return '\n'.join(self.l) + + def getVar(self, prefix): + upto = 0 + while True: + v = '%s%s' % (prefix, upto) + if v not in self.vars: + self.vars[v] = True + return v + upto += 1 + + def getID(self): + # TODO + return 'a' + + def releaseVar(self, v): + del self.vars[v] + +class PQCollector: + + doScores = True + doMaxScore = True + valueType = None + + def upperValueType(self): + return self.valueType[0].upper() + self.valueType[1:] + + def __init__(self, w, topN, doScore='both'): + self.w = w + self.topN = topN + if doScore == 'both': + self.doScores = True + self.doMaxScore = True + elif doScore == 'track': + self.doScores = True + self.doMaxScore = False + else: + self.doScores = False + self.doMaxScore = False + + def writeTopClass(self): + pass + + def topInit(self): + self.w('int hitCount = 0;') + if self.__class__ != ScoreDocCollector: + self.w('final SortField sortField = sort.getSort()[0];') + self.w('final String sortFieldID = sortField.getField();') + if self.doScores and self.__class__ != ScoreDocCollector and self.doMaxScore: + self.w('float maxScore = Float.NEGATIVE_INFINITY;') + + def collectOne(self, scoreVar): + w = self.w + w('hitCount++;') + + if DEBUG: + w('System.out.println("doc=" + (doc+docBase) + " score=" + %s);' % scoreVar) + + if 0: + # Queue is not full -- insert & upheap + w('if (!queueFull) {') + w(' queueFull = (hitCount==%d);' % self.topN) + w(' final int fullDoc = doc + docBase;') + + self.copyCurrent('hitCount') + + w('') + w(' // Upheap') + + iVar = w.getVar('i') + w(' int %s = hitCount;' % iVar) + + jVar = w.getVar('j') + w(' int %s = %s >>> 1;' % (jVar, iVar)) + + w(' while(%s > 0) {' % jVar) + 
self.lessThanCurrentBreak(jVar) + self.copy(jVar, iVar) + w(' %s = %s;' % (iVar, jVar)) + w(' %s = %s >>> 1;' % (jVar, jVar)) + w(' }') + w.releaseVar(jVar) + self.install(iVar) + + w.releaseVar(iVar) + self.endInsert() + w('') + + # Queue is full -- check if hit competes + w('} else {') + + self.writeCompetes() + + w('') + w(' final int fullDoc = doc + docBase;') + self.copyCurrentToBottom() + + w('') + w(' // Downheap') + + iVar = w.getVar('i') + w(' int %s = 1;' % iVar) + + jVar = w.getVar('j') + w(' int %s = %s << 1;' % (jVar, iVar)) + + kVar = w.getVar('k') + w(' int %s = %s+1;' % (kVar, jVar)) + w(' if (%s <= %d) {' % (kVar, self.topN)) + self.lessThan(kVar, jVar, 'lt') + w(' if (lt) {') + w(' %s = %s;' % (jVar, kVar)) + w(' }') + w(' }') + w(' while(%s <= %d) {' % (jVar, self.topN)) + self.currentGreaterThanBreak(jVar) + self.copy(jVar, iVar) + w(' %s = %s;' % (iVar, jVar)) + w(' %s = %s << 1;' % (jVar, iVar)) + w(' %s = %s+1;' % (kVar, jVar)) + w(' if (%s <= %d) {' % (kVar, self.topN)) + self.lessThan(kVar, jVar, 'lt') + w(' if (lt) {') + w(' %s = %s;' % (jVar, kVar)) + w(' }') + w(' }') + w(' }') + w.releaseVar(kVar) + w.releaseVar(jVar) + + self.installBottom(iVar) + w.releaseVar(iVar) + + self.endInsert() + #w(' }') + + def createResults(self, v, c): + w = self.w + + w('') + if self.__class__ == ScoreDocCollector: + self.w('final float maxScore = queueScores[1];') + + w('// Build results -- sort pqueue entries') + w('final SorterTemplate sorter = new SorterTemplate() {') + w(' protected int compare(int i, int j) {') + self.lessThan('i', 'j', 'lt') + w(' if (lt) {') + w(' return 1;') + w(' } else {') + w(' // pq entries are never equal') + w(' return -1;') + w(' }') + w(' }') + w(' protected void swap(int i, int j) {') + self.swap('i', 'j') + w(' }') + w('};') + + w('// Extract results') + w('final int numHits = hitCount > %d ? 
%d : hitCount;' % (self.topN, self.topN)) + w('sorter.quickSort(1, numHits);') + w('final ScoreDoc[] hits = new ScoreDoc[numHits];') + w('for(int i=0;i<%s;i++) {' % self.topN) + self.createScoreDoc('hits[i]', 'i') + w('}') + + if self.doMaxScore: + x = 'maxScore' + else: + x = 'Float.NaN' + if str(c).find('OneField') != -1: + w('final TopDocs results = new TopFieldDocs(hitCount, hits, sort.getSort(), %s);' % x) + else: + w('final TopDocs results = new TopDocs(hitCount, hits, %s);' % x) + +class ScoreDocCollector(PQCollector): + + def createScoreDoc(self, dest, src): + self.w('%s = new ScoreDoc(queueDocs[1+%s], queueScores[1+%s]);' % (dest, src, src)) + + def topInit(self): + PQCollector.topInit(self) + self.w('final int[] queueDocs = new int[%d];' % (1+self.topN)); + self.w('final float[] queueScores = new float[%d];' % (1+self.topN)); + self.w('// prefill w/ sentinel') + self.w('Arrays.fill(queueScores, Float.NEGATIVE_INFINITY);') + self.w('Arrays.fill(queueDocs, Integer.MAX_VALUE);') + self.w('float bottomScore = Float.NEGATIVE_INFINITY;') + + def readerInit(self): + pass + + def lessThanCurrentBreak(self, v): + self.w('if (queueScores[%s] < score) {' % v) + self.w(' break;') + self.w('}') + + def currentGreaterThanBreak(self, v): + self.w('if (queueScores[%s] > score || (queueScores[%s] == score && queueDocs[%s] < fullDoc)) {' % (v, v, v)) + self.w(' break;') + self.w('}') + + def lessThan(self, a, b, v): + self.w('final boolean %s = queueScores[%s] < queueScores[%s] || (queueScores[%s] == queueScores[%s] && queueDocs[%s] > queueDocs[%s]);' % (v, a, b, a, b, a, b)) + + def copy(self, src, dest): + self.w('queueDocs[%s] = queueDocs[%s];' % (dest, src)) + self.w('queueScores[%s] = queueScores[%s];' % (dest, src)) + + if 0: + def copyCurrent(self, dest): + self.w('queueDocs[%s] = fullDoc;' % dest) + self.w('queueScores[%s] = score;' % dest) + + def installBottom(self, dest): + self.w('queueDocs[%s] = fullDoc;' % dest) + self.w('queueScores[%s] = score;' % dest) + + 
# install = installBottom + + def copyCurrentToBottom(self): + if DEBUG: + self.w('System.out.println(" boot doc=" + queueDocs[1] + " for new doc=" + fullDoc + " (eq?=" + (score == bottomScore) + ")" + " score=" + score + " bottomScore=" + bottomScore);') + self.w('queueDocs[1] = fullDoc;') + self.w('queueScores[1] = score;') + + def endInsert(self): + self.w('bottomScore = queueScores[1];') + if DEBUG: + self.w('System.out.println(" bottom=" + bottomScore + " doc=" + queueDocs[1]);') + + def writeCompetes(self): + self.w('//System.out.println(" competes bottom=" + bottomScore);') + self.w('if (score <= bottomScore) {') + self.w(' continue;') + self.w('}') + + def swap(self, i, j): + w = self.w + w('final int itmp = queueDocs[%s];' % i) + w('queueDocs[%s] = queueDocs[%s];' % (i, j)) + w('queueDocs[%s] = itmp;' % j) + w('final float ftmp = queueScores[%s];' % i) + w('queueScores[%s] = queueScores[%s];' % (i, j)) + w('queueScores[%s] = ftmp;' % j) + +class OneFieldScoringCollector(PQCollector): + + doScores = True + + def createScoreDoc(self, dest, src): + self.w('final Comparable[] fields = new Comparable[1];') + if self.valueType is not None: + self.w('fields[0] = new %s(queue[1+%s].value);' % (self.upperValueType(), src)) + else: + self.w('fields[0] = comp.value(queue[1+%s].slot);' % src) + self.w('%s = new FieldDoc(queue[1+%s].docID, queue[1+%s].score, fields);' % (dest, src, src)) + + def readerInit(self): + w = self.w + if self.valueType is not None: + w('// TODO: support custom parser') + w('final %s[] docValues = ExtendedFieldCache.EXT_DEFAULT.get%ss(r, sortFieldID);' % \ + (self.valueType, self.upperValueType())) + else: + w('comp.setNextReader(r, docBase, hitCount>%d?%d:hitCount);' % (self.topN, self.topN)) + + def writeTopClass(self): + w = self.w + w('final static class Entry {') + if self.valueType is not None: + w(' // local inlined value') + w(' %s value;' % self.valueType) + else: + w(' // value by reference') + w(' final int slot;') + w(' int 
docID;') + w(' float score;') + if self.valueType is not None: + w(' public Entry(%s value, int docID, float score) {' % self.valueType) + w(' this.value = value;') + else: + w(' public Entry(int slot, int docID, float score) {') + w(' this.slot = slot;') + w(' this.docID = docID;') + w(' this.score = score;') + w(' }') + w('}') + + def topInit(self): + PQCollector.topInit(self) + w = self.w + w('final Entry[] queue = new Entry[%d];' % (1+self.topN)); + if self.valueType is not None: + w('// pre-fill queue with "bottom"') + w('final %s sentinel;' % self.valueType) + w('if (sortField.getReverse()) {') + w(' sentinel = %s.MIN_VALUE;' % self.upperValueType()) + w('} else {') + w(' sentinel = %s.MAX_VALUE;' % self.upperValueType()) + w('}') + w('// queue[0] is never used') + w('for (int x=1;x<%d;x++) {' % (1+self.topN)) + if self.valueType is not None: + w(' queue[x] = new Entry(sentinel, Integer.MAX_VALUE, Float.NEGATIVE_INFINITY);') + else: + w(' queue[x] = new Entry(x-1, Integer.MAX_VALUE, Float.NEGATIVE_INFINITY);') + w('}') + + w('Entry bottom = queue[1];') + if self.valueType is None: + w('final FieldComparator comp = sort.getSort()[0].getComparator(null, %d, 0, false);' % self.topN) + else: + w('%s bottomValue = sentinel;' % self.valueType) + + def lessThanCurrentBreak(self, v): + w = self.w + if self.valueType is not None: + w('if (queue[slot].value < queue[%s].value) {' % v) + else: + w('if (comp.compare(slot, queue[%s].slot) < 0) {' % v) + w(' break;') + w('}') + + if 0: + w('final int c = comp.compare(slot, queue[%s].slot);' % v) + w('if (c < 0) {') + w(' break;') + w('}') + if 0: + w('if (c == 0) {') + w(' // Must break tie by docID') + w(' if (node.docID < queue[%s].docID) {' % v) + w(' break;') + w(' }') + w('} else if (c < 0) {') + w(' break;') + w('}') + + def currentGreaterThanBreak(self, v): + w = self.w + if self.valueType is None: + w('final int c = comp.compare(queue[%s].slot, bottom.slot);' % v) + w('if (c == 0) {') + w(' // Must break tie by 
docID') + w(' if (queue[%s].docID < bottom.docID) {' % v) + w(' break;') + w(' }') + w('} else if (c < 0) {') + w(' break;') + w('}') + else: + self.w('if (queue[%s].value == bottomValue) {' % v) + w(' // Must break tie by docID') + w(' if (queue[%s].docID < bottom.docID) {' % v) + w(' break;') + w(' }') + w('} else if (queue[%s].value < bottomValue) {' % v) + w(' break;') + w('}') + + + def lessThan(self, a, b, v): + w = self.w + c = w.getVar('c') + w('final boolean %s;'% v) + if self.valueType is None: + w('final int %s = comp.compare(queue[%s].slot, queue[%s].slot);' % (c, a, b)) + w('if (%s == 0) {' % c) + else: + w('if (queue[%s].value == queue[%s].value) {' % (a, b)) + w(' // Must break tie by docID') + w(' %s = queue[%s].docID > queue[%s].docID;' % (v, a, b)) + w('} else {') + if self.valueType is None: + w(' %s = %s > 0;' % (v, c)) + else: + w(' %s = queue[%s].value > queue[%s].value;' % (v, a, b)) + w('}') + w.releaseVar(c) + + def copy(self, src, dest): + self.w('queue[%s] = queue[%s];' % (dest, src)) + + if 0: + # Copy new hit to queue + def copyCurrent(self, dest): + self.w('final int slot = %s-1;' % dest) + self.w('final Entry node = queue[%s] = new Entry(slot, docBase+doc, score);' % dest) + self.w('comp.copy(slot, doc, score);') + + # Copy previous node from copyCurrent + def install(self, dest): + self.w('queue[%s] = node;' % dest) + + def installBottom(self, dest): + self.w('queue[%s] = bottom;' % dest) + + def copyCurrentToBottom(self): + self.w('bottom.docID = fullDoc;') + self.w('bottom.score = score;') + if self.valueType is None: + self.w('comp.copy(bottom.slot, doc, score);') + else: + self.w('bottomValue = bottom.value = docValues[doc];') + + def endInsert(self): + self.w('bottom = queue[1];') + if self.valueType is None: + self.w('comp.setBottom(bottom.slot);') + else: + self.w('bottomValue = bottom.value;') + + def writeCompetes(self): + w = self.w + if self.valueType is None: + w(' final int cmp = comp.compareBottom(doc, score);') + w(' 
if (cmp <= 0) {') + else: + w(' if (docValues[doc] >= bottomValue) {') + if DEBUG: + w(' System.out.println(" no compete bottom");') + w(' continue;') + #w(' } else if (cmp == 0 && doc + docBase > bottom.docID) {') + #w(' continue;') + w(' }') + if DEBUG: + w(' System.out.println(" does compete bottom");') + + def swap(self, i, j): + w = self.w + w('final Entry tmp = queue[%s];' % i) + w('queue[%s] = queue[%s];' % (i, j)) + w('queue[%s] = tmp;' % j) + +class OneFieldNonScoringCollector(PQCollector): + + doScores = False + + def createScoreDoc(self, dest, src): + self.w('final Comparable[] fields = new Comparable[1];') + if self.valueType is not None: + self.w('fields[0] = new %s(queue[1+%s].value);' % (self.upperValueType(), src)) + else: + self.w('fields[0] = comp.value(queue[1+%s].slot);' % src) + self.w('%s = new FieldDoc(queue[1+%s].docID, Float.NaN, fields);' % (dest, src)) + + def readerInit(self): + w = self.w + if self.valueType is not None: + w('// TODO: support custom parser') + w('final %s[] docValues = ExtendedFieldCache.EXT_DEFAULT.get%ss(r, sortFieldID);' % \ + (self.valueType, self.valueType[0].upper() + self.valueType[1:])) + else: + w('comp.setNextReader(r, docBase, hitCount>%d?%d:hitCount);' % (self.topN, self.topN)) + + def writeTopClass(self): + w = self.w + w('final static class Entry {') + if self.valueType is not None: + w(' // local inlined value') + w(' %s value;' % self.valueType) + else: + w(' // value by reference') + w(' final int slot;') + w(' int docID;') + if self.valueType is not None: + w(' public Entry(%s value, int docID) {' % self.valueType) + w(' this.value = value;') + else: + w(' public Entry(int slot, int docID) {') + w(' this.slot = slot;') + w(' this.docID = docID;') + w(' }') + w('}') + + def topInit(self): + PQCollector.topInit(self) + w = self.w + w('final Entry[] queue = new Entry[%d];' % (1+self.topN)); + if self.valueType is not None: + w('// pre-fill queue with "bottom"') + w('final %s sentinel;' % self.valueType) + 
w('if (sortField.getReverse()) {') + w(' sentinel = %s.MIN_VALUE;' % self.upperValueType()) + w('} else {') + w(' sentinel = %s.MAX_VALUE;' % self.upperValueType()) + w('}') + w('// queue[0] is never used') + w('for (int x=1;x<%d;x++) {' % (1+self.topN)) + if self.valueType is not None: + w(' queue[x] = new Entry(sentinel, Integer.MAX_VALUE);') + else: + w(' queue[x] = new Entry(x-1, Integer.MAX_VALUE);') + w('}') + + w('Entry bottom = queue[1];') + if self.valueType is None: + w('final FieldComparator comp = sort.getSort()[0].getComparator(null, %d, 0, false);' % self.topN) + else: + w('%s bottomValue = sentinel;' % self.valueType) + + def lessThanCurrentBreak(self, v): + w = self.w + if self.valueType is not None: + w('if (queue[slot].value < queue[%s].value) {' % v) + else: + w('if (comp.compare(slot, queue[%s].slot) < 0) {' % v) + w(' break;') + w('}') + if 0: + w('final int c = comp.compare(slot, queue[%s].slot);' % v) + w('if (c < 0) {') + w(' break;') + w('}') + if 0: + w('final int c = comp.compare(slot, queue[%s].slot);' % v) + w('if (c == 0) {') + w(' // Must break tie by docID') + w(' if (node.docID < queue[%s].docID) {' % v) + w(' break;') + w(' }') + w('} else if (c < 0) {') + w(' break;') + w('}') + + def currentGreaterThanBreak(self, v): + w = self.w + if self.valueType is None: + w('final int c = comp.compare(queue[%s].slot, bottom.slot);' % v) + w('if (c == 0) {') + w(' // Must break tie by docID') + w(' if (queue[%s].docID < bottom.docID) {' % v) + w(' break;') + w(' }') + w('} else if (c < 0) {') + w(' break;') + w('}') + else: + w('if (queue[%s].value == bottomValue) {' % v) + w(' // Must break tie by docID') + w(' if (queue[%s].docID < bottom.docID) {' % v) + w(' break;') + w(' }') + w('} else if (queue[%s].value < bottomValue) {' % v) + w(' break;') + w('}') + + def lessThan(self, a, b, v): + w = self.w + c = w.getVar('c') + w('final boolean %s;'% v) + if self.valueType is None: + w('final int %s = comp.compare(queue[%s].slot, 
queue[%s].slot);' % (c, a, b)) + w('if (%s == 0) {' % c) + else: + w('if (queue[%s].value == queue[%s].value) {' % (a, b)) + w(' // Must break tie by docID') + w(' %s = queue[%s].docID > queue[%s].docID;' % (v, a, b)) + w('} else {') + if self.valueType is None: + w(' %s = %s > 0;' % (v, c)) + else: + w(' %s = queue[%s].value > queue[%s].value;' % (v, a, b)) + w('}') + w.releaseVar(c) + + def copy(self, src, dest): + self.w('queue[%s] = queue[%s];' % (dest, src)) + + if 0: + # Copy new hit to queue + def copyCurrent(self, dest): + self.w('final int slot = %s-1;' % dest) + self.w('final Entry node = queue[%s] = new Entry(slot, docBase+doc);' % dest) + self.w('comp.copy(slot, doc, Float.NaN);') + + # Copy previous node from copyCurrent + def install(self, dest): + self.w('queue[%s] = node;' % dest) + + def installBottom(self, dest): + self.w('queue[%s] = bottom;' % dest) + + def copyCurrentToBottom(self): + self.w('bottom.docID = fullDoc;') + if self.valueType is None: + self.w('comp.copy(bottom.slot, doc, score);') + else: + self.w('bottomValue = bottom.value = docValues[doc];') + + def endInsert(self): + self.w('bottom = queue[1];') + if self.valueType is None: + self.w('comp.setBottom(bottom.slot);') + else: + self.w('bottomValue = bottom.value;') + + def writeCompetes(self): + w = self.w + if self.valueType is None: + w(' final int cmp = comp.compareBottom(doc, score);') + w(' if (cmp <= 0) {') + else: + w(' if (docValues[doc] >= bottomValue) {') + if DEBUG: + w(' System.out.println(" no compete bottom");') + w(' continue;') + #w(' } else if (cmp == 0 && doc + docBase > bottom.docID) {') + #w(' continue;') + w(' }') + if DEBUG: + w(' System.out.println(" does compete bottom");') + + def swap(self, i, j): + w = self.w + w('final Entry tmp = queue[%s];' % i) + w('queue[%s] = queue[%s];' % (i, j)) + w('queue[%s] = tmp;' % j) + +class TermQuery: + + SCORE_CACHE_SIZE = 32 + BLOCK_SIZE = 32 + + # If False, we grab IndexInput to freqStream and process doc/freq ourselves 
+ USE_TERMDOCS = False + + def setWriter(self, w): + self.w = w + self.id = w.getID() + + def var(self, name): + return '%s%s%s' % (self.id, name[0].upper(), name[1:]) + + def releaseVar(self, v): + self.w.releaseVar(v) + + def writeTop(self, c): + w = self.w + w('final Term %s = t;' % self.var('t')) + if self.USE_TERMDOCS: + w('final int[] %s = new int[%d];' % (self.var('docs'), self.BLOCK_SIZE)) + w('final int[] %s = new int[%d];' % (self.var('freqs'), self.BLOCK_SIZE)) + + if c.doScores: + w('final float[] %s = new float[%s];' % (self.var('scoreCache'), self.SCORE_CACHE_SIZE)) + + if c.doScores: + w('final float %s = new TermQuery(%s).weight(searcher).getValue();' % \ + (self.var('weightValue'), self.var('t'))) + w('//System.out.println("weight=" + %s);' % self.var('weightValue')) + w('for(int i=0;i<%s;i++) {' % self.SCORE_CACHE_SIZE) + w(' %s[i] = sim.tf(i) * %s;' % (self.var('scoreCache'), self.var('weightValue'))) + w('}') + + def writePerReader(self, c, doFilter, doDeletes): + w = self.w + # TODO: how to "share" this when more than one term is against this same field + if c.doScores: + w('final byte[] %s = r.norms(%s.field());' % (self.var('norms'), self.var('t'))) + w('final TermDocs %s = r.termDocs(%s);' % (self.var('TD'), self.var('t'))) + if not self.USE_TERMDOCS: + w('final IndexInput %s = ((SegmentTermDocs) %s).getFreqStream();' % (self.var('freqStream'), + self.var('TD'))) + w('final int %s = ((SegmentTermDocs ) %s).getTermFreq();' % (self.var('limit'), + self.var('TD'))) + w('int count = 0;') + if not doFilter and doDeletes: + w('final BitVector deletedDocs = ((SegmentReader) r).getDeletedDocs();') + + def writeTopIter(self, c, doFilter, doDeletes): + + w = self.w + + if not self.USE_TERMDOCS: + w('int doc = 0;') + + w('while (true) {') + + if self.USE_TERMDOCS: + count = w.getVar('count') + w('final int %s = %s.read(%s, %s);' % (count, + self.var('TD'), + self.var('docs'), + self.var('freqs'))) + w('if (%s == 0) {' % count) + w(' break;') + w('}') + 
+ + if self.USE_TERMDOCS: + v = w.getVar('i') + # TODO: inline "32" for common case? + w('for(int %s=0;%s<%s;%s++) {' % (v, + v, + count, + v)) + w(' final int doc = %s[%s];' % (self.var('docs'), v)) + if c.doScores: + w(' final int %s = %s[%s];' % (self.var('freq'), self.var('freqs'), v)) + else: + w(' if (++count > %s) {' % self.var('limit')) + w(' break;') + w(' }') + w(' final int x = %s.readVInt();' % self.var('freqStream')) + w(' doc += x>>>1;') + + if doFilter: + w(' // Filter has pre-folded deletes in') + w(' if (!filterBits.fastGet(doc)) {') + w(' if ((x & 1) == 0) {') + w(' // Skip freq') + w(' %s.readVInt();' % self.var('freqStream')) + w(' }') + w(' continue;') + w(' }') + elif doDeletes: + w(' if (deletedDocs != null && deletedDocs.get(doc)) {') + w(' if ((x & 1) == 0) {') + w(' // Skip freq') + w(' %s.readVInt();' % self.var('freqStream')) + w(' }') + w(' continue;') + w(' }') + + if c.doScores: + w(' final int %s;' % self.var('freq')) + w(' if ((x & 1) != 0) {') + w(' %s = 1;' % self.var('freq')) + w(' } else {') + w(' %s = %s.readVInt();' % (self.var('freq'), self.var('freqStream'))) + w(' }') + else: + w(' if ((x & 1) == 0) {') + w(' // Skip freq') + w(' %s.readVInt();' % self.var('freqStream')) + w(' }') + + # TODO: would be nice to share a single norm lookup of this doc against this field, somehow + + if c.doScores: + scoreVar = 'score' + w(' final float %s = (%s < %d ? 
%s[%s] : sim.tf(%s)*%s) * normDecoder[%s[doc] & 0xFF];' % \ + (scoreVar, + self.var('freq'), + self.SCORE_CACHE_SIZE, + self.var('scoreCache'), + self.var('freq'), + self.var('freq'), + self.var('weightValue'), + self.var('norms'))) + w(' //System.out.println("doc=" + doc + " score=" + score);') + if c.__class__ != ScoreDocCollector and c.doMaxScore: + w(' if (score > maxScore) {') + w(' maxScore = score;') + w(' }') + else: + scoreVar = 'Float.NaN' + + c.collectOne(scoreVar) + + if self.USE_TERMDOCS: + w('}') + w.releaseVar(v) + w.releaseVar(count) + + w('}') + +DEBUG = '-verbose' in sys.argv + +def gen(w, query, c, doFilter, doDeletes, fileName, sortString): + + query.setWriter(w) + + w('package org.apache.lucene.benchmark.byTask.tasks;') + + w('import org.apache.lucene.util.*;') + w('import org.apache.lucene.store.*;') + w('import org.apache.lucene.search.*;') + w('import org.apache.lucene.index.*;') + w('import org.apache.lucene.benchmark.byTask.*;') + w('import org.apache.lucene.benchmark.byTask.feeds.*;') + w('import java.io.IOException;') + w('import java.util.Arrays;') + + className = os.path.splitext(os.path.split(fileName)[1])[0] + + w('public class %s extends ReadTask {' % className) + + c.writeTopClass() + + w(' public %s(PerfRunData runData) {' % className) + w(' super(runData);') + w(' }') + w(' public boolean withRetrieve() {') + w(' return false;') + w(' }') + w(' public boolean withSearch() {') + w(' return true;') + w(' }') + w(' public boolean withTraverse() {') + w(' return false;') + w(' }') + w(' public boolean withWarm() {') + w(' return false;') + w(' }') + w(' public QueryMaker getQueryMaker() {') + w(' return getRunData().getQueryMaker(this);') + w(' }') + + if str(c).find('OneField') != -1: + w(' public Sort getSort() {') + w(' return new Sort(%s);' % sortString) + w(' }') + + w(' public TopDocs search(final IndexSearcher searcher, final Term t, final Sort sort, final Filter filter) throws IOException {') + + c.topInit() + + w(' final 
float[] normDecoder = Similarity.getNormDecoder();') + w(' final Similarity sim = searcher.getSimilarity();') + w(' final IndexReader[] subReaders = searcher.getSortedSubReaders();') + w(' final int[] docBases = searcher.getDocBases();') + + query.writeTop(c) + + w('') + w(' for(int i=0;i compile.log 2>&1') != 0: + raise RuntimeError('compile failed (see compile.log)') + +if windows: + RESULTS = 'results.win64' +else: + RESULTS = 'results' + +VERIFY = '-verify' in sys.argv + +ALG = ''' +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer +directory=FSDirectory +work.dir = $INDEX$ +search.num.hits = 10 +query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker +file.query.maker.file = queries.txt +log.queries=true +filter.pct = $FILT$ + +OpenReader +{"XSearchWarm" $SEARCH$} +$ROUNDS$ +CloseReader +RepSumByPrefRound XSearch +''' + +if os.path.exists('searchlogs'): + shutil.rmtree('searchlogs') + +os.makedirs('searchlogs') + +open('%s.txt' % RESULTS, 'wb').write('||Query||Sort||Filt|Deletes||Scoring||Hits||QPS (base)||QPS (new)||%||\n') + +numHit = 10 +counter = 0 + +if '-delindex' in sys.argv: + i = sys.argv.index('-delindex') + DEL_INDEX = sys.argv[1+i] + del sys.argv[i:i+2] + if not os.path.exists(DEL_INDEX): + raise RuntimeError('index "%s" does not exist' % DEL_INDEX) +else: + DEL_INDEX = None + +if '-nodelindex' in sys.argv: + i = sys.argv.index('-nodelindex') + NO_DEL_INDEX = sys.argv[1+i] + del sys.argv[i:i+2] + if not os.path.exists(NO_DEL_INDEX): + raise RuntimeError('index "%s" does not exist' % NO_DEL_INDEX) +else: + NO_DEL_INDEX = None + +if DEL_INDEX is None and NO_DEL_INDEX is None: + raise RuntimeError('you must specify at least one of -delindex or -nodelindex') + +def run(new, query, sortField, doScore, filt, delP): + global counter + + t0 = time.time() + + s = ALG + + if not VERIFY: + s = s.replace('$ROUNDS$', +''' +{ "Rounds" + { "Run" + { "TestSearchSpeed" + { "XSearchReal" $SEARCH$ > : 3.0s + } + NewRound + } : $NROUND$ +} 
+''') + nround = 5 + s = s.replace('$NROUND$', str(nround)) + else: + s = s.replace('$ROUNDS$', '') + + if linux: + prefix = '/big/scratch/lucene' + else: + prefix = '/lucene' + + if delP is None: + index = NO_DEL_INDEX + else: + index = DEL_INDEX + + s = s.replace('$INDEX$', index) + + open('queries.txt', 'wb').write(query + '\n') + + if filt == None: + f = '0.0' + else: + f = str(filt) + s = s.replace('$FILT$', f) + + if new: + search = 'FastSearch' + w = gen.Writer() + if sortField == 'score': + c = gen.ScoreDocCollector(w, numHit) + elif doScore == 'no': + c = gen.OneFieldNonScoringCollector(w, numHit) + else: + c = gen.OneFieldScoringCollector(w, numHit, doScore) + if sortField == 'doctitle': + sortString = 'new SortField("doctitle", SortField.STRING)' + elif sortField == 'docdate': + sortString = 'new SortField("docdate", SortField.LONG)' + c.valueType = 'long' + else: + sortString = None + open('src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java', 'a').close() + gen.gen(w, gen.TermQuery(), c, filt != None, + delP != None, + 'src/java/org/apache/lucene/benchmark/byTask/tasks/FastSearchTask.java', + sortString) + print ' compile...' 
+ if os.system('ant compile > compile.log 2>&1') != 0: + raise RuntimeError('compile failed (see compile.log)') + else: + if sortField == 'score': + search = 'Search' + elif sortField == 'doctitle': + search = 'SearchWithSort(doctitle:string)' + elif sortField == 'docdate': + search = 'SearchWithSort(docdate:long)' + else: + raise RuntimeError("no") + + s = s.replace('$SEARCH$', search) + fileOut = 'searchlogs/%d' % counter + counter += 1 + + if 0: + fileOut = 'searchlogs/query_%s_%s_%s_%s_%s_%s' % \ + (query.replace(' ', '_').replace('"', ''), + sortField, filt, doScore, new, delP) + + open('tmp.alg', 'wb').write(s) + + if windows: + command = 'java -Xms1024M -Xmx1024M -Xbatch -server -classpath "../../build/classes/java;../../build/classes/demo;../../build/contrib/highlighter/classes/java;../../contrib/benchmark/lib/commons-digester-1.7.jar;../../contrib/benchmark/lib/commons-collections-3.1.jar;../../contrib/benchmark/lib/commons-logging-1.0.4.jar;../../contrib/benchmark/lib/commons-beanutils-1.7.0.jar;../../contrib/benchmark/lib/xerces-2.9.0.jar;../../contrib/benchmark/lib/xml-apis-2.9.0.jar;../../build/contrib/benchmark/classes/java" org.apache.lucene.benchmark.byTask.Benchmark tmp.alg > %s' % fileOut + else: + command = 'java -Xms1024M -Xmx1024M -Xbatch -server -classpath ../../build/classes/java:../../build/classes/demo:../../build/contrib/highlighter/classes/java:../../contrib/benchmark/lib/commons-digester-1.7.jar:../../contrib/benchmark/lib/commons-collections-3.1.jar:../../contrib/benchmark/lib/commons-logging-1.0.4.jar:../../contrib/benchmark/lib/commons-beanutils-1.7.0.jar:../../contrib/benchmark/lib/xerces-2.9.0.jar:../../contrib/benchmark/lib/xml-apis-2.9.0.jar:../../build/contrib/benchmark/classes/java org.apache.lucene.benchmark.byTask.Benchmark tmp.alg > %s' % fileOut + + print ' %s' % fileOut + + res = os.system(command) + + if res != 0: + raise RuntimeError('FAILED') + + best = None + count = 0 + nhits = None + warmTime = None + meths = [] + r = 
re.compile('^ ([0-9]+): (.*)$') + topN = [] + + for line in open(fileOut, 'rb').readlines(): + m = r.match(line.rstrip()) + if m is not None: + topN.append(m.group(2)) + if line.startswith('NUMHITS='): + nhits = int(line[8:].strip()) + if line.startswith('XSearchWarm'): + v = line.strip().split() + warmTime = float(v[5]) + if line.startswith('XSearchReal'): + v = line.strip().split() + # print len(v), v + upto = 0 + i = 0 + qps = None + while i < len(v): + if v[i] == '-': + i += 1 + continue + else: + upto += 1 + i += 1 + if upto == 5: + #print 'GOT: %s' % v[i-1] + qps = float(v[i-1].replace(',', '')) + break + + if qps is None: + raise RuntimeError('did not find qps') + + count += 1 + if best is None or qps > best: + best = qps + + if not VERIFY: + if count != nround: + raise RuntimeError('did not find %s rounds (got %s)' % (nround, count)) + + if warmTime is None: + raise RuntimeError('did not find warm time') + + if nhits is None: + raise RuntimeError('did not see NUMHITS=line') + + # print ' NHIT: %s' % nhits + + # print ' %.1f qps; %.1f sec' % (best, time.time()-t0) + all.append((new, query, sortBy, filt, nhits, warmTime, best)) + else: + best = 1.0 + + return nhits, best, topN + +def cleanScores(l): + for i in range(len(l)): + pos = l[i].find(' score=') + l[i] = l[i][:pos].strip() + +all = [] + +filts = (None, 25.0, 10.0) + +queries = ('1',) + +if VERIFY: + #queries = ('1147',) + queries = ('1',) + +sort = ('score', 'docdate', 'doctitle') + +deletes = (None, 5) + +#doScores = ('no', 'track', 'both') +doScores = ('both',) + +for query in queries: + for sortBy in sort: + + if sortBy == 'score': + doScores0 = ('both',) + else: + doScores0 = doScores + + for doScore in doScores0: + for filt in filts: + for delP in deletes: + + if delP is None and NO_DEL_INDEX is None: + continue + if delP is not None and DEL_INDEX is None: + continue + + print + print 'RUN: query=%s sort=%s scores=%s filt=%s deletes=%s [%s]' % (query, sortBy, doScore, filt, delP, 
datetime.datetime.now()) + print ' new...' + nhits1, qps1, topN1 = run(True, query, sortBy, doScore, filt, delP) + print ' qps %.2f' % qps1 + print ' old...' + nhits2, qps2, topN2 = run(False, query, sortBy, doScore, filt, delP) + print ' qps %.2f' % qps2 + print ' %d hits' % nhits1 + print ' %.1f%%' % (100.*(qps1-qps2)/qps2) + + f = open('%s.pk' % RESULTS, 'wb') + cPickle.dump(all, f) + f.close() + + if nhits1 != nhits2: + raise RuntimeError('hits differ: %s vs %s' % (nhits1, nhits2)) + + if len(topN1) != numHit: + raise RuntimeError('not enough hits: %s vs %s' % (len(topN1), nhits1)) + + if topN1 != topN2: + if doScore == 'no': + cleanScores(topN1) + cleanScores(topN2) + if topN1 != topN2: + raise RuntimeError('results differ') + else: + raise RuntimeError('results differ') + + if sortBy == 'score': + s0 = 'Relevance' + elif sortBy == 'doctitle': + s0 = 'Title (string)' + elif sortBy == 'docdate': + s0 = 'Date (long)' + + if filt == None: + f = 'no' + else: + f = '%d%%' % filt + + if delP == None: + d = 'no' + else: + d = '%d%%' % delP + + if doScore == 'both': + s = 'Track,Max' + elif doScore == 'track': + s = 'Track' + else: + s = 'no' + + pct = (qps1-qps2)/qps2 + if pct <= 0.0: + color = 'red' + else: + color = 'green' + p = '{color:%s}%.1f%%{color}' % (color, 100.*pct) + + open('%s.txt' % RESULTS, 'ab').write('|%s|%s|%s|%s|%s|%d|%.1f|%.1f|%s|\n' % \ + (query, s0, f, d, s, nhits1, qps2, qps1, p)) Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (revision 763737) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (working copy) @@ -23,6 +23,8 @@ import java.util.Iterator; import java.util.List; import java.util.Set; +import java.util.HashMap; +import java.util.Random; import org.apache.lucene.analysis.Analyzer; import 
org.apache.lucene.analysis.TokenStream; @@ -35,7 +37,14 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.Sort; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.index.SegmentReader; +import org.apache.lucene.util.BitVector; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.SortedVIntList; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; @@ -59,10 +68,16 @@ */ public abstract class ReadTask extends PerfTask { + private final double fpct; + public ReadTask(PerfRunData runData) { super(runData); + fpct = getRunData().getConfig().get("filter.pct", 0.0); } + // nocommit + static boolean first = true; + public int doLogic() throws Exception { int res = 0; boolean closeReader = false; @@ -97,17 +112,30 @@ } QueryMaker queryMaker = getQueryMaker(); Query q = queryMaker.makeQuery(); + Filter filter = getFilter(); Sort sort = getSort(); TopDocs hits; final int numHits = numHits(); if (numHits > 0) { - if (sort != null) { - hits = searcher.search(q, null, numHits, sort); + if (this instanceof FastSearchTask) { + hits = ((FastSearchTask) this).search(searcher, ((TermQuery) q).getTerm(), sort, filter); + } else if (sort != null) { + hits = searcher.search(q, filter, numHits, sort); } else { - hits = searcher.search(q, numHits); + hits = searcher.search(q, filter, numHits); } - //System.out.println("q=" + q + ":" + hits.totalHits + " total hits"); + // nocommit + if (first && hits != null) { + System.out.println("NUMHITS=" + hits.totalHits); + for(int i=0;i 0.0) { + System.out.println("FILT=" + fpct); + filter = new MyFilter(fpct); + } else if (first2) { + System.out.println("FILT=none"); + first2 = false; + } + } + 
return filter; + } + + static final HashMap filters = new HashMap(); + static private final boolean filtSparse = false; + + private class MyFilter extends Filter { + private final double fpct; + + MyFilter(double fpct) { + this.fpct = fpct; + } + + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { + DocIdSet bits = (DocIdSet) filters.get(reader); + if (bits == null) { + final int numDocs = reader.maxDoc(); + OpenBitSet openBits = new OpenBitSet(numDocs); + final Random r = new java.util.Random(42244224); + int setCount = 0; + final boolean inverted; + final double fpct2; + if (fpct > 50.0) { + fpct2 = 100.0 - fpct; + inverted = true; + } else { + fpct2 = fpct; + inverted = false; + } + + final int target = (int) ((fpct2/100.0)*numDocs); + while(setCount < target) { + setCount += openBits.getAndSet(r.nextInt(numDocs)) ? 0:1; + } + if (setCount != openBits.cardinality()) { + throw new RuntimeException("count mismatch"); + } + if (inverted) { + OpenBitSet bits2 = new OpenBitSet(numDocs); + for(int i=0;i