Index: common-build.xml =================================================================== --- common-build.xml (revision 826708) +++ common-build.xml (working copy) @@ -58,8 +58,8 @@ - - + + Index: src/java/org/apache/lucene/search/FieldSortedHitQueue.java =================================================================== --- src/java/org/apache/lucene/search/FieldSortedHitQueue.java (revision 826708) +++ src/java/org/apache/lucene/search/FieldSortedHitQueue.java (working copy) @@ -299,7 +299,8 @@ * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. */ - static ScoreDocComparator comparatorInt (final IndexReader reader, final String fieldname, final FieldCache.IntParser parser) + // nocommit -- made public + public static ScoreDocComparator comparatorInt (final IndexReader reader, final String fieldname, final FieldCache.IntParser parser) throws IOException { final String field = fieldname.intern(); final int[] fieldOrder = FieldCache.DEFAULT.getInts(reader, field, parser); @@ -424,7 +425,8 @@ * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. 
*/ - static ScoreDocComparator comparatorString (final IndexReader reader, final String fieldname) + // nocommit -- made public + public static ScoreDocComparator comparatorString (final IndexReader reader, final String fieldname) throws IOException { final String field = fieldname.intern(); final FieldCache.StringIndex index = FieldCache.DEFAULT.getStringIndex (reader, field); Index: src/java/org/apache/lucene/search/FieldCacheImpl.java =================================================================== --- src/java/org/apache/lucene/search/FieldCacheImpl.java (revision 826708) +++ src/java/org/apache/lucene/search/FieldCacheImpl.java (working copy) @@ -705,7 +705,7 @@ // we expect that there is at most one term per document if (t >= mterms.length) throw new RuntimeException ("there are more terms than " + "documents in field \"" + field + "\", but it's impossible to sort on " + - "tokenized fields"); + "tokenized fields t=" + t + " mterms.length=" + mterms.length); mterms[t] = term.text(); termDocs.seek (termEnum); Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OneSortNoScoreCollector.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OneSortNoScoreCollector.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OneSortNoScoreCollector.java (revision 0) @@ -0,0 +1,130 @@ +package org.apache.lucene.benchmark.byTask.tasks; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Iterator; +import java.util.LinkedList; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.ScoreDocComparator; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.SortComparatorSource; + +public class OneSortNoScoreCollector 
extends Collector { // benchmark-only "multi-PQ" collector: one queue per reader, merged in getTop(); scores are never computed
+  private final LinkedList<ScoreDocComparatorQueue> _pqList; // one queue per reader, in visit order
+  private final int _numHits;
+  private ScoreDocComparatorQueue _currentPQ;
+  private int _totalHits;
+  private int _docBase;
+  private final String _field;
+  private NonScoreDoc _bottom; // value returned by the last add()/updateTop() on the current queue
+  private boolean _queueFull;
+  private final SortComparatorSource _sortComparatorSource;
+  private final NonScoreDoc _tmp; // scratch hit reused by collect() for comparisons
+  private ScoreDocComparator _currentComparator;
+  /** A hit with a fixed placeholder score that remembers which queue produced it. */
+  public static class NonScoreDoc extends ScoreDoc{
+    NonScoreDoc(int docid){
+      super(docid,1.0f);
+    }
+
+    ScoreDocComparatorQueue comparatorQueue; // assigned in getTop()
+  }
+  /** Sorts on field, keeping at most numHits hits per reader; comparators come from sortComparatorSource. */
+  public OneSortNoScoreCollector(String field,int numHits,SortComparatorSource sortComparatorSource) {
+    _pqList = new LinkedList<ScoreDocComparatorQueue>();
+    _numHits = numHits;
+    _field = field;
+    _sortComparatorSource = sortComparatorSource;
+    _totalHits = 0;
+    _docBase = 0;
+    _bottom = null;
+    _queueFull = false;
+    _currentComparator = null;
+    _tmp = new NonScoreDoc(0);
+  }
+
+  @Override
+  public boolean acceptsDocsOutOfOrder() {
+    return true;
+  }
+  /** Counts the hit, then adds it to the current queue or, once that queue is full, overwrites the bottom entry when compare(bottom, doc) is positive. */
+  @Override
+  public void collect(int doc) throws IOException {
+    _totalHits++;
+    if (_queueFull){
+      _tmp.doc = doc;
+      if (_currentComparator.compare(_bottom,_tmp) <= 0) return; // not competitive with the current bottom
+      _bottom.doc=doc; // reuse the bottom entry instead of allocating a new hit
+      _bottom = (NonScoreDoc)_currentPQ.updateTop();
+    }
+    else{
+      ScoreDoc nsd = new NonScoreDoc(doc);
+      _bottom = (NonScoreDoc)_currentPQ.add(nsd);
+      _queueFull = (_currentPQ.size() >= _numHits);
+    }
+  }
+  /** Starts a fresh comparator and queue for the next reader; doc ids passed to collect() are relative to docBase. */
+  @Override
+  public void setNextReader(IndexReader reader, int docBase) throws IOException {
+    _docBase = docBase;
+    _currentComparator = _sortComparatorSource.newComparator(reader, _field);
+    _currentPQ = new ScoreDocComparatorQueue(_numHits,_docBase,_currentComparator);
+    _pqList.add(_currentPQ);
+    _queueFull = false;
+  }
+
+  @Override
+  public void setScorer(Scorer scorer) throws IOException { // scores are not used by this collector
+  }
+
+  public int getTotalHits(){
+    return _totalHits;
+  }
+  /** Drains every per-reader queue (so call it once per search) and merges them into at most numHits hits whose doc ids have the queue's _base added. */
+  public ArrayList<NonScoreDoc> getTop(){
+    ArrayList<Iterator<NonScoreDoc>> iterList = new ArrayList<Iterator<NonScoreDoc>>(_pqList.size());
+    for (ScoreDocComparatorQueue pq : _pqList){
+      int count = pq.size();
+      NonScoreDoc[] resList = new NonScoreDoc[count];
+      for (int i = count - 1; i >= 0; i--) { // fill back to front: reverses the queue's pop() order
+        NonScoreDoc doc = (NonScoreDoc)pq.pop();
+        doc.comparatorQueue=pq;
+        resList[i]=doc;
+      }
+      iterList.add(Arrays.asList(resList).iterator());
+    }
+    ArrayList<NonScoreDoc> resList = ListMerger.mergeLists(0, _numHits, iterList, new Comparator<NonScoreDoc>() {
+      // Orders by each hit's sort value (null sorts first); ties are broken by doc id plus the queue's _base.
+      public int compare(NonScoreDoc o1, NonScoreDoc o2) {
+        //numStringCompares++;
+        Comparable s1 = o1.comparatorQueue._comparator.sortValue(o1);
+        Comparable s2 = o2.comparatorQueue._comparator.sortValue(o2);
+        if (s1 == null) {
+          if (s2 == null) {
+            return 0;
+          } else {
+            return -1;
+          }
+        } else if (s2 == null) {
+          return 1;
+        }
+        int v = s1.compareTo(s2);
+        if (v==0){
+          return o1.doc + o1.comparatorQueue._base - o2.doc - o2.comparatorQueue._base;
+        } else {
+          return v;
+        }
+      }
+    });
+    // Rebase the merged hits only after merging; the comparator above adds _base itself.
+    for (NonScoreDoc doc : resList){
+      doc.doc += doc.comparatorQueue._base;
+    }
+    return resList;
+  }
+}

Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OneSortNoScoreCollector.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithOldSortTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithOldSortTask.java (revision 0)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithOldSortTask.java (revision 0)
@@ -0,0 +1,160 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.ScoreDocComparator;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.FieldSortedHitQueue;
+import org.apache.lucene.index.IndexReader;
+
+/**
+ * Does sort search on specified field; also supplies legacy
+ * ScoreDocComparators through getScoreDocComparator.
+ */
+public class SearchWithOldSortTask extends ReadTask {
+
+  private Sort sort;
+
+  public SearchWithOldSortTask(PerfRunData runData) {
+    super(runData);
+  }
+
+  /**
+   * Parses the sort spec: a comma-separated list of field:type entries,
+   * where type is one of float, double, byte, short, int, long, string or
+   * string_val. The bare entries "doc" and "score" select
+   * {@link SortField#FIELD_DOC} and {@link SortField#FIELD_SCORE}.
+   * Any other entry without a ":type" suffix (noscore and nomaxscore
+   * included) makes this method throw a RuntimeException.
+   *
+   * Example: name:string,page:int,subject:string
+   *
+   */
+  public void setParams(String sortField) {
+    super.setParams(sortField);
+    String[] fields = sortField.split(",");
+    SortField[] sortFields = new SortField[fields.length];
+    int upto = 0;
+    for (int i = 0; i < fields.length; i++) {
+      String field = fields[i];
+      SortField sortField0;
+      if (field.equals("doc")) {
+        sortField0 = SortField.FIELD_DOC;
+      } else if (field.equals("score")) { // was a bare "if": "doc" fell through to the else branch below and threw
+        sortField0 = SortField.FIELD_SCORE;
+      } else {
+        int index = field.lastIndexOf(":");
+        String fieldName;
+        String typeString;
+        if (index != -1) {
+          fieldName = field.substring(0, index);
+          typeString = field.substring(1+index, field.length());
+        } else {
+          throw new RuntimeException("You must specify the sort type ie page:int,subject:string");
+        }
+        int type = getType(typeString);
+        sortField0 = new SortField(fieldName, type);
+      }
+      sortFields[upto++] = sortField0;
+    }
+
+    if (upto < sortFields.length) { // defensive trim; every entry above either fills a slot or throws
+      SortField[] newSortFields = new SortField[upto];
+      System.arraycopy(sortFields, 0, newSortFields, 0, upto);
+      sortFields = newSortFields;
+    }
+    this.sort = new Sort(sortFields);
+  }
+  /** Maps a type name from the sort spec to the matching SortField type constant; throws RuntimeException for unknown names. */
+  private int getType(String typeString) {
+    int type;
+    if (typeString.equals("float")) {
+      type = SortField.FLOAT;
+    } else if (typeString.equals("double")) {
+      type = SortField.DOUBLE;
+    } else if (typeString.equals("byte")) {
+      type = SortField.BYTE;
+    } else if (typeString.equals("short")) {
+      type = SortField.SHORT;
+    } else if (typeString.equals("int")) {
+      type = SortField.INT;
+    } else if (typeString.equals("long")) {
+      type = SortField.LONG;
+    } else if (typeString.equals("string")) {
+      type = SortField.STRING;
+    } else if (typeString.equals("string_val")) {
+      type = SortField.STRING_VAL;
+    } else {
+      throw new RuntimeException("Unrecognized sort field type " + typeString);
+    }
+    return type;
+  }
+
+  public boolean supportsParams() {
+    return true;
+  }
+
+  public QueryMaker getQueryMaker() {
+    return getRunData().getQueryMaker(this);
+  }
+
+  
public boolean withRetrieve() {
+    return false;
+  }
+
+  public boolean withSearch() {
+    return true;
+  }
+
+  public boolean withTraverse() {
+    return false;
+  }
+
+  public boolean withWarm() {
+    return false;
+  }
+
+  public boolean withScore() {
+    return false;
+  }
+
+  public boolean withMaxScore() {
+    return false;
+  }
+
+  public Sort getSort() {
+    if (sort == null) {
+      throw new IllegalStateException("No sort field was set");
+    }
+    return sort;
+  }
+  /** Returns the legacy per-reader comparator for the first sort field, or null unless that field's type is STRING or INT. */
+  public ScoreDocComparator getScoreDocComparator(IndexReader reader, String fieldName) throws IOException {
+    final SortField sortField = getSort().getSort()[0]; // getSort() reports a missing sort as IllegalStateException
+    if (sortField.getType() == SortField.STRING) {
+      return FieldSortedHitQueue.comparatorString(reader, fieldName);
+    } else if (sortField.getType() == SortField.INT) {
+      return FieldSortedHitQueue.comparatorInt(reader, fieldName, null);
+    } else {
+      return null; // other sort types have no comparator here; callers receive null
+    }
+  }
+}

Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithOldSortTask.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ListMerger.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ListMerger.java (revision 0)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ListMerger.java (revision 0)
@@ -0,0 +1,169 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Merges several iterators, each already ordered by a common comparator,
+ * into a single ordered sequence.
+ *
+ * @author ymatsuda
+ */
+public class ListMerger
+{
+  /**
+   * Iterator that repeatedly hands out the smallest pending element of its
+   * inputs; a priority queue holds one node per input that still has elements.
+   */
+  public static class MergedIterator<T> implements Iterator<T>
+  {
+    // Pairs an input iterator with the element most recently read from it.
+    private class IteratorNode
+    {
+      public Iterator<T> _iterator;
+      public T _curVal;
+
+      public IteratorNode(Iterator<T> iterator)
+      {
+        _iterator = iterator;
+        _curVal = null;
+      }
+
+      // Advances to the next element; returns false once the input is exhausted.
+      public boolean fetch()
+      {
+        if(_iterator.hasNext())
+        {
+          _curVal = _iterator.next();
+          return true;
+        }
+        _curVal = null;
+        return false;
+      }
+    }
+
+    private final PriorityQueue _queue;
+
+    private MergedIterator(final int length, final Comparator<T> comparator)
+    {
+      _queue = new PriorityQueue()
+      {
+        {
+          this.initialize(length);
+        }
+
+        @SuppressWarnings("unchecked")
+        @Override
+        protected boolean lessThan(Object o1, Object o2)
+        {
+          T v1 = ((IteratorNode)o1)._curVal;
+          T v2 = ((IteratorNode)o2)._curVal;
+
+          return (comparator.compare(v1, v2) < 0);
+        }
+      };
+    }
+
+    public MergedIterator(final List<Iterator<T>> iterators, final Comparator<T> comparator)
+    {
+      this(iterators.size(), comparator);
+      for(Iterator<T> iterator : iterators)
+      {
+        IteratorNode ctx = new IteratorNode(iterator);
+        if(ctx.fetch()) _queue.insert(ctx);
+      }
+    }
+
+    public MergedIterator(final Iterator<T>[] iterators, final Comparator<T> comparator)
+    {
+      this(iterators.length, comparator);
+      for(Iterator<T> iterator : iterators)
+      {
+        IteratorNode ctx = new IteratorNode(iterator);
+        if(ctx.fetch()) _queue.insert(ctx);
+      }
+    }
+
+    public boolean hasNext()
+    {
+      return _queue.size() > 0;
+    }
+
+    @SuppressWarnings("unchecked")
+    public T next()
+    {
+      // Honor the Iterator contract instead of dereferencing a null top().
+      if (_queue.size() == 0)
+      {
+        throw new java.util.NoSuchElementException();
+      }
+      IteratorNode ctx = (IteratorNode)_queue.top();
+      T val = ctx._curVal;
+      if (ctx.fetch())
+      {
+        // The node has a new head element: restore the queue order.
+        _queue.adjustTop();
+      }
+      else
+      {
+        // This input is exhausted: drop its node.
+        _queue.pop();
+      }
+      return val;
+    }
+
+    public void remove()
+    {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+  private ListMerger() { }
+
+  /** Returns an iterator that merges the given ordered iterators on demand. */
+  public static <T> Iterator<T> mergeLists(final Iterator<T>[] iterators, final Comparator<T> comparator)
+  {
+    return new MergedIterator<T>(iterators, comparator);
+  }
+
+  /** Returns an iterator that merges the given ordered iterators on demand. */
+  public static <T> Iterator<T> mergeLists(final List<Iterator<T>> iterators, final Comparator<T> comparator)
+  {
+    return new MergedIterator<T>(iterators, comparator);
+  }
+
+  /** Merges the iterators, skips the first offset elements and returns up to count of the following ones. */
+  public static <T> ArrayList<T> mergeLists(int offset, int count, Iterator<T>[] iterators, Comparator<T> comparator)
+  {
+    return mergeLists(offset, count, new MergedIterator<T>(iterators, comparator));
+  }
+
+  /** Merges the iterators, skips the first offset elements and returns up to count of the following ones. */
+  public static <T> ArrayList<T> mergeLists(int offset, int count, List<Iterator<T>> iterators, Comparator<T> comparator)
+  {
+    return mergeLists(offset, count, new MergedIterator<T>(iterators, comparator));
+  }
+
+  // Skips offset merged elements, then copies at most count of the remaining ones.
+  private static <T> ArrayList<T> mergeLists(int offset, int count, Iterator<T> mergedIter)
+  {
+    for (int c = 0; c < offset && mergedIter.hasNext(); c++)
+    {
+      mergedIter.next();
+    }
+
+    ArrayList<T> mergedList = new ArrayList<T>();
+
+    for (int c = 0; c < count && mergedIter.hasNext(); c++)
+    {
+      mergedList.add(mergedIter.next());
+    }
+
+    return mergedList;
+  }
+
+}

Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ListMerger.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (revision 826708)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (working copy)
@@ -31,6 +31,8 @@
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.SortComparatorSource;
+import org.apache.lucene.search.ScoreDocComparator;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.TopFieldCollector;
 import org.apache.lucene.search.ScoreDoc;
@@ -59,6 +61,19 @@
   public ReadTask(PerfRunData runData) {
     super(runData);
   }
+
+  private final SortComparatorSource sortSource = new SortComparatorSource() {
+    public ScoreDocComparator newComparator(IndexReader reader, String fieldName) throws IOException {
+      return getScoreDocComparator(reader, fieldName);
+    }
+  };
+
+  public ScoreDocComparator getScoreDocComparator(IndexReader reader, String fieldName) throws IOException {
+    return null;
+  }
+
+  private static boolean first = true;
+
   public int 
doLogic() throws Exception { int res = 0; boolean closeReader = false; @@ -94,20 +109,55 @@ QueryMaker queryMaker = getQueryMaker(); Query q = queryMaker.makeQuery(); Sort sort = getSort(); - TopDocs hits; + TopDocs hits = null; + List oldHits = null; final int numHits = numHits(); + int totalHits = 0; if (numHits > 0) { if (sort != null) { - // TODO: change the following to create TFC with in/out-of order + if (sort.getSort().length != 1) { + throw new RuntimeException("sort length is " + sort.getSort().length); + } + // TODO: change the following to create TFC with in/out-of order // according to whether the query's Scorer. - TopFieldCollector collector = TopFieldCollector.create(sort, numHits, - true, withScore(), withMaxScore(), false); - searcher.search(q, collector); - hits = collector.topDocs(); + if (doOldSortAPI) { + OneSortNoScoreCollector c = new OneSortNoScoreCollector(sort.getSort()[0].getField(), + numHits, + sortSource); + + searcher.search(q, c); + oldHits = c.getTop(); + totalHits = c.getTotalHits(); + } else { + TopFieldCollector collector = TopFieldCollector.create(sort, numHits, + true, withScore(), withMaxScore(), false); + searcher.search(q, collector); + hits = collector.topDocs(); + totalHits = hits.totalHits; + } } else { hits = searcher.search(q, numHits); + totalHits = hits.totalHits; } - //System.out.println("q=" + q + ":" + hits.totalHits + " total hits"); + if (first) { + first = false; + System.out.println("NUMHITS=" + totalHits); + System.out.println("MAXDOC=" + searcher.getIndexReader().maxDoc()); + System.out.println("NUMDOCS=" + searcher.getIndexReader().numDocs()); + if (hits != null) { + for(int i=0;i 0; + } + + return doc1.doc > doc2.doc; + } +} Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ScoreDocComparatorQueue.java ___________________________________________________________________ Added: svn:eol-style + native Index: 
contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (revision 826708) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (working copy) @@ -266,7 +266,7 @@ } } - public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { + synchronized public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { String[] tuple = parser.next(); docData.clear(); docData.setName(tuple[ID]); Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (revision 826781) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (working copy) @@ -92,10 +92,12 @@ fields = new HashMap(); // Initialize the map with the default fields. 
- fields.put(BODY_FIELD, new Field(BODY_FIELD, "", store, bodyIndex, termVector)); + // nocommit -- need separate control on whether body + // is stored: + fields.put(BODY_FIELD, new Field(BODY_FIELD, "", Store.NO, bodyIndex, termVector)); fields.put(TITLE_FIELD, new Field(TITLE_FIELD, "", store, index, termVector)); fields.put(DATE_FIELD, new Field(DATE_FIELD, "", store, index, termVector)); - fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); + fields.put(ID_FIELD, new Field(ID_FIELD, "", Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector)); doc = new Document(); @@ -210,7 +212,7 @@ bdy = body.substring(0, size); // use part docData.setBody(body.substring(size)); // some left } - Field bodyField = ds.getField(BODY_FIELD, storeVal, bodyIndexVal, termVecVal); + Field bodyField = ds.getField(BODY_FIELD, Store.NO, bodyIndexVal, termVecVal); bodyField.setValue(bdy); doc.add(bodyField); @@ -234,7 +236,7 @@ } } - //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n=========="); + // System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n=========="); return doc; } @@ -391,6 +393,8 @@ indexVal = norms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS; bodyIndexVal = bodyNorms ? 
Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS; } + // nocommit + bodyIndexVal = Index.ANALYZED; boolean termVecPositions = config.get("doc.term.vector.positions", false); boolean termVecOffsets = config.get("doc.term.vector.offsets", false); if (termVecPositions && termVecOffsets) { Index: contrib/benchmark/sortBench.py =================================================================== --- contrib/benchmark/sortBench.py (revision 0) +++ contrib/benchmark/sortBench.py (revision 0) @@ -0,0 +1,478 @@ +import types +import re +import time +import os +import shutil +import sys +import cPickle +import datetime + +# TODO +# - build wiki/random index as needed (balanced or not, varying # segs, docs) +# - verify step +# - run searches +# - get all docs query in here + +if sys.platform.lower().find('darwin') != -1: + osName = 'osx' +elif sys.platform.lower().find('win') != -1: + osName = 'windows' +elif sys.platform.lower().find('linux') != -1: + osName = 'linux' +else: + osName = 'unix' + +# let shell find it: +JAVA_COMMAND = 'java -Xms1024M -Xmx1024M -Xbatch -server' + +INDEX_NUM_THREADS = 4 + +INDEX_NUM_DOCS = 1000000 + +LOG_DIR = 'logs' + +DO_BALANCED = False + +if osName == 'osx': + WIKI_FILE = '/x/lucene/enwiki-20090724-pages-articles.xml.bz2' + INDEX_DIR_BASE = '/lucene' +else: + WIKI_FILE = '/x/lucene/enwiki-20090724-pages-articles.xml.bz2' + INDEX_DIR_BASE = '/x/lucene' + +NUM_ROUND = 7 + +if 0: + print 'compile...' 
+ if '-nocompile' not in sys.argv: + if os.system('ant compile > compile.log 2>&1') != 0: + raise RuntimeError('compile failed (see compile.log)') + +BASE_SEARCH_ALG = ''' +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer +directory=FSDirectory +work.dir = $INDEX$ +search.num.hits = $NUM_HITS$ +query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker +file.query.maker.file = queries.txt +log.queries=true +log.step=100000 + +OpenReader +{"XSearchWarm" $SEARCH$} +$ROUNDS$ +CloseReader +RepSumByPrefRound XSearch +''' + +BASE_INDEX_ALG = ''' +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer + +$OTHER$ + +doc.stored = true +doc.term.vector = false +log.step.AddDoc=10000 + +directory=FSDirectory +autocommit=false +compound=false + +work.dir=$WORKDIR$ + +{ "BuildIndex" + - CreateIndex + $INDEX_LINE$ + - CloseIndex +} + +RepSumByPrefRound BuildIndex +''' + +class RunAlgs: + + def __init__(self, resultsPrefix): + self.counter = 0 + self.results = [] + self.fOut = open('%s.txt' % resultsPrefix, 'wb') + + def makeIndex(self, source, numDocs, balancedNumSegs=None): + + if source not in ('wiki', 'random'): + raise RuntimeError('source must be wiki or random') + + indexName = 'work.%s.nd%gM' % (source, numDocs/1000000.0) + if balancedNumSegs is not None: + indexName += '_balanced%d' % balancedNumSegs + fullIndexPath = '%s/%s' % (INDEX_DIR_BASE, indexName) + + if os.path.exists(fullIndexPath): + print 'Index %s already exists...' % fullIndexPath + return indexName + + print 'Now create index %s...' 
% fullIndexPath + + s = BASE_INDEX_ALG + + if source == 'wiki': + other = '''content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource +docs.file=%s +doc.tokenized = false +''' % WIKI_FILE + else: + other = '''doc.index.props = true +doc.tokenized = false +doc.body.tokenized = false +content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource +''' + if INDEX_NUM_THREADS > 1: + other += 'doc.reuse.fields=false\n' + s = s.replace('$INDEX_LINE$', '[ { "AddDocs" AddDoc > : %s } : %s' % \ + (numDocs/INDEX_NUM_THREADS, INDEX_NUM_THREADS)) + else: + s = s.replace('$INDEX_LINE$', '{ "AddDocs" AddDoc > : %s' % \ + numDocs) + + s = s.replace('$WORKDIR$', fullIndexPath) + + if balancedNumSegs is not None: + other += ''' merge.factor=1000 + max.buffered=%d + ram.flush.mb=2000 + ''' % (numDocs/balancedNumSegs) + else: + if source == 'random': + other += 'ram.flush.mb=1.0\n' + else: + other += 'ram.flush.mb=32.0\n' + + s = s.replace('$OTHER$', other) + + try: + self.runOne(s, 'index_%s' % indexName, isIndex=True) + except: + if os.path.exists(fullIndexPath): + shutil.rmtree(fullIndexPath) + raise + return indexName + + def getLogPrefix(self, **dArgs): + l = dArgs.items() + l.sort() + return '_'.join(['%s=%s' % tup for tup in l]) + + def runOne(self, alg, logFileName, indexNumDocs=None, queries=None, verify=False, isIndex=False): + + if queries is not None: + if type(queries) in types.StringTypes: + queries = [queries] + open('queries.txt', 'wb').write('\n'.join(queries)) + + algFile = 'tmp.%s.alg' % os.getpid() + open(algFile, 'wb').write(alg) + + fullLogFileName = '%s/%s' % (LOG_DIR, logFileName) + print ' log: %s' % fullLogFileName + + command = '%s -classpath 
../../build/classes/java:../../build/classes/demo:../../build/contrib/highlighter/classes/java:lib/commons-digester-1.7.jar:lib/commons-collections-3.1.jar:lib/commons-compress-1.0.jar:lib/commons-logging-1.0.4.jar:lib/commons-beanutils-1.7.0.jar:lib/xerces-2.9.0.jar:lib/xml-apis-2.9.0.jar:../../build/contrib/benchmark/classes/java org.apache.lucene.benchmark.byTask.Benchmark %s > %s 2>&1' % (JAVA_COMMAND, algFile, fullLogFileName) + + try: + t0 = time.time() + if os.system(command) != 0: + raise RuntimeError('FAILED') + t1 = time.time() + finally: + os.remove(algFile) + + if isIndex: + s = open(fullLogFileName, 'rb').read() + if s.find('Exception in thread "') != -1 or s.find('at org.apache.lucene') != -1: + raise RuntimeError('alg hit exceptions') + return + + else: + + # Parse results: + bestQPS = None + count = 0 + nhits = None + ndocs = None + warmTime = None + r = re.compile('^ ([0-9]+): (.*)$') + topN = [] + + for line in open(fullLogFileName, 'rb').readlines(): + m = r.match(line.rstrip()) + if m is not None: + topN.append(m.group(2)) + if line.startswith('NUMHITS='): + nhits = int(line[8:].strip()) + if line.startswith('NUMDOCS='): + ndocs = int(line[8:].strip()) + if line.startswith('XSearchWarm'): + v = line.strip().split() + warmTime = float(v[5]) + if line.startswith('XSearchReal'): + v = line.strip().split() + # print len(v), v + upto = 0 + i = 0 + qps = None + while i < len(v): + if v[i] == '-': + i += 1 + continue + else: + upto += 1 + i += 1 + if upto == 5: + qps = float(v[i-1].replace(',', '')) + break + + if qps is None: + raise RuntimeError('did not find qps') + + count += 1 + if bestQPS is None or qps > bestQPS: + bestQPS = qps + + if not verify: + if count != NUM_ROUND: + raise RuntimeError('did not find %s rounds (got %s)' % (NUM_ROUND, count)) + else: + bestQPS = 1.0 + warmTime = None + + if warmTime is None: + raise RuntimeError('did not find warm time') + + if nhits is None: + raise RuntimeError('did not see NUMHITS=line') + + if ndocs is 
None: + raise RuntimeError('did not see NUMDOCS=line') + + if ndocs != indexNumDocs: + raise RuntimeError('indexNumDocs mismatch: expected %d but got %d' % (indexNumDocs, ndocs)) + + return nhits, warmTime, bestQPS, topN + + def getAlg(self, indexPath, searchTask, numHits, verify=False): + + s = BASE_SEARCH_ALG + + if not verify: + s = s.replace('$ROUNDS$', + ''' + { "Rounds" + { "Run" + { "TestSearchSpeed" + { "XSearchReal" $SEARCH$ > : 3.0s + } + NewRound + } : %d + } + ''' % NUM_ROUND) + else: + s = s.replace('$ROUNDS$', '') + + s = s.replace('$INDEX$', indexPath) + s = s.replace('$SEARCH$', searchTask) + s = s.replace('$NUM_HITS$', str(numHits)) + + return s + + def compare(self, baseline, new, *params): + + if new[0] != baseline[0]: + raise RuntimeError('baseline found %d hits but new found %d hits' % (baseline[0], new[0])) + + qpsOld = baseline[2] + qpsNew = new[2] + pct = 100.0*(qpsNew-qpsOld)/qpsOld + print ' diff: %.1f%%' % pct + self.results.append((qpsOld, qpsNew, params)) + + self.fOut.write('|%s|%.2f|%.2f|%.1f%%|\n' % \ + ('|'.join(str(x) for x in params), + qpsOld, qpsNew, pct)) + self.fOut.flush() + + def save(self, name): + f = open('%s.pk' % name, 'wb') + cPickle.dump(self.results, f) + f.close() + +def verify(r1, r2): + if r1[0] != r2[0]: + raise RuntimeError('different total hits: %s vs %s' % (r1[0], r2[0])) + + h1 = r1[3] + h2 = r2[3] + if len(h1) != len(h2): + raise RuntimeError('different number of results') + else: + for i in range(len(h1)): + s1 = h1[i].replace('score=NaN', 'score=na') + s2 = h2[i].replace('score=NaN', 'score=na') + if s1 != s2: + raise RuntimeError('hit %s differs: %s vs %s' % (i, s1 ,s2)) + +def usage(): + print + print 'Usage: python -u %s -run <name> | -report <name>' % sys.argv[0] + print + print ' -run <name> runs all tests, saving results to file <name>.pk' + print ' -report <name> opens <name>.pk and prints Jira table' + print + sys.exit(1) + +def main(): + + if not os.path.exists(LOG_DIR): + os.makedirs(LOG_DIR) + + if '-run' in sys.argv: + i =
sys.argv.index('-run') + mode = 'run' + if i < len(sys.argv)-1: + name = sys.argv[1+i] + else: + usage() + elif '-report' in sys.argv: + i = sys.argv.index('-report') + mode = 'report' + if i < len(sys.argv)-1: + name = sys.argv[1+i] + else: + usage() + else: + usage() + + if mode == 'run': + run(name) + else: + report(name) + +def report(name): + + print '||Seg size||Query||Tot hits||Sort||Top N||QPS old||QPS new||Pct change||' + + results = cPickle.load(open('%s.pk' % name)) + for qpsOld, qpsNew, params in results: + pct = 100.0*(qpsNew-qpsOld)/qpsOld + if pct < 0.0: + c = 'red' + else: + c = 'green' + + if not DO_BALANCED and params[0] == 'balanced': + continue + + params = list(params) + sort = params[3] + sort = sort.replace(':string', '') + sort = sort.replace('doctitle', 'title') + sort = sort.replace('sort_field:int', 'rand int') + sort = sort.replace('random_string', 'rand string') + params[3] = sort + + query = params[1] + if query == '*:*': + query = '' + params[1] = query + + pct = '{color:%s}%.1f%%{color}' % (c, pct) + print '|%s|%.2f|%.2f|%s|' % \ + ('|'.join(str(x) for x in params), + qpsOld, qpsNew, pct) + +def run(name): + + r = RunAlgs(name) + + if not os.path.exists(WIKI_FILE): + print + print 'NOTE: wiki source file "%s" does not exist; skipping wikipedia index tests (edit WIKI_FILE in this script & restart if this is wrong)' % WIKI_FILE + print + doWiki = False + else: + doWiki = True + print + + if DO_BALANCED: + balancedTup = (None, 20) + else: + balancedTup = (None,) + + indexes = {} + for source in ('wiki', 'random'): + if source != 'wiki' or doWiki: + for balanced in balancedTup: + #indexes[(source, balanced)] = r.makeIndex(source, 2000000, balancedNumSegs=balanced) + indexes[(source, balanced)] = r.makeIndex(source, INDEX_NUM_DOCS, balancedNumSegs=balanced) + + for balanced in balancedTup: + if doWiki: + sources = ('wiki', 'random') + else: + sources = ('random',) + + for source in sources: + if source == 'random': + queries = ('*:*',) + 
else: + queries = ('1', '*:*') + + for query in queries: + if source == 'random': + sorts = ('random_string:string', 'country:string', 'sort_field:int') + else: + sorts = ('doctitle:string',) + for sort in sorts: + for numHits in (10, 25, 50, 100, 500, 1000): + + if balanced is None: + s = 'log' + else: + s = 'balanced' + + print '\nRUN: balanced=%s source=%s query=%s sort=%s nhits=%d' % \ + (s, source, query, sort, numHits) + + prefix = r.getLogPrefix(balanced=balanced, source=source, query=query, sort=sort, numHits=numHits) + indexPath = '%s/%s' % (INDEX_DIR_BASE, indexes[(source, balanced)]) + + # singlePQ + s = r.getAlg(indexPath, + 'SearchWithSort(%s,noscore,nomaxscore)' % sort, + numHits) + singlePQ = r.runOne(s, 'singlePQ_%s' % prefix, INDEX_NUM_DOCS, query) + + # multiPQ + s = r.getAlg(indexPath, + 'SearchWithOldSort(%s)' % sort, + numHits) + s = 'old.sort.api=true\n' + s + multiPQ = r.runOne(s, 'multiPQ_%s' % prefix, INDEX_NUM_DOCS, query) + + verify(singlePQ, multiPQ) + + print ' %d hits' % singlePQ[0] + + if balanced is None: + bs = 'log' + else: + bs = 'balanced' + + r.compare(singlePQ, multiPQ, + bs, query, singlePQ[0], sort, numHits) + r.save(name) + +def cleanScores(l): + for i in range(len(l)): + pos = l[i].find(' score=') + l[i] = l[i][:pos].strip() + +if __name__ == '__main__': + main() Property changes on: contrib/benchmark/sortBench.py ___________________________________________________________________ Added: svn:special + *