Index: contrib/benchmark/conf/tokenize.alg
===================================================================
--- contrib/benchmark/conf/tokenize.alg	(revision 0)
+++ contrib/benchmark/conf/tokenize.alg	(revision 0)
@@ -0,0 +1,36 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+
+#
+# This alg reads all tokens out of a document but does not index them.
+# This is useful for benchmarking tokenizers.
+#
+# To use this, cd to contrib/benchmark and then run:
+#
+#   ant run-task -Dtask.alg=conf/tokenize.alg
+#
+
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+doc.maker.forever=false
+
+
+#
+# -------------------------------------------------------------------------------------
+
+{ReadTokens}: *
+RepSumByName
\ No newline at end of file

Property changes on: contrib/benchmark/conf/tokenize.alg
___________________________________________________________________
Name: svn:eol-style
   + native

Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
===================================================================
--- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java	(revision 561576)
+++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java	(working copy)
@@ -21,14 +21,19 @@
 import java.io.File;
 import java.io.FileReader;
 import java.io.BufferedReader;
+import java.util.List;
+import java.util.Iterator;
 
 import org.apache.lucene.benchmark.byTask.Benchmark;
 import org.apache.lucene.benchmark.byTask.feeds.DocData;
 import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
 import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
 import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
+import org.apache.lucene.benchmark.byTask.stats.TaskStats;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermDocs;
 
 import junit.framework.TestCase;
 
@@ -223,6 +228,60 @@
     lineFile.delete();
   }
 
+  /**
+   * Test ReadTokensTask
+   */
+  public void testReadTokens() throws Exception {
+
+    // We will call ReadTokens on this many docs
+    final int NUM_DOCS = 100;
+
+    // Read tokens from first NUM_DOCS docs from Reuters and
+    // then build index from the same docs
+    String algLines1[] = {
+      "# ----- properties ",
+      "analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer",
+      "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
+      "# ----- alg ",
+      "{ReadTokens}: " + NUM_DOCS,
+      "ResetSystemErase",
+      "CreateIndex",
+      "{AddDoc}: " + NUM_DOCS,
+      "CloseIndex",
+    };
+
+    // Run algo
+    Benchmark benchmark = execBenchmark(algLines1);
+
+    List stats = benchmark.getRunData().getPoints().taskStats();
+
+    // Count how many tokens all ReadTokens saw
+    int totalTokenCount1 = 0;
+    for (Iterator it = stats.iterator(); it.hasNext();) {
+      TaskStats stat = (TaskStats) it.next();
+      if (stat.getTask().getName().equals("ReadTokens")) {
+        totalTokenCount1 += stat.getCount();
+      }
+    }
+
+    // Separately count how many tokens are actually in the index:
+    IndexReader reader = IndexReader.open(benchmark.getRunData().getDirectory());
+    assertEquals(NUM_DOCS, reader.numDocs());
+
+    TermEnum terms = reader.terms();
+    TermDocs termDocs = reader.termDocs();
+    int totalTokenCount2 = 0;
+    while(terms.next()) {
+      termDocs.seek(terms.term());
+      while(termDocs.next())
+        totalTokenCount2 += termDocs.freq();
+    }
+    reader.close();
+
+    // Make sure they are the same
+    assertEquals(totalTokenCount1, totalTokenCount2);
+  }
+
   // create the benchmark and execute it.
   public static Benchmark execBenchmark(String[] algLines) throws Exception {
     String algText = algLinesToText(algLines);

Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java	(revision 561576)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java	(working copy)
@@ -78,7 +78,7 @@
     if (reportStats && depth <= maxDepthLogStart && !shouldNeverLogAtStart()) {
       System.out.println("------------> starting task: " + getName());
     }
-    if (shouldNotRecordStats() || !reportStats) {
+    if (!reportStats || shouldNotRecordStats()) {
       setup();
       int count = doLogic();
       tearDown();

Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java	(revision 0)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java	(revision 0)
@@ -0,0 +1,168 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import java.text.NumberFormat;
+import java.io.Reader;
+import java.util.List;
+
+
+/**
+ * Simple task to test performance of tokenizers.  It just
+ * creates a token stream for each field of the document and
+ * reads all tokens out of that stream.
+ * <br>Relevant properties: <code>doc.tokenize.log.step</code>.
+ */
+public class ReadTokensTask extends PerfTask {
+
+  /**
+   * Default value for property doc.tokenize.log.step - indicating how often
+   * an "added N docs / M tokens" message should be logged.
+   */
+  public static final int DEFAULT_DOC_LOG_STEP = 500;
+
+  public ReadTokensTask(PerfRunData runData) {
+    super(runData);
+  }
+
+  private int logStep = -1;
+  int count = 0;
+  int totalTokenCount = 0;
+
+  // volatile data passed between setup(), doLogic(), tearDown().
+  private Document doc = null;
+
+  /*
+   * (non-Javadoc)
+   * @see PerfTask#setup()
+   */
+  public void setup() throws Exception {
+    super.setup();
+    DocMaker docMaker = getRunData().getDocMaker();
+    doc = docMaker.makeDocument();
+  }
+
+  /* (non-Javadoc)
+   * @see PerfTask#tearDown()
+   */
+  public void tearDown() throws Exception {
+    log(++count);
+    doc = null;
+    super.tearDown();
+  }
+
+  Token token = new Token("", 0, 0);
+
+  public int doLogic() throws Exception {
+    List fields = doc.getFields();
+    final int numField = fields.size();
+    Analyzer analyzer = getRunData().getAnalyzer();
+    int tokenCount = 0;
+    for(int i=0;i<numField;i++) {
+      final Field field = (Field) fields.get(i);
+      final TokenStream stream;
+      final TokenStream streamValue = field.tokenStreamValue();
+
+      if (streamValue != null)
+        stream = streamValue;
+      else {
+        // the field does not have a TokenStream,
+        // so we have to obtain one from the analyzer
+        final Reader reader;			  // find or make Reader
+        final Reader readerValue = field.readerValue();
+
+        if (readerValue != null)
+          reader = readerValue;
+        else {
+          String stringValue = field.stringValue();
+          stringReader.init(stringValue);
+          reader = stringReader;
+        }
+
+        // Tokenize field
+        stream = analyzer.reusableTokenStream(field.name(), reader);
+      }
+
+      // reset the TokenStream to the first token
+      stream.reset();
+
+      while(stream.next(token) != null)
+        tokenCount++;
+    }
+    totalTokenCount += tokenCount;
+    return tokenCount;
+  }
+
+  private void log(int count) {
+    if (logStep < 0) {
+      // init once per instance
+      logStep = getRunData().getConfig().get("doc.tokenize.log.step", DEFAULT_DOC_LOG_STEP);
+    }
+    if (logStep>0 && (count%logStep)==0) {
+      double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
+      NumberFormat nf = NumberFormat.getInstance();
+      nf.setMaximumFractionDigits(2);
+      System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs" + "; " + totalTokenCount + " tokens");
+    }
+  }
+
+  /* Simple StringReader that can be reset to a new string;
+   * we use this when tokenizing the string value from a
+   * Field. */
+  ReusableStringReader stringReader = new ReusableStringReader();
+
+  private final static class ReusableStringReader extends Reader {
+    int upto;
+    int left;
+    String s;
+    void init(String s) {
+      this.s = s;
+      left = s.length();
+      this.upto = 0;
+    }
+    public int read(char[] c) {
+      return read(c, 0, c.length);
+    }
+    public int read(char[] c, int off, int len) {
+      if (left > len) {
+        s.getChars(upto, upto+len, c, off);
+        upto += len;
+        left -= len;
+        return len;
+      } else if (0 == left) {
+        return -1;
+      } else {
+        s.getChars(upto, upto+left, c, off);
+        int r = left;
+        left = 0;
+        upto = s.length();
+        return r;
+      }
+    }
+    public void close() {}
+  }
+}

Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
___________________________________________________________________
Name: svn:eol-style
   + native

Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java	(revision 561576)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java	(working copy)
@@ -100,15 +100,19 @@
     int count = 0;
     boolean exhausted = false;
+
+    final int numTasks = tasks.size();
+    final PerfTask[] tasksArray = new PerfTask[numTasks];
+    for(int k=0;k<numTasks;k++)
+      tasksArray[k] = (PerfTask) tasks.get(k);
   <li>doc.delete.log.step
   <li>log.queries
   <li>task.max.depth.log
+  <li>doc.tokenize.log.step
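
For reference, the tokenize-and-count loop at the heart of ReadTokensTask.doLogic()
can be exercised standalone. The sketch below is illustrative only and is not part
of the patch: it assumes the Lucene 2.3-era analysis API (Analyzer.reusableTokenStream
and the Token-reuse variant TokenStream.next(Token)), and the TokenCountDemo class
name and the "body" field name are made up for the example:

  import java.io.StringReader;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.WhitespaceAnalyzer;

  public class TokenCountDemo {
    public static void main(String[] args) throws Exception {
      Analyzer analyzer = new WhitespaceAnalyzer();
      // Reuse a single Token instance across next() calls, as ReadTokensTask does
      Token token = new Token("", 0, 0);
      String text = "read all tokens out of a document but do not index them";
      // Obtain a token stream for a hypothetical field name "body"
      TokenStream stream = analyzer.reusableTokenStream("body", new StringReader(text));
      stream.reset();
      int tokenCount = 0;
      while (stream.next(token) != null)
        tokenCount++;
      System.out.println(tokenCount + " tokens");   // prints "12 tokens"
    }
  }

As in the patch, reusing the Token (and, inside ReadTokensTask, the ReusableStringReader)
avoids per-token and per-field allocations, so the measured time stays dominated by the
tokenizer itself rather than by garbage collection.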