Index: contrib/benchmark/conf/tokenize.alg =================================================================== --- contrib/benchmark/conf/tokenize.alg (revision 0) +++ contrib/benchmark/conf/tokenize.alg (revision 0) @@ -0,0 +1,35 @@ +#/** +# * Licensed to the Apache Software Foundation (ASF) under one or more +# * contributor license agreements. See the NOTICE file distributed with +# * this work for additional information regarding copyright ownership. +# * The ASF licenses this file to You under the Apache License, Version 2.0 +# * (the "License"); you may not use this file except in compliance with +# * the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ +# ------------------------------------------------------------------------------------- + +# +# This alg reads all tokens out of a document but does not index them. +# This is useful for benchmarking tokenizers. 
+# +# To use this, cd to contrib/benchmark and then run: +# +# ant run-task -Dtask.alg=conf/tokenize.alg +# + +doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +doc.maker.forever=false + + +# +------------------------------------------------------------------------------------- + +{ReadTokens > : * Property changes on: contrib/benchmark/conf/tokenize.alg ___________________________________________________________________ Name: svn:eol-style + native Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (revision 559909) +++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (working copy) @@ -220,6 +220,27 @@ lineFile.delete(); } + /** + * Test ReadTokensTask + */ + public void testReadTokens() throws Exception { + + // We will call ReadTokens this many times + final int NUM_TRY_DOCS = 500; + + // Reads all tokens from the first 500 docs from reuters + String algLines1[] = { + "# ----- properties ", + "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker", + "doc.maker.forever=false", + "# ----- alg ", + "{ReadTokens > : " + NUM_TRY_DOCS, + }; + + // Run algo + Benchmark benchmark = execBenchmark(algLines1); + } + // create the benchmark and execute it. 
private Benchmark execBenchmark(String[] algLines) throws Exception { String algText = algLinesToText(algLines); Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (revision 0) @@ -0,0 +1,165 @@ +package org.apache.lucene.benchmark.byTask.tasks; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.benchmark.byTask.feeds.DocMaker; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import java.text.NumberFormat; +import java.io.Reader; +import java.util.List; + + +/** + * Simple task to test performance of tokenizers. It just + * creates a token stream for each field of the document and + * reads all tokens out of that stream. + *
Relevant properties: doc.tokenize.log.step. + */ +public class ReadTokensTask extends PerfTask { + + /** + * Default value for property doc.tokenize.log.step - indicating how often + * an "added N docs / M tokens" message should be logged. + */ + public static final int DEFAULT_DOC_LOG_STEP = 500; + + public ReadTokensTask(PerfRunData runData) { + super(runData); + } + + private int logStep = -1; + int count = 0; + long tokenCount = 0; + + // volatile data passed between setup(), doLogic(), tearDown(). + private Document doc = null; + + /* + * (non-Javadoc) + * @see PerfTask#setup() + */ + public void setup() throws Exception { + super.setup(); + DocMaker docMaker = getRunData().getDocMaker(); + doc = docMaker.makeDocument(); + } + + /* (non-Javadoc) + * @see PerfTask#tearDown() + */ + public void tearDown() throws Exception { + log(++count); + doc = null; + super.tearDown(); + } + + Token token = new Token("", 0, 0); + + public int doLogic() throws Exception { + List fields = doc.getFields(); + final int numField = fields.size(); + Analyzer analyzer = getRunData().getAnalyzer(); + for(int i=0;i0 && (count%logStep)==0) { + double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0; + NumberFormat nf = NumberFormat.getInstance(); + nf.setMaximumFractionDigits(2); + System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs" + "; " + tokenCount + " tokens"); + } + } + + /* Simple StringReader that can be reset to a new string; + * we use this when tokenizing the string value from a + * Field. 
*/ + ReusableStringReader stringReader = new ReusableStringReader(); + + private final static class ReusableStringReader extends Reader { + int upto; + int left; + String s; + void init(String s) { + this.s = s; + left = s.length(); + this.upto = 0; + } + public int read(char[] c) { + return read(c, 0, c.length); + } + public int read(char[] c, int off, int len) { + if (left > len) { + s.getChars(upto, upto+len, c, off); + upto += len; + left -= len; + return len; + } else if (0 == left) { + return -1; + } else { + s.getChars(upto, upto+left, c, off); + int r = left; + left = 0; + upto = s.length(); + return r; + } + } + public void close() {}; + } +} Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java ___________________________________________________________________ Name: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java (revision 559909) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java (working copy) @@ -69,6 +69,7 @@ throw new IllegalStateException("Benchmark was already executed"); } executed = true; + runData.setStartTimeMillis(); algorithm.execute(); } @@ -111,10 +112,10 @@ e.printStackTrace(); } + System.out.println("\nNet elapsed time: " + ((System.currentTimeMillis()-benchmark.runData.getStartTimeMillis())/1000.0) + " sec"); System.out.println("####################"); System.out.println("### D O N E !!! 
###"); System.out.println("####################"); - } /** Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java (revision 559909) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java (working copy) @@ -137,8 +137,11 @@ // release unused stuff System.runFinalization(); System.gc(); - + } + + public long setStartTimeMillis() { startTimeMillis = System.currentTimeMillis(); + return startTimeMillis; } /** Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html (revision 559909) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html (working copy) @@ -534,6 +534,7 @@
  • doc.delete.log.step
  • log.queries
  • task.max.depth.log +
  • doc.tokenize.log.step