Index: contrib/benchmark/conf/tokenize.alg
===================================================================
--- contrib/benchmark/conf/tokenize.alg (revision 0)
+++ contrib/benchmark/conf/tokenize.alg (revision 0)
@@ -0,0 +1,35 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+
+#
+# This alg reads all tokens out of a document but does not index them.
+# This is useful for benchmarking tokenizers.
+#
+# To use this, cd to contrib/benchmark and then run:
+#
+# ant run-task -Dtask.alg=conf/tokenize.alg
+#
+
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+doc.maker.forever=false
+
+
+#
+-------------------------------------------------------------------------------------
+
+{ReadTokens > : *
Property changes on: contrib/benchmark/conf/tokenize.alg
___________________________________________________________________
Name: svn:eol-style
+ native
Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
===================================================================
--- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (revision 559909)
+++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (working copy)
@@ -220,6 +220,27 @@
lineFile.delete();
}
+ /**
+ * Test ReadTokensTask
+ */
+ public void testReadTokens() throws Exception {
+
+ // We will call WriteLineDocs this many times
+ final int NUM_TRY_DOCS = 500;
+
+ // Creates a line file with first 500 docs from reuters
+ String algLines1[] = {
+ "# ----- properties ",
+ "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
+ "doc.maker.forever=false",
+ "# ----- alg ",
+ "{ReadTokens > : " + NUM_TRY_DOCS,
+ };
+
+ // Run algo
+ Benchmark benchmark = execBenchmark(algLines1);
+ }
+
// create the benchmark and execute it.
private Benchmark execBenchmark(String[] algLines) throws Exception {
String algText = algLinesToText(algLines);
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java (revision 559909)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java (working copy)
@@ -78,7 +78,7 @@
if (reportStats && depth <= maxDepthLogStart && !shouldNeverLogAtStart()) {
System.out.println("------------> starting task: " + getName());
}
- if (shouldNotRecordStats() || !reportStats) {
+ if (!reportStats || shouldNotRecordStats()) {
setup();
int count = doLogic();
tearDown();
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (revision 0)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (revision 0)
@@ -0,0 +1,165 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import java.text.NumberFormat;
+import java.io.Reader;
+import java.util.List;
+
+
+/**
+ * Simple task to test performance of tokenizers. It just
+ * creates a token stream for each field of the document and
+ * read all tokens out of that stream.
+ *
+ * <br>Relevant properties: <code>doc.tokenize.log.step</code>.
+ */
+public class ReadTokensTask extends PerfTask {
+
+ /**
+ * Default value for property doc.tokenize.log.step - indicating how often
+ * an "added N docs / M tokens" message should be logged.
+ */
+ public static final int DEFAULT_DOC_LOG_STEP = 500;
+
+ public ReadTokensTask(PerfRunData runData) {
+ super(runData);
+ }
+
+ private int logStep = -1;
+ int count = 0;
+ long tokenCount = 0;
+
+ // volatile data passed between setup(), doLogic(), tearDown().
+ private Document doc = null;
+
+ /*
+ * (non-Javadoc)
+ * @see PerfTask#setup()
+ */
+ public void setup() throws Exception {
+ super.setup();
+ DocMaker docMaker = getRunData().getDocMaker();
+ doc = docMaker.makeDocument();
+ }
+
+ /* (non-Javadoc)
+ * @see PerfTask#tearDown()
+ */
+ public void tearDown() throws Exception {
+ log(++count);
+ doc = null;
+ super.tearDown();
+ }
+
+ Token token = new Token("", 0, 0);
+
+ public int doLogic() throws Exception {
+ List fields = doc.getFields();
+ final int numField = fields.size();
+ Analyzer analyzer = getRunData().getAnalyzer();
+    for(int i=0;i<numField;i++) {
+      final Field field = (Field) fields.get(i);
+      final TokenStream stream;
+      final TokenStream streamValue = field.tokenStreamValue();
+
+      if (streamValue != null)
+        stream = streamValue;
+      else {
+        // the field does not have a TokenStream,
+        // so we have to obtain one from the analyzer
+        final Reader reader;
+        final Reader readerValue = field.readerValue();
+
+        if (readerValue != null)
+          reader = readerValue;
+        else {
+          String stringValue = field.stringValue();
+          stringReader.init(stringValue);
+          reader = stringReader;
+        }
+
+        // Tokenize field
+        stream = analyzer.reusableTokenStream(field.name(), reader);
+      }
+
+      // reset the TokenStream to the first token
+      stream.reset();
+
+      while(stream.next(token) != null)
+        tokenCount++;
+    }
+    return 1;
+  }
+
+  private void log(int count) {
+    if (logStep<0) {
+      // init once per instance
+      logStep = getRunData().getConfig().get("doc.tokenize.log.step", DEFAULT_DOC_LOG_STEP);
+    }
+    if (logStep>0 && (count%logStep)==0) {
+ double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
+ NumberFormat nf = NumberFormat.getInstance();
+ nf.setMaximumFractionDigits(2);
+ System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs" + "; " + tokenCount + " tokens");
+ }
+ }
+
+ /* Simple StringReader that can be reset to a new string;
+ * we use this when tokenizing the string value from a
+ * Field. */
+ ReusableStringReader stringReader = new ReusableStringReader();
+
+ private final static class ReusableStringReader extends Reader {
+ int upto;
+ int left;
+ String s;
+ void init(String s) {
+ this.s = s;
+ left = s.length();
+ this.upto = 0;
+ }
+ public int read(char[] c) {
+ return read(c, 0, c.length);
+ }
+ public int read(char[] c, int off, int len) {
+ if (left > len) {
+ s.getChars(upto, upto+len, c, off);
+ upto += len;
+ left -= len;
+ return len;
+ } else if (0 == left) {
+ return -1;
+ } else {
+ s.getChars(upto, upto+left, c, off);
+ int r = left;
+ left = 0;
+ upto = s.length();
+ return r;
+ }
+ }
+ public void close() {};
+ }
+}
Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
___________________________________________________________________
Name: svn:eol-style
+ native
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (revision 559909)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (working copy)
@@ -100,15 +100,19 @@
int count = 0;
boolean exhausted = false;
+
+ final int numTasks = tasks.size();
+ final PerfTask[] tasksArray = new PerfTask[numTasks];
+    for(int k=0;k<numTasks;k++)
+      tasksArray[k] = (PerfTask) tasks.get(k);
+
     while (!exhausted && (repetitions==REPEAT_EXHAUST || count<repetitions)) {
-      for (Iterator it = tasks.iterator(); it.hasNext();) {
-        PerfTask task = (PerfTask) it.next();
+      for(int l=0;l<numTasks;l++) {
+        PerfTask task = tasksArray[l];
         try {
           count += task.runAndMaybeStats(letChildReport);
         } catch (NoMoreDataException e) {
           exhausted = true;
         }
       }
     }
     return count;
   }
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html	(revision 559909)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html	(working copy)
@@ -215,6 +215,7 @@
 doc.delete.log.step
 log.queries
 task.max.depth.log
+doc.tokenize.log.step