Index: contrib/CHANGES.txt =================================================================== --- contrib/CHANGES.txt (revision 784670) +++ contrib/CHANGES.txt (working copy) @@ -8,7 +8,21 @@ API Changes - (None) + 1. LUCENE-1595: DocMaker has been replaced with a concrete class which accepts + a ContentSource for iterating over a content source's documents. Most of the + existing DocMakers were converted into ContentSource implementations, and + DocMaker itself now offers field reuse and a default document-creation + implementation. The new DocMaker is used by default when doc.maker is not + specified in the .alg file. + If you have .alg files which create a DocMaker such as ReutersDocMaker, + change that line to: + content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource + + Also, PerfTask now logs a message in its tearDown(). It reads log.step + to determine how often to log. Additionally, logging was removed from the + individual tasks, and with it the doc.add.log.step and doc.delete.log.step + properties. For easy migration of existing .alg files, rename doc.add.log.step + to log.step and doc.delete.log.step to delete.log.step. (Shai Erera via Mark Miller) Bug fixes Index: contrib/benchmark/conf/analyzer.alg =================================================================== --- contrib/benchmark/conf/analyzer.alg (revision 784670) +++ contrib/benchmark/conf/analyzer.alg (working copy) @@ -30,13 +30,12 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=500 +log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/autoCommit.alg =================================================================== --- contrib/benchmark/conf/autoCommit.alg (revision 784670) +++ contrib/benchmark/conf/autoCommit.alg (working copy) @@ -38,7 +38,7 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=5000 +log.step=5000 docs.file=temp/enwiki-20070527-pages-articles.xml Index: contrib/benchmark/conf/compound-penalty.alg =================================================================== --- contrib/benchmark/conf/compound-penalty.alg (revision 784670) +++ contrib/benchmark/conf/compound-penalty.alg (working copy) @@ -34,14 +34,13 @@ doc.stored=stored:true:true:false:false doc.tokenized=true doc.term.vector=vector:true:true:false:false -doc.add.log.step=500 -doc.delete.log.step=100 +log.step=500 +delete.log.step=100 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/createLineFile.alg =================================================================== --- contrib/benchmark/conf/createLineFile.alg (revision 784670) +++ contrib/benchmark/conf/createLineFile.alg (working copy) @@ -29,13 +29,13 @@ # # Where to get documents from: -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource # Where to write the line file output: line.file.out=work/reuters.lines.txt # Stop after processing the document feed once: -doc.maker.forever=false +content.source.forever=false # ------------------------------------------------------------------------------------- Index: contrib/benchmark/conf/deletes.alg =================================================================== --- contrib/benchmark/conf/deletes.alg (revision 784670) +++ contrib/benchmark/conf/deletes.alg (working copy) @@ -32,14 +32,14 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=10000 -doc.delete.log.step=100 +log.step=10000 +delete.log.step=100 docs.dir=reuters-out #docs.dir=reuters-111 -doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +#content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker #query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/extractWikipedia.alg =================================================================== --- contrib/benchmark/conf/extractWikipedia.alg (revision 784670) +++ contrib/benchmark/conf/extractWikipedia.alg (working copy) @@ -36,7 +36,7 @@ line.file.out=work/enwiki.txt # Stop after processing the document feed once: -doc.maker.forever=false +content.source.forever=false # ------------------------------------------------------------------------------------- Index: contrib/benchmark/conf/highlight-profile.alg =================================================================== --- contrib/benchmark/conf/highlight-profile.alg (revision 784670) +++ contrib/benchmark/conf/highlight-profile.alg (working copy) @@ -28,11 +28,11 @@ doc.term.vector=true doc.term.vector.offsets=true doc.term.vector.positions=true -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/indexLineFile.alg =================================================================== --- contrib/benchmark/conf/indexLineFile.alg (revision 784670) +++ contrib/benchmark/conf/indexLineFile.alg (working copy) @@ -38,7 +38,7 @@ docs.file=work/reuters.lines.txt # Process documents only once: -doc.maker.forever=false +content.source.forever=false # ------------------------------------------------------------------------------------- Index: contrib/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg =================================================================== --- contrib/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg (revision 784670) +++ contrib/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg (working copy) @@ -30,13 +30,13 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource 
#query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/indexing-flush-by-RAM.alg =================================================================== --- contrib/benchmark/conf/indexing-flush-by-RAM.alg (revision 784670) +++ contrib/benchmark/conf/indexing-flush-by-RAM.alg (working copy) @@ -30,13 +30,13 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/indexing-multithreaded.alg =================================================================== --- contrib/benchmark/conf/indexing-multithreaded.alg (revision 784670) +++ contrib/benchmark/conf/indexing-multithreaded.alg (working copy) @@ -30,13 +30,13 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/indexing.alg =================================================================== --- contrib/benchmark/conf/indexing.alg (revision 784670) +++ contrib/benchmark/conf/indexing.alg (working copy) @@ -30,13 +30,13 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/micro-standard-flush-by-ram.alg =================================================================== --- contrib/benchmark/conf/micro-standard-flush-by-ram.alg (revision 784670) +++ contrib/benchmark/conf/micro-standard-flush-by-ram.alg (working copy) @@ -29,13 +29,13 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=500 +log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: 
contrib/benchmark/conf/micro-standard.alg =================================================================== --- contrib/benchmark/conf/micro-standard.alg (revision 784670) +++ contrib/benchmark/conf/micro-standard.alg (working copy) @@ -28,13 +28,13 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=500 +log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/readContentSource.alg =================================================================== --- contrib/benchmark/conf/readContentSource.alg (revision 0) +++ contrib/benchmark/conf/readContentSource.alg (revision 0) @@ -0,0 +1,45 @@ +#/** +# * Licensed to the Apache Software Foundation (ASF) under one or more +# * contributor license agreements. See the NOTICE file distributed with +# * this work for additional information regarding copyright ownership. +# * The ASF licenses this file to You under the Apache License, Version 2.0 +# * (the "License"); you may not use this file except in compliance with +# * the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ +# ------------------------------------------------------------------------------------- + +# +# This alg reads documents from a ContentSource. It is useful for +# measuring the performance of a particular ContentSource implementation, or +# gathering baselines for operations like indexing (if reading from the content +# source takes 'X' time, we cannot index faster).
+# +# To use this, first cd to contrib/benchmark and then run: +# +# ant run-task -Dtask.alg=conf/readContentSource.alg +# + +# Where to get documents from: +content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource +docs.file=temp/enwiki-20070527-pages-articles.xml.bz2 + +# Stop after processing the document feed once: +content.source.forever=false + +# Log messages every: +log.step=100000 + +# ------------------------------------------------------------------------------------- + +# Consume all documents from the content source: +{ ConsumeContentSource } : * + +RepSumByPref ConsumeContentSource Index: contrib/benchmark/conf/sample.alg =================================================================== --- contrib/benchmark/conf/sample.alg (revision 784670) +++ contrib/benchmark/conf/sample.alg (working copy) @@ -40,13 +40,13 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=500 +log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 -doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +#content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker #query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/sloppy-phrase.alg =================================================================== --- contrib/benchmark/conf/sloppy-phrase.alg (revision 784670) +++ contrib/benchmark/conf/sloppy-phrase.alg (working copy) @@ -28,13 +28,13 @@ doc.stored=false doc.tokenized=true doc.term.vector=false -doc.add.log.step=500 +log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 -doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +#content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleSloppyPhraseQueryMaker #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker Index: contrib/benchmark/conf/sort-standard.alg =================================================================== --- contrib/benchmark/conf/sort-standard.alg (revision 784670) +++ contrib/benchmark/conf/sort-standard.alg (working copy) @@ -29,11 +29,11 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=100000 +log.step=100000 docs.dir=reuters-out -doc.maker=org.apache.lucene.benchmark.byTask.feeds.SortableSimpleDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker Index: contrib/benchmark/conf/standard-flush-by-RAM.alg =================================================================== --- contrib/benchmark/conf/standard-flush-by-RAM.alg (revision 784670) +++ contrib/benchmark/conf/standard-flush-by-RAM.alg (working copy) @@ -29,13 +29,13 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/standard-highlights-notv.alg =================================================================== --- contrib/benchmark/conf/standard-highlights-notv.alg (revision 784670) +++ contrib/benchmark/conf/standard-highlights-notv.alg (working copy) @@ -28,11 +28,11 @@ doc.term.vector=false doc.term.vector.offsets=false doc.term.vector.positions=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/standard-highlights-tv.alg =================================================================== --- contrib/benchmark/conf/standard-highlights-tv.alg (revision 784670) +++ contrib/benchmark/conf/standard-highlights-tv.alg (working copy) @@ -28,11 +28,11 @@ doc.term.vector=true doc.term.vector.offsets=true doc.term.vector.positions=true -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/standard.alg =================================================================== --- contrib/benchmark/conf/standard.alg (revision 784670) +++ contrib/benchmark/conf/standard.alg (working copy) @@ -28,13 +28,13 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker Index: contrib/benchmark/conf/tokenize.alg =================================================================== --- contrib/benchmark/conf/tokenize.alg (revision 784670) +++ contrib/benchmark/conf/tokenize.alg (working copy) @@ -25,8 +25,8 @@ # ant run-task -Dtask.alg=conf/tokenize.alg # -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker -doc.maker.forever=false +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource +content.source.forever=false # Index: contrib/benchmark/conf/wikipedia-flush-by-RAM.alg =================================================================== --- contrib/benchmark/conf/wikipedia-flush-by-RAM.alg (revision 784670) +++ contrib/benchmark/conf/wikipedia-flush-by-RAM.alg (working copy) @@ -37,7 +37,7 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=5000 +log.step=5000 docs.file=temp/enwiki-20070527-pages-articles.xml Index: contrib/benchmark/conf/wikipedia.alg =================================================================== --- contrib/benchmark/conf/wikipedia.alg (revision 784670) +++ contrib/benchmark/conf/wikipedia.alg (working copy) @@ -33,7 +33,7 @@ doc.stored=true doc.tokenized=true doc.term.vector=false 
-doc.add.log.step=5000 +log.step=5000 docs.file=temp/enwiki-20070527-pages-articles.xml Index: contrib/benchmark/conf/wikipediaOneRound.alg =================================================================== --- contrib/benchmark/conf/wikipediaOneRound.alg (revision 784670) +++ contrib/benchmark/conf/wikipediaOneRound.alg (working copy) @@ -33,7 +33,7 @@ doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=5000 +log.step=5000 docs.file=temp/enwiki-20070527-pages-articles.xml Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java (revision 784670) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java (working copy) @@ -17,9 +17,13 @@ * limitations under the License. */ +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; -import org.apache.lucene.benchmark.byTask.feeds.HTMLParser; import org.apache.lucene.benchmark.byTask.feeds.QueryMaker; import org.apache.lucene.benchmark.byTask.stats.Points; import org.apache.lucene.benchmark.byTask.tasks.ReadTask; @@ -33,11 +37,6 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory; -import java.io.File; -import java.util.HashMap; -import java.util.Iterator; - - /** * Data maintained by a performance test run. *
@@ -62,7 +61,6 @@ private Directory directory; private Analyzer analyzer; private DocMaker docMaker; - private HTMLParser htmlParser; // we use separate (identical) instances for each "read" task type, so each can iterate the quries separately. private HashMap readTaskQueryMaker; @@ -82,14 +80,11 @@ "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance(); // doc maker docMaker = (DocMaker) Class.forName(config.get("doc.maker", - "org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker")).newInstance(); + "org.apache.lucene.benchmark.byTask.feeds.DocMaker")).newInstance(); docMaker.setConfig(config); // query makers readTaskQueryMaker = new HashMap(); qmkrClass = Class.forName(config.get("query.maker","org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker")); - // html parser, used for some doc makers - htmlParser = (HTMLParser) Class.forName(config.get("html.parser","org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser")).newInstance(); - docMaker.setHTMLParser(htmlParser); // index stuff reinit(false); @@ -229,9 +224,7 @@ this.analyzer = analyzer; } - /** - * @return Returns the docMaker. - */ + /** Returns the docMaker. */ public DocMaker getDocMaker() { return docMaker; } @@ -243,7 +236,7 @@ return config; } - public void resetInputs() { + public void resetInputs() throws IOException { docMaker.resetInputs(); Iterator it = readTaskQueryMaker.values().iterator(); while (it.hasNext()) { @@ -271,11 +264,4 @@ return qm; } - /** - * @return Returns the htmlParser. - */ - public HTMLParser getHtmlParser() { - return htmlParser; - } - } Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java (revision 784670) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java (working copy) @@ -1,335 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.benchmark.byTask.utils.Config; -import org.apache.lucene.benchmark.byTask.utils.Format; -import org.apache.lucene.document.DateTools; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; - -import java.io.File; -import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; - - -/** - * Create documents for the test. - * Maintains counters of chars etc. so that sub-classes just need to - * provide textual content, and the create-by-size is handled here. - * - *
- * Config Params (default is in caps): - * doc.stored=true|FALSE+ * Supports the following configuration parameters: + *
files.
+ */
+ protected final void collectFiles(File dir, ArrayList files) {
+ if (!dir.canRead()) {
+ return;
+ }
+
+ File[] dirFiles = dir.listFiles();
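+ // Sort for a deterministic traversal order, so runs over the same directory produce the same document stream.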
+ Arrays.sort(dirFiles);
+ for (int i = 0; i < dirFiles.length; i++) {
+ File file = dirFiles[i];
+ if (file.isDirectory()) {
+ collectFiles(file, files);
+ } else if (file.canRead()) {
+ files.add(file);
+ }
+ }
+ }
+
+ /**
+ * Returns an {@link InputStream} over the requested file. This method
+ * attempts to identify the appropriate {@link InputStream} instance to return
+ * based on the file name (e.g., if it ends with .bz2 or .bzip, return a
+ * 'bzip' {@link InputStream}).
+ */
+ protected InputStream getInputStream(File file) throws IOException {
+ // First, create a FileInputStream, as this will be required by all types.
+ // Wrap with BufferedInputStream for better performance
+ InputStream is = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE);
+
+ String fileName = file.getName();
+ int idx = fileName.lastIndexOf('.');
+ int type = OTHER;
+ if (idx != -1) {
+ Integer typeInt = (Integer) extensionToType.get(fileName.substring(idx));
+ if (typeInt != null) {
+ type = typeInt.intValue();
+ }
+ }
+ switch (type) {
+ case BZIP:
+ try {
+ // According to BZip2CompressorInputStream's code, it reads the first
+ // two file header chars ('B' and 'Z'). It is important to wrap the
+ // underlying input stream with a buffered one since
+ // Bzip2CompressorInputStream uses the read() method exclusively.
+ is = csFactory.createCompressorInputStream("bzip2", is);
+ } catch (CompressorException e) {
+ IOException ioe = new IOException(e.getMessage());
+ ioe.initCause(e);
+ throw ioe;
+ }
+ break;
+ default: // Do nothing, stay with FileInputStream
+ }
+
+ return is;
+ }
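+
+ // Illustrative usage (hypothetical caller, not defined in this patch):
+ // InputStream is = getInputStream(new File("temp/enwiki-20070527-pages-articles.xml.bz2"));
+ // returns a stream that decompresses bzip2 on the fly, while a file with an
+ // unmapped extension comes back as just the BufferedInputStream created above.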
+
+ /**
+ * Returns true if it's time to log a message (depending on verbose and
+ * the number of documents generated).
+ */
+ protected final boolean shouldLog() {
+ return verbose && logStep > 0 && docsCount % logStep == 0;
+ }
+
+ /** Called when reading from this content source is no longer required. */
+ public abstract void close() throws IOException;
+
+ /** Returns the number of bytes generated since last reset. */
+ public final long getBytesCount() { return bytesCount; }
+
+ /** Returns the number of generated documents since last reset. */
+ public final int getDocsCount() { return docsCount; }
+
+ public final Config getConfig() { return config; }
+
+ /** Returns the next {@link DocData} from the content source. */
+ public abstract DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException;
+
+ /** Returns the total number of bytes that were generated by this source. */
+ public final long getTotalBytesCount() { return totalBytesCount; }
+
+ /** Returns the total number of generated documents. */
+ public final int getTotalDocsCount() { return totalDocsCount; }
+
+ /**
+ * Resets the input for this content source, so that the test would behave as
+ * if it was just started, input-wise.
+ * + * NOTE: the default implementation resets the number of bytes and + * documents generated since the last reset, so it's important to call + * super.resetInputs in case you override this method. + */ + public void resetInputs() throws IOException { + bytesCount = 0; + docsCount = 0; + } + + /** + * Sets the {@link Config} for this content source. If you override this + * method, you must call super.setConfig. + */ + public void setConfig(Config config) { + this.config = config; + forever = config.get("content.source.forever", true); + logStep = config.get("content.source.log.step", 0); + verbose = config.get("content.source.verbose", false); + } + +} Property changes on: contrib\benchmark\src\java\org\apache\lucene\benchmark\byTask\feeds\ContentSource.java ___________________________________________________________________ Added: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (revision 784670) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (working copy) @@ -30,14 +30,7 @@ */ public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser { - public DemoHTMLParser () { - } - - /* - * (non-Javadoc) - * @see org.apache.lucene.benchmark.byTask.feeds.HTMLParser#parse(java.lang.String, java.util.Date, java.io.Reader, java.text.DateFormat) - */ - public DocData parse(String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException { + public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException { org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader); // title @@ -64,16 +57,22 @@ date = new Date(); // now } } - - return new DocData(name, bodyBuf.toString(), title, props, date); + + docData.clear(); + docData.setName(name); + docData.setBody(bodyBuf.toString()); + docData.setTitle(title); + docData.setProps(props); + docData.setDate(date); + return docData; } /* * (non-Javadoc) * @see org.apache.lucene.benchmark.byTask.feeds.HTMLParser#parse(java.lang.String, java.util.Date, java.lang.StringBuffer, java.text.DateFormat) */ - public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException { - return parse(name, date, new StringReader(inputText.toString()), dateFormat); + public DocData parse(DocData docData, String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException { + return parse(docData, name, date, new StringReader(inputText.toString()), dateFormat); } } Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java (revision 0) @@ -0,0 +1,246 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.benchmark.byTask.utils.Config; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileFilter; +import java.io.FileReader; +import java.io.IOException; +import java.text.DateFormat; +import java.text.ParsePosition; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.Date; +import java.util.Locale; +import java.util.Stack; + +/** + * A {@link ContentSource} using the Dir collection for its input. Supports + * the following configuration parameters (on top of {@link ContentSource}): + *
reuseFields was set to true, then it attempts to reuse a
+ * Field instance. If such a field does not exist, it creates a new one.
+ */
+ Field getField(String name, Store store, Index index, TermVector termVector) {
+ if (!reuseFields) {
+ return new Field(name, "", store, index, termVector);
+ }
+
+ Field f = (Field) fields.get(name);
+ if (f == null) {
+ f = new Field(name, "", store, index, termVector);
+ fields.put(name, f);
+ }
+ return f;
+ }
+ }
- /** Reset inputs so that the test run would behave, input wise, as if it just started. */
- public void resetInputs();
+ private int numDocsCreated = 0;
+ private boolean storeBytes = false;
+
+ // leftovers are thread local, because it is unsafe to share residues between threads
+ private ThreadLocal leftovr = new ThreadLocal();
+ private ThreadLocal docState = new ThreadLocal();
+
+ public static final String BODY_FIELD = "body";
+ public static final String TITLE_FIELD = "doctitle";
+ public static final String DATE_FIELD = "docdate";
+ public static final String ID_FIELD = "docid";
+ public static final String BYTES_FIELD = "bytes";
+ public static final String NAME_FIELD = "docname";
+
+ protected Config config;
+
+ protected Store storeVal = Store.NO;
+ protected Index indexVal = Index.ANALYZED;
+ protected TermVector termVecVal = TermVector.NO;
- /** Return how many real unique texts are available, 0 if not applicable. */
- public int numUniqueTexts();
+ protected ContentSource source;
+ protected boolean reuseFields;
+ protected DocState localDocState;
- /** Return total bytes of all available unique texts, 0 if not applicable */
- public long numUniqueBytes();
+ private int lastPrintedNumUniqueTexts = 0;
- /** Return number of docs made since last reset. */
- public int getCount();
+ private long lastPrintedNumUniqueBytes = 0;
- /** Return total byte size of docs made since last reset. */
- public long getByteCount();
+ private int printNum = 0;
- /** Print some statistics on docs available/added/etc. */
- public void printDocStatistics();
+ // create a doc
+ // use only part of the body, modify it to keep the rest (or use all if size==0).
+ // reset the docdata properties so they are not added more than once.
+ private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
+ int docid = incrNumDocsCreated();
+ DocState ds = reuseFields ? getDocState() : localDocState;
+ Document doc = reuseFields ? ds.doc : new Document();
+ doc.clear();
+
+ // Set ID_FIELD
+ Field idField = ds.getField(ID_FIELD, storeVal, indexVal, termVecVal);
+ idField.setValue("doc" + docid);
+ doc.add(idField);
+
+ // Set NAME_FIELD
+ String name = docData.getName();
+ if (name == null) name = "";
+ name = cnt < 0 ? name : name + "_" + cnt;
+ Field nameField = ds.getField(NAME_FIELD, storeVal, indexVal, termVecVal);
+ nameField.setValue(name);
+ doc.add(nameField);
+
+ // Set DATE_FIELD
+ String date = docData.getDate();
+ if (date == null) {
+ date = "";
+ }
+ Field dateField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
+ dateField.setValue(date);
+ doc.add(dateField);
+
+ // Set TITLE_FIELD
+ String title = docData.getTitle();
+ Field titleField = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
+ titleField.setValue(title == null ? "" : title);
+ doc.add(titleField);
+
+ String body = docData.getBody();
+ if (body != null && body.length() > 0) {
+ String bdy;
+ if (size <= 0 || size >= body.length()) {
+ bdy = body; // use all
+ docData.setBody(""); // nothing left
+ } else {
+ // attempt not to break words - if whitespace found within next 20 chars...
+ for (int n = size - 1; n < size + 20 && n < body.length(); n++) {
+ if (Character.isWhitespace(body.charAt(n))) {
+ size = n;
+ break;
+ }
+ }
+ bdy = body.substring(0, size); // use part
+ docData.setBody(body.substring(size)); // some left
+ }
+ Field bodyField = ds.getField(BODY_FIELD, storeVal, indexVal, termVecVal);
+ bodyField.setValue(bdy);
+ doc.add(bodyField);
+
+ if (storeBytes) {
+ Field bytesField = ds.getField(BYTES_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
+ bytesField.setValue(bdy.getBytes("UTF-8"));
+ doc.add(bytesField);
+ }
+ }
- /** Set the html parser to use, when appropriate */
- public void setHTMLParser(HTMLParser htmlParser);
+ Properties props = docData.getProps();
+ if (props != null) {
+ for (Iterator iterator = props.entrySet().iterator(); iterator.hasNext();) {
+ Entry entry = (Entry) iterator.next();
+ Field f = ds.getField((String) entry.getKey(), storeVal, indexVal, termVecVal);
+ f.setValue((String) entry.getValue());
+ doc.add(f);
+ }
+ docData.setProps(null);
+ }
+ //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
+ return doc;
+ }
+
+ private void resetLeftovers() {
+ leftovr.set(null);
+ }
+
+ protected DocState getDocState() {
+ DocState ds = (DocState) docState.get();
+ if (ds == null) {
+ ds = new DocState(true, storeVal, indexVal, termVecVal);
+ docState.set(ds);
+ }
+ return ds;
+ }
+
+ protected synchronized int incrNumDocsCreated() {
+ return numDocsCreated++;
+ }
+
+ /**
+ * Closes the {@link DocMaker}. The base implementation closes the
+ * {@link ContentSource}, and it can be overridden to do more work (but make
+ * sure to call super.close()).
+ */
+ public void close() throws IOException {
+ source.close();
+ }
- /** Returns the htmlParser. */
- public HTMLParser getHtmlParser();
+ /**
+ * Returns the number of bytes generated by the content source since last
+ * reset.
+ */
+ public synchronized long getBytesCount() {
+ return source.getBytesCount();
+ }
-}
\ No newline at end of file
+ /**
+ * Returns the total number of bytes that were generated by the content source
+ * defined for this doc maker.
+ */
+ public long getTotalBytesCount() {
+ return source.getTotalBytesCount();
+ }
+
+ /**
+ * Creates a {@link Document} object ready for indexing. This method uses the
+ * {@link ContentSource} to get the next document from the source, and creates
+ * a {@link Document} object from the returned fields. If
+ * reuseFields was set to true, it will reuse {@link Document}
+ * and {@link Field} instances.
+ */
+ public Document makeDocument() throws Exception {
+ resetLeftovers();
+ DocData docData = source.getNextDocData(reuseFields ? getDocState().docData : localDocState.docData);
+ Document doc = createDocument(docData, 0, -1);
+ return doc;
+ }
+
+ /**
+ * Same as {@link #makeDocument()}, only this method creates a document of the
+ * given size (in characters of body text).
+ */
+ public Document makeDocument(int size) throws Exception {
+ LeftOver lvr = (LeftOver) leftovr.get();
+ if (lvr == null || lvr.docdata == null || lvr.docdata.getBody() == null
+ || lvr.docdata.getBody().length() == 0) {
+ resetLeftovers();
+ }
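+ // Use any leftover body kept by the previous call; otherwise pull a fresh
+ // DocData from the source, concatenating bodies below until at least
+ // 'size' chars of body text are available.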
+ DocData docData = reuseFields ? getDocState().docData : localDocState.docData;
+ DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata);
+ int cnt = (lvr == null ? 0 : lvr.cnt);
+ while (dd.getBody() == null || dd.getBody().length() < size) {
+ DocData dd2 = dd;
+ dd = source.getNextDocData(new DocData());
+ cnt = 0;
+ dd.setBody(dd2.getBody() + dd.getBody());
+ }
+ Document doc = createDocument(dd, size, cnt);
+ if (dd.getBody() == null || dd.getBody().length() == 0) {
+ resetLeftovers();
+ } else {
+ if (lvr == null) {
+ lvr = new LeftOver();
+ leftovr.set(lvr);
+ }
+ lvr.docdata = dd;
+ lvr.cnt = ++cnt;
+ }
+ return doc;
+ }
+
+ public void printDocStatistics() {
+ boolean print = false;
+ String col = " ";
+ StringBuffer sb = new StringBuffer();
+ String newline = System.getProperty("line.separator");
+ sb.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline);
+ int nut = source.getTotalDocsCount();
+ if (nut > lastPrintedNumUniqueTexts) {
+ print = true;
+ sb.append("total count of unique texts: ").append(Format.format(0,nut,col)).append(newline);
+ lastPrintedNumUniqueTexts = nut;
+ }
+ long nub = getTotalBytesCount();
+ if (nub > lastPrintedNumUniqueBytes) {
+ print = true;
+ sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline);
+ lastPrintedNumUniqueBytes = nub;
+ }
+ if (source.getDocsCount() > 0) {
+ print = true;
+ sb.append("num docs added since last inputs reset: ").append(Format.format(0,source.getDocsCount(),col)).append(newline);
+ sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getBytesCount(),col)).append(newline);
+ }
+ if (print) {
+ System.out.println(sb.append(newline).toString());
+ printNum++;
+ }
+ }
+
+ /** Reset inputs so that the test run would behave, input wise, as if it just started. */
+ public synchronized void resetInputs() throws IOException {
+ printDocStatistics();
+ // Re-initialize, since per-round properties may have changed.
+ setConfig(config);
+ source.resetInputs();
+ numDocsCreated = 0;
+ resetLeftovers();
+ }
+
+ /** Set the configuration parameters of this doc maker. */
+ public void setConfig(Config config) {
+ this.config = config;
+ try {
+ String sourceClass = config.get("content.source", "org.apache.lucene.benchmark.byTask.feeds.SingleDocSource");
+ source = (ContentSource) Class.forName(sourceClass).newInstance();
+ source.setConfig(config);
+ } catch (Exception e) {
+ // Should not get here. Throw runtime exception.
+ throw new RuntimeException(e);
+ }
+
+ boolean stored = config.get("doc.stored", false);
+ boolean tokenized = config.get("doc.tokenized", true);
+ boolean termVec = config.get("doc.term.vector", false);
+ storeVal = (stored ? Field.Store.YES : Field.Store.NO);
+ indexVal = (tokenized ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED);
+ boolean termVecPositions = config.get("doc.term.vector.positions", false);
+ boolean termVecOffsets = config.get("doc.term.vector.offsets", false);
+ if (termVecPositions && termVecOffsets) {
+ termVecVal = TermVector.WITH_POSITIONS_OFFSETS;
+ } else if (termVecPositions) {
+ termVecVal = TermVector.WITH_POSITIONS;
+ } else if (termVecOffsets) {
+ termVecVal = TermVector.WITH_OFFSETS;
+ } else if (termVec) {
+ termVecVal = TermVector.YES;
+ } else {
+ termVecVal = TermVector.NO;
+ }
+ storeBytes = config.get("doc.store.body.bytes", false);
+
+ reuseFields = config.get("doc.reuse.fields", true);
+ if (!reuseFields) {
+ localDocState = new DocState(false, storeVal, indexVal, termVecVal);
+ } else {
+ // In a multi-round run, it is important to reset DocState since settings
+ // of fields may change between rounds, and this is the only way to reset
+ // the cache of all threads.
+ docState = new ThreadLocal();
+ }
+ }
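+
+ // An illustrative .alg fragment combining the properties read above (values
+ // are examples only):
+ // content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+ // doc.stored=true
+ // doc.tokenized=true
+ // doc.term.vector=false
+ // doc.store.body.bytes=false
+ // doc.reuse.fields=true
+ // log.step=2000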
+
+}
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (revision 0)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (revision 0)
@@ -0,0 +1,294 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+import org.xml.sax.helpers.DefaultHandler;
+import org.xml.sax.helpers.XMLReaderFactory;
+
+/**
+ * A {@link ContentSource} which reads the English Wikipedia dump. You can read
+ * the .bz2 file directly (it will be decompressed on the fly). Config
+ * properties:
+ * + * Config properties: + *
- * Config properties:
+ * Supports the following configuration parameters (on top of + * {@link ContentSource}): + *
- * Config properties:
doc.add.log.step.
* doc.add.log.step - indicating how often
- * an "added N docs" message should be logged.
- */
- public static final int DEFAULT_ADD_DOC_LOG_STEP = 500;
-
public AddDocTask(PerfRunData runData) {
super(runData);
}
- private int logStep = -1;
private int docSize = 0;
- int count = 0;
// volatile data passed between setup(), doLogic(), tearDown().
private Document doc = null;
- /*
- * (non-Javadoc)
- * @see PerfTask#setup()
- */
public void setup() throws Exception {
super.setup();
DocMaker docMaker = getRunData().getDocMaker();
@@ -62,33 +47,20 @@
}
}
- /* (non-Javadoc)
- * @see PerfTask#tearDown()
- */
public void tearDown() throws Exception {
- log(++count);
doc = null;
super.tearDown();
}
+ protected String getLogMessage(int recsCount) {
+ return "added " + recsCount + " docs";
+ }
+
public int doLogic() throws Exception {
getRunData().getIndexWriter().addDocument(doc);
return 1;
}
- protected void log (int count) {
- if (logStep<0) {
- // init once per instance
- logStep = getRunData().getConfig().get("doc.add.log.step",DEFAULT_ADD_DOC_LOG_STEP);
- }
- if (logStep>0 && (count%logStep)==0) {
- double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
- NumberFormat nf = NumberFormat.getInstance();
- nf.setMaximumFractionDigits(2);
- System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs");
- }
- }
-
/**
* Set the params (docSize only)
* @param params docSize, or 0 for no limit.
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourceTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourceTask.java (revision 0)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourceTask.java (revision 0)
@@ -0,0 +1,67 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.ContentSource;
+import org.apache.lucene.benchmark.byTask.feeds.DocData;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
+/**
+ * Consumes a {@link org.apache.lucene.benchmark.byTask.feeds.ContentSource}.
+ * Supports the following parameters:
+ *
+ * - content.source - the content source to use. (mandatory)
+ *
+ */
+public class ConsumeContentSourceTask extends PerfTask {
+
+ private ContentSource source;
+ private DocData dd = new DocData();
+
+ public ConsumeContentSourceTask(PerfRunData runData) {
+ super(runData);
+ Config config = runData.getConfig();
+ String sourceClass = config.get("content.source", null);
+ if (sourceClass == null) {
+ throw new IllegalArgumentException("content.source must be defined");
+ }
+ try {
+ source = (ContentSource) Class.forName(sourceClass).newInstance();
+ source.setConfig(config);
+ source.resetInputs();
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ protected String getLogMessage(int recsCount) {
+ return "read " + recsCount + " documents from the content source";
+ }
+
+ public void close() throws Exception {
+ source.close();
+ super.close();
+ }
+
+ public int doLogic() throws Exception {
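+ // Pull the next document from the source and discard it; this task only
+ // measures how fast the ContentSource produces documents.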
+ dd = source.getNextDocData(dd);
+ return 1;
+ }
+
+}
Property changes on: contrib\benchmark\src\java\org\apache\lucene\benchmark\byTask\tasks\ConsumeContentSourceTask.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteDocTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteDocTask.java (revision 784670)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteDocTask.java (working copy)
@@ -22,7 +22,7 @@
/**
* Delete a document by docid.
* Other side effects: none.
- * Relevant properties: doc.delete.log.step , doc.delete.step.
+ * Relevant properties: doc.delete.step, delete.log.step.
* If no docid param is supplied, deletes doc with id = last-deleted-doc + doc.delete.step.
* Takes optional param: document id.
*/
@@ -33,19 +33,16 @@
*/
public static final int DEFAULT_DOC_DELETE_STEP = 8;
- /**
- * Default value for property doc.delete.log.step - indicating how often
- * an "deleted N docs" message should be logged.
- */
- public static final int DEFAULT_DELETE_DOC_LOG_STEP = 500;
-
public DeleteDocTask(PerfRunData runData) {
super(runData);
+ // Override log.step, which is read by PerfTask
+ int deleteLogStep = runData.getConfig().get("delete.log.step", -1);
+ if (deleteLogStep != -1) {
+ logStep = deleteLogStep;
+ }
}
- private int logStep = -1;
private int deleteStep = -1;
- private static int numDeleted = 0;
private static int lastDeleted = -1;
private int docid = -1;
@@ -62,10 +59,6 @@
*/
public void setup() throws Exception {
super.setup();
- // one time static initializations
- if (logStep<0) {
- logStep = getRunData().getConfig().get("doc.delete.log.step",DEFAULT_DELETE_DOC_LOG_STEP);
- }
if (deleteStep<0) {
deleteStep = getRunData().getConfig().get("doc.delete.step",DEFAULT_DOC_DELETE_STEP);
}
@@ -73,19 +66,9 @@
docid = (byStep ? lastDeleted + deleteStep : docid);
}
- /* (non-Javadoc)
- * @see PerfTask#tearDown()
- */
- public void tearDown() throws Exception {
- log(++numDeleted);
- super.tearDown();
+ protected String getLogMessage(int recsCount) {
+ return "deleted " + recsCount + " docs, last deleted: " + lastDeleted;
}
-
- private void log (int count) {
- if (logStep>0 && (count%logStep)==0) {
- System.out.println("--> processed (delete) "+count+" docs, last deleted: "+lastDeleted);
- }
- }
/**
* Set the params (docid only)
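For example, the compound-penalty.alg change above performs exactly this migration: doc.delete.log.step=100 becomes delete.log.step=100, which the constructor above now reads to override the generic log.step.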
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java (revision 784670)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java (working copy)
@@ -17,54 +17,80 @@
* limitations under the License.
*/
+import java.text.NumberFormat;
+
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.stats.Points;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
+import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.Format;
/**
- * A (abstract) task to be tested for performance.
- *
- * Every performance task extends this class, and provides its own doLogic() method,
- * which performss the actual task.
- *
- * Tasks performing some work that should be measured for the task, can overide setup() and/or tearDown() and
- * placed that work there.
- *
+ * An abstract task to be tested for performance.
+ * Every performance task extends this class, and provides its own
+ * {@link #doLogic()} method, which performs the actual task.
+ * Tasks that perform work which should be measured can override
+ * {@link #setup()} and/or {@link #tearDown()} and place that work there.
* Relevant properties: task.max.depth.log.
*/
public abstract class PerfTask implements Cloneable {
+ private static final int DEFAULT_LOG_STEP = 1000;
+
private PerfRunData runData;
// propeties that all tasks have
private String name;
private int depth = 0;
+ protected int logStep;
+ private int logStepCount = 0;
private int maxDepthLogStart = 0;
private boolean disableCounting = false;
protected String params = null;
protected static final String NEW_LINE = System.getProperty("line.separator");
- /**
- * Should not be used externally
- */
+ /** Should not be used externally */
private PerfTask() {
- name = Format.simpleName(getClass());
+ name = Format.simpleName(getClass());
if (name.endsWith("Task")) {
- name = name.substring(0,name.length()-4);
+ name = name.substring(0, name.length() - 4);
}
}
+ /**
+ * @deprecated will be removed in 3.0. Checks if there are any obsolete
+ * settings, like doc.add.log.step and doc.delete.log.step, and
+ * alerts the user.
+ */
+ private void checkObsoleteSettings(Config config) {
+ if (config.get("doc.add.log.step", null) != null) {
+ throw new RuntimeException("doc.add.log.step is not supported anymore. " +
+ "Use log.step and refer to CHANGES to read on the recent API changes " +
+ "done to Benchmark's DocMaker and Task-based logging.");
+ }
+
+ if (config.get("doc.delete.log.step", null) != null) {
+ throw new RuntimeException("doc.delete.log.step is not supported anymore. " +
+ "Use delete.log.step and refer to CHANGES to read on the recent API changes " +
+ "done to Benchmark's DocMaker and Task-based logging.");
+ }
+ }
+
public PerfTask(PerfRunData runData) {
this();
this.runData = runData;
- this.maxDepthLogStart = runData.getConfig().get("task.max.depth.log",0);
+ Config config = runData.getConfig();
+ this.maxDepthLogStart = config.get("task.max.depth.log",0);
+ logStep = config.get("log.step", DEFAULT_LOG_STEP);
+ // A log.step <= 0 turns logging off: setting logStep to Integer.MAX_VALUE
+ // here avoids an 'if (logStep > 0)' check in tearDown().
+ if (logStep <= 0) {
+ logStep = Integer.MAX_VALUE;
+ }
+ checkObsoleteSettings(config);
}
- /* (non-Javadoc)
- * @see java.lang.Object#clone()
- */
protected Object clone() throws CloneNotSupportedException {
// tasks having non primitive data structures should overide this.
// otherwise parallel running of a task sequence might not run crrectly.
@@ -173,6 +199,10 @@
return maxDepthLogStart;
}
+ protected String getLogMessage(int recsCount) {
+ return "processed " + recsCount + " records";
+ }
+
/**
* Tasks that should never log at start can overide this.
* @return true if this task should never log when it start.
@@ -207,7 +237,14 @@
* Notice that higher level (sequence) tasks containing this task would then
* measure larger time than the sum of their contained tasks.
*/
- public void tearDown () throws Exception {
+ public void tearDown() throws Exception {
+ if (++logStepCount % logStep == 0) {
+ double time = (System.currentTimeMillis() - runData.getStartTimeMillis()) / 1000.0;
+ NumberFormat nf = NumberFormat.getInstance();
+ nf.setMaximumFractionDigits(2);
+ System.out.println(nf.format(time) + " sec --> "
+ + Thread.currentThread().getName() + " " + getLogMessage(logStepCount));
+ }
}
/**
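With log.step=2000, for instance, the tearDown() above prints a line of the form (illustrative): 12.5 sec --> Thread-0 added 2000 docs, where the trailing text comes from each task's getLogMessage() override.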
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (revision 784670)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (working copy)
@@ -17,58 +17,44 @@
* limitations under the License.
*/
-import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
-import org.apache.lucene.analysis.Token;
+import java.io.Reader;
+import java.util.List;
+
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import java.text.NumberFormat;
-import java.io.Reader;
-import java.util.List;
-
/**
* Simple task to test performance of tokenizers. It just
* creates a token stream for each field of the document and
* reads all tokens out of that stream.
- *
- * Relevant properties: doc.tokenize.log.step.
*/
public class ReadTokensTask extends PerfTask {
- /**
- * Default value for property doc.tokenize.log.step - indicating how often
- * an "added N docs / M tokens" message should be logged.
- */
- public static final int DEFAULT_DOC_LOG_STEP = 500;
-
public ReadTokensTask(PerfRunData runData) {
super(runData);
}
- private int logStep = -1;
- int count = 0;
- int totalTokenCount = 0;
+ private int totalTokenCount = 0;
// volatile data passed between setup(), doLogic(), tearDown().
private Document doc = null;
- /*
- * (non-Javadoc)
- * @see PerfTask#setup()
- */
public void setup() throws Exception {
super.setup();
DocMaker docMaker = getRunData().getDocMaker();
doc = docMaker.makeDocument();
}
- /* (non-Javadoc)
- * @see PerfTask#tearDown()
- */
+ protected String getLogMessage(int recsCount) {
+ return "read " + recsCount + " docs; " + totalTokenCount + " tokens";
+ }
+
public void tearDown() throws Exception {
- log(++count);
doc = null;
super.tearDown();
}
@@ -117,19 +103,6 @@
return tokenCount;
}
- private void log(int count) {
- if (logStep<0) {
- // init once per instance
- logStep = getRunData().getConfig().get("doc.tokenize.log.step", DEFAULT_DOC_LOG_STEP);
- }
- if (logStep>0 && (count%logStep)==0) {
- double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
- NumberFormat nf = NumberFormat.getInstance();
- nf.setMaximumFractionDigits(2);
- System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs" + "; " + totalTokenCount + " tokens");
- }
- }
-
/* Simple StringReader that can be reset to a new string;
* we use this when tokenizing the string value from a
* Field. */
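The task's doLogic() (largely unchanged by this patch and only partially visible in the hunk) tokenizes each field and drains the resulting stream, accumulating into totalTokenCount. A rough sketch of that counting idea, using the reusable-Token API of this period (the helper class below is illustrative, not part of the patch):

  import java.io.StringReader;

  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenStream;

  // Illustrative only: count the tokens an analyzer produces for one field's
  // text, roughly what ReadTokensTask accumulates per document.
  public class TokenCountSketch {
    public static int countTokens(Analyzer analyzer, String field, String text)
        throws Exception {
      TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
      final Token reusable = new Token();
      int count = 0;
      while (stream.next(reusable) != null) { // reusable-token API of this era
        count++;
      }
      stream.close();
      return count;
    }
  }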
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (revision 784670)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (working copy)
@@ -62,6 +62,7 @@
for(int i=0;iOther side effects: none.
- *
Relevant properties: doc.add.log.step.
*
Takes optional param: document size.
*/
public class UpdateDocTask extends PerfTask {
@@ -38,17 +34,11 @@
super(runData);
}
- private int logStep = -1;
private int docSize = 0;
- int count = 0;
// volatile data passed between setup(), doLogic(), tearDown().
private Document doc = null;
- /*
- * (non-Javadoc)
- * @see PerfTask#setup()
- */
public void setup() throws Exception {
super.setup();
DocMaker docMaker = getRunData().getDocMaker();
@@ -59,38 +49,24 @@
}
}
- /* (non-Javadoc)
- * @see PerfTask#tearDown()
- */
public void tearDown() throws Exception {
- log(++count);
doc = null;
super.tearDown();
}
public int doLogic() throws Exception {
- final String docID = doc.get(BasicDocMaker.ID_FIELD);
+ final String docID = doc.get(DocMaker.ID_FIELD);
if (docID == null) {
throw new IllegalStateException("document must define the docid field");
}
- getRunData().getIndexWriter().updateDocument(new Term(BasicDocMaker.ID_FIELD, docID),
- doc);
+ getRunData().getIndexWriter().updateDocument(new Term(DocMaker.ID_FIELD, docID), doc);
return 1;
}
- private void log (int count) {
- if (logStep<0) {
- // init once per instance
- logStep = getRunData().getConfig().get("doc.add.log.step",AddDocTask.DEFAULT_ADD_DOC_LOG_STEP);
- }
- if (logStep>0 && (count%logStep)==0) {
- double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
- NumberFormat nf = NumberFormat.getInstance();
- nf.setMaximumFractionDigits(2);
- System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (update) "+count+" docs");
- }
+ protected String getLogMessage(int recsCount) {
+ return "updated " + recsCount + " docs";
}
-
+
/**
* Set the params (docSize only)
* @param params docSize, or 0 for no limit.
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (revision 784670)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (working copy)
@@ -25,7 +25,6 @@
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
@@ -45,23 +44,13 @@
* bzip.compression - whether the output should be bzip-compressed. This is
* recommended when the output file is expected to be large. (optional, default:
* false).
- * doc.writeline.log.step - controls how many records to process before
- * logging the status of the task. NOTE: to disable logging, set this
- * value to 0 or negative. (optional, default:1000).
*
*/
public class WriteLineDocTask extends PerfTask {
- /**
- * Default value for property doc.add.log.step - indicating how often
- * an "added N docs" message should be logged.
- */
- public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000;
public final static char SEP = '\t';
- private int logStep = -1;
private int docSize = 0;
- int count = 0;
private BufferedWriter lineFileOut = null;
private DocMaker docMaker;
@@ -93,30 +82,23 @@
}
lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16);
docMaker = runData.getDocMaker();
- logStep = config.get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
- // To avoid the check 'if (logStep > 0)' in log(). This effectively turns
- // logging off.
- if (logStep <= 0) {
- logStep = Integer.MAX_VALUE;
- }
}
- public void tearDown() throws Exception {
- log(++count);
- super.tearDown();
+ protected String getLogMessage(int recsCount) {
+ return "Wrote " + recsCount + " line docs";
}
-
+
public int doLogic() throws Exception {
Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();
- Field f = doc.getField(BasicDocMaker.BODY_FIELD);
+ Field f = doc.getField(DocMaker.BODY_FIELD);
String body = f != null ? f.stringValue().replace('\t', ' ') : null;
if (body != null) {
- f = doc.getField(BasicDocMaker.TITLE_FIELD);
+ f = doc.getField(DocMaker.TITLE_FIELD);
String title = f != null ? f.stringValue().replace('\t', ' ') : "";
- f = doc.getField(BasicDocMaker.DATE_FIELD);
+ f = doc.getField(DocMaker.DATE_FIELD);
String date = f != null ? f.stringValue().replace('\t', ' ') : "";
lineFileOut.write(title, 0, title.length());
@@ -129,17 +111,6 @@
return 1;
}
- private void log(int count) {
- // logStep is initialized in the ctor to a positive value. If the config
- // file indicates no logging, or contains an invalid value, logStep is init
- // to Integer.MAX_VALUE, so that logging will not occur (at least for the
- // first Integer.MAX_VALUE records).
- if (count % logStep == 0) {
- System.out.println("--> " + Thread.currentThread().getName()
- + " processed (write line) " + count + " docs");
- }
- }
-
public void close() throws Exception {
lineFileOut.close();
super.close();
@@ -156,9 +127,6 @@
docSize = (int) Float.parseFloat(params);
}
- /* (non-Javadoc)
- * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams()
- */
public boolean supportsParams() {
return true;
}
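For reference, doLogic() joins title, date and body with SEP ('\t'), one document per line, replacing any embedded tabs with spaces so the line stays parseable by LineDocMaker. A simplified sketch of composing such a line (the helper class is assumed, for illustration):

  // Illustrative sketch of the one-line-per-document format WriteLineDocTask
  // emits: title \t date \t body, with embedded tabs replaced by spaces.
  public class LineDocFormatSketch {
    public static final char SEP = '\t';

    public static String toLine(String title, String date, String body) {
      StringBuffer line = new StringBuffer();
      line.append(title.replace('\t', ' ')).append(SEP);
      line.append(date.replace('\t', ' ')).append(SEP);
      line.append(body.replace('\t', ' '));
      return line.toString();
    }
  }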
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.java (revision 0)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.java (revision 0)
@@ -0,0 +1,173 @@
+package org.apache.lucene.benchmark.byTask.utils;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Implements a {@link Reader} over a {@link StringBuffer} instance. Although
+ * one can use {@link java.io.StringReader} by passing it
+ * {@link StringBuffer#toString()}, it is better to use this class, as it
+ * doesn't mark the passed-in {@link StringBuffer} as shared (which will cause
+ * inner char[] allocations at the next append() attempt).
+ * Notes:
+ *
+ * - This implementation assumes the underlying {@link StringBuffer} is not
+ *   changed during the use of this {@link Reader} implementation.
+ *
+ * - This implementation is thread-safe.
+ *
+ * - The implementation looks very much like {@link java.io.StringReader} (for
+ *   the right reasons).
+ *
+ * - If one wants to reuse that instance, then the following needs to be done:
+ *
+ * StringBuffer sb = new StringBuffer("some text");
+ * Reader reader = new StringBufferReader(sb);
+ * ... read from reader - don't close it! ...
+ * sb.setLength(0);
+ * sb.append("some new text");
+ * reader.reset();
+ * ... read the new string from the reader ...
+ *
+ *
+ */
+public class StringBufferReader extends Reader {
+
+ // TODO (3.0): change to StringBuilder (including the name of the class)
+
+ // The StringBuffer to read from.
+ private StringBuffer sb;
+
+ // The length of 'sb'.
+ private int length;
+
+ // The next position to read from the StringBuffer.
+ private int next = 0;
+
+ // The mark position. The default value 0 means the start of the text.
+ private int mark = 0;
+
+ public StringBufferReader(StringBuffer sb) {
+ set(sb);
+ }
+
+ /** Check to make sure that the stream has not been closed. */
+ private void ensureOpen() throws IOException {
+ if (sb == null) {
+ throw new IOException("Stream has already been closed");
+ }
+ }
+
+ public void close() {
+ synchronized (lock) {
+ sb = null;
+ }
+ }
+
+ /**
+ * Mark the present position in the stream. Subsequent calls to reset() will
+ * reposition the stream to this point.
+ *
+ * @param readAheadLimit Limit on the number of characters that may be read
+ * while still preserving the mark. Because the stream's input comes
+ * from a StringBuffer, there is no actual limit, so this argument
+ * must not be negative, but is otherwise ignored.
+ * @exception IllegalArgumentException If readAheadLimit is < 0
+ * @exception IOException If an I/O error occurs
+ */
+ public void mark(int readAheadLimit) throws IOException {
+ if (readAheadLimit < 0) {
+ throw new IllegalArgumentException("Read-ahead limit cannot be negative: " + readAheadLimit);
+ }
+ synchronized (lock) {
+ ensureOpen();
+ mark = next;
+ }
+ }
+
+ public boolean markSupported() {
+ return true;
+ }
+
+ public int read() throws IOException {
+ synchronized (lock) {
+ ensureOpen();
+ return next >= length ? -1 : sb.charAt(next++);
+ }
+ }
+
+ public int read(char cbuf[], int off, int len) throws IOException {
+ synchronized (lock) {
+ ensureOpen();
+
+ // Validate parameters
+ if (off < 0 || off > cbuf.length || len < 0 || off + len > cbuf.length) {
+ throw new IndexOutOfBoundsException("off=" + off + " len=" + len + " cbuf.length=" + cbuf.length);
+ }
+
+ if (len == 0) {
+ return 0;
+ }
+
+ if (next >= length) {
+ return -1;
+ }
+
+ int n = Math.min(length - next, len);
+ sb.getChars(next, next + n, cbuf, off);
+ next += n;
+ return n;
+ }
+ }
+
+ public boolean ready() throws IOException {
+ synchronized (lock) {
+ ensureOpen();
+ return true;
+ }
+ }
+
+ public void reset() throws IOException {
+ synchronized (lock) {
+ ensureOpen();
+ next = mark;
+ length = sb.length();
+ }
+ }
+
+ public void set(StringBuffer sb) {
+ synchronized (lock) {
+ this.sb = sb;
+ length = sb.length();
+ }
+ }
+
+ public long skip(long ns) throws IOException {
+ synchronized (lock) {
+ ensureOpen();
+ if (next >= length) {
+ return 0;
+ }
+
+ // Bound skip by beginning and end of the source
+ long n = Math.min(length - next, ns);
+ n = Math.max(-next, n);
+ next += n;
+ return n;
+ }
+ }
+
+}
Property changes on: contrib\benchmark\src\java\org\apache\lucene\benchmark\byTask\utils\StringBufferReader.java
___________________________________________________________________
Added: svn:eol-style
+ native
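The reuse cycle spelled out in the class javadoc avoids StringBuffer.toString(), which marks the buffer's internal char[] as shared and forces a fresh allocation on the next append(). A small usage sketch of that cycle (the buffer contents are arbitrary):

  import org.apache.lucene.benchmark.byTask.utils.StringBufferReader;

  // Illustrative usage: one Reader instance serves several buffer refills.
  public class StringBufferReaderDemo {
    public static void main(String[] args) throws Exception {
      StringBuffer sb = new StringBuffer("first document text");
      StringBufferReader reader = new StringBufferReader(sb);

      char[] buf = new char[64];
      int n = reader.read(buf, 0, buf.length); // drains "first document text"
      System.out.println(new String(buf, 0, n));

      sb.setLength(0);                  // refill the same buffer...
      sb.append("second document text");
      reader.reset();                   // ...rewind and pick up the new length
      n = reader.read(buf, 0, buf.length);
      System.out.println(new String(buf, 0, n));
    }
  }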
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java (revision 784670)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java (working copy)
@@ -17,18 +17,17 @@
* limitations under the License.
*/
-import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Properties;
+
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.Properties;
-
/**
* Extract the downloaded Wikipedia dump into separate files for indexing.
*/
@@ -51,7 +50,6 @@
}
}
-
public File directory(int count, File directory) {
if (directory == null) {
directory = outputDir;
@@ -99,7 +97,8 @@
long start = System.currentTimeMillis();
try {
while ((doc = docMaker.makeDocument()) != null) {
- create(doc.get(BasicDocMaker.ID_FIELD), doc.get(BasicDocMaker.TITLE_FIELD), doc.get(BasicDocMaker.DATE_FIELD), doc.get(BasicDocMaker.BODY_FIELD));
+ create(doc.get(DocMaker.ID_FIELD), doc.get(DocMaker.TITLE_FIELD), doc
+ .get(DocMaker.DATE_FIELD), doc.get(DocMaker.BODY_FIELD));
}
} catch (NoMoreDataException e) {
//continue
Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
===================================================================
--- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (revision 784670)
+++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (working copy)
@@ -17,6 +17,7 @@
package org.apache.lucene.benchmark.byTask;
+import java.io.IOException;
import java.io.StringReader;
import java.io.File;
import java.io.FileReader;
@@ -26,7 +27,7 @@
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
-import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
+import org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource;
import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
@@ -114,7 +115,7 @@
};
CountingSearchTestTask.numSearches = 0;
- Benchmark benchmark = execBenchmark(algLines);
+ execBenchmark(algLines);
assertTrue(CountingSearchTestTask.numSearches > 0);
long elapsed = CountingSearchTestTask.prevLastMillis - CountingSearchTestTask.startMillis;
assertTrue("elapsed time was " + elapsed + " msec", elapsed <= 1500);
@@ -124,7 +125,7 @@
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"doc.stored=true",
- "doc.maker="+Reuters20DocMaker.class.getName(),
+ "content.source="+Reuters20ContentSource.class.getName(),
"query.maker=" + ReutersQueryMaker.class.getName(),
"ResetSystemErase",
"CreateIndex",
@@ -162,7 +163,7 @@
String algLines[] = {
"doc.stored=true",//doc storage is required in order to have text to highlight
"doc.term.vector.offsets=true",
- "doc.maker="+Reuters20DocMaker.class.getName(),
+ "content.source="+Reuters20ContentSource.class.getName(),
"query.maker=" + ReutersQueryMaker.class.getName(),
"ResetSystemErase",
"CreateIndex",
@@ -199,7 +200,7 @@
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"doc.stored=false",
- "doc.maker="+Reuters20DocMaker.class.getName(),
+ "content.source="+Reuters20ContentSource.class.getName(),
"query.maker=" + ReutersQueryMaker.class.getName(),
"ResetSystemErase",
"CreateIndex",
@@ -227,14 +228,14 @@
/**
* Test Exhausting Doc Maker logic
*/
- public void testExhaustDocMaker() throws Exception {
+ public void testExhaustContentSource() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
- "doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker",
- "doc.add.log.step=1",
+ "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
+ "content.source.log.step=1",
"doc.term.vector=false",
- "doc.maker.forever=false",
+ "content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"doc.tokenized=false",
@@ -274,10 +275,10 @@
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
- "doc.maker="+Reuters20DocMaker.class.getName(),
- "doc.add.log.step=3",
+ "content.source="+Reuters20ContentSource.class.getName(),
+ "content.source.log.step=3",
"doc.term.vector=false",
- "doc.maker.forever=false",
+ "content.source.forever=false",
"directory=FSDirectory",
"doc.stored=false",
"doc.tokenized=false",
@@ -292,7 +293,7 @@
// 3. test number of docs in the index
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
- int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+ int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
@@ -309,8 +310,8 @@
// Creates a line file with first 500 docs from reuters
String algLines1[] = {
"# ----- properties ",
- "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
- "doc.maker.forever=false",
+ "content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource",
+ "content.source.forever=false",
"line.file.out=" + lineFile.getAbsolutePath().replace('\\', '/'),
"# ----- alg ",
"{WriteLineDoc()}:" + NUM_TRY_DOCS,
@@ -335,7 +336,7 @@
"analyzer=org.apache.lucene.analysis.SimpleAnalyzer",
"doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker",
"docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
- "doc.maker.forever=false",
+ "content.source.forever=false",
"doc.reuse.fields=false",
"autocommit=false",
"ram.flush.mb=4",
@@ -373,7 +374,7 @@
String algLines1[] = {
"# ----- properties ",
"analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer",
- "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
+ "content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource",
"# ----- alg ",
"{ReadTokens}: " + NUM_DOCS,
"ResetSystemErase",
@@ -421,10 +422,10 @@
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
- "doc.maker="+Reuters20DocMaker.class.getName(),
- "doc.add.log.step=3",
+ "content.source="+Reuters20ContentSource.class.getName(),
+ "content.source.log.step=3",
"doc.term.vector=false",
- "doc.maker.forever=false",
+ "content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"doc.tokenized=false",
@@ -442,7 +443,7 @@
// 3. test number of docs in the index
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
- int ndocsExpected = 2 * 20; // Reuters20DocMaker exhausts after 20 docs.
+ int ndocsExpected = 2 * 20; // Reuters20ContentSource exhausts after 20 docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
@@ -477,16 +478,19 @@
}
/** use reuters and the exhaust mechanism, but to be faster, add 20 docs only... */
- public static class Reuters20DocMaker extends ReutersDocMaker {
- private int nDocs=0;
- protected synchronized DocData getNextDocData() throws Exception {
- if (nDocs>=20 && !forever) {
+ public static class Reuters20ContentSource extends ReutersContentSource {
+ private int nDocs = 0;
+
+ public synchronized DocData getNextDocData(DocData docData)
+ throws NoMoreDataException, IOException {
+ if (nDocs >= 20 && !forever) {
throw new NoMoreDataException();
}
nDocs++;
- return super.getNextDocData();
+ return super.getNextDocData(docData);
}
- public synchronized void resetInputs() {
+
+ public synchronized void resetInputs() throws IOException {
super.resetInputs();
nDocs = 0;
}
@@ -499,10 +503,10 @@
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
- "doc.maker="+Reuters20DocMaker.class.getName(),
- "doc.add.log.step=3",
+ "content.source="+Reuters20ContentSource.class.getName(),
+ "content.source.log.step=3",
"doc.term.vector=false",
- "doc.maker.forever=false",
+ "content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"doc.tokenized=false",
@@ -521,7 +525,7 @@
// 3. test number of docs in the index
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
- int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+ int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
@@ -533,12 +537,12 @@
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
- "doc.maker="+Reuters20DocMaker.class.getName(),
+ "content.source="+Reuters20ContentSource.class.getName(),
"ram.flush.mb=-1",
"max.buffered=2",
- "doc.add.log.step=3",
+ "content.source.log.step=3",
"doc.term.vector=false",
- "doc.maker.forever=false",
+ "content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"doc.tokenized=false",
@@ -557,7 +561,7 @@
// 3. test number of docs in the index
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
- int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+ int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
@@ -577,10 +581,10 @@
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
- "doc.maker="+Reuters20DocMaker.class.getName(),
- "doc.add.log.step=3",
+ "content.source="+Reuters20ContentSource.class.getName(),
+ "content.source.log.step=3",
"doc.term.vector=false",
- "doc.maker.forever=false",
+ "content.source.forever=false",
"directory=RAMDirectory",
"merge.scheduler=" + MyMergeScheduler.class.getName(),
"doc.stored=false",
@@ -601,7 +605,7 @@
// 3. test number of docs in the index
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
- int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+ int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
@@ -620,12 +624,12 @@
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
- "doc.maker="+Reuters20DocMaker.class.getName(),
- "doc.add.log.step=3",
+ "content.source="+Reuters20ContentSource.class.getName(),
+ "content.source.log.step=3",
"ram.flush.mb=-1",
"max.buffered=2",
"doc.term.vector=false",
- "doc.maker.forever=false",
+ "content.source.forever=false",
"directory=RAMDirectory",
"merge.policy=" + MyMergePolicy.class.getName(),
"doc.stored=false",
@@ -646,7 +650,7 @@
// 3. test number of docs in the index
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
- int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+ int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
@@ -658,13 +662,13 @@
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
- "doc.maker="+Reuters20DocMaker.class.getName(),
- "doc.add.log.step=3",
+ "content.source="+Reuters20ContentSource.class.getName(),
+ "content.source.log.step=3",
"ram.flush.mb=-1",
"max.buffered=2",
"compound=cmpnd:true:false",
"doc.term.vector=vector:false:true",
- "doc.maker.forever=false",
+ "content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"merge.factor=3",
@@ -702,12 +706,12 @@
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
- "doc.maker="+Reuters20DocMaker.class.getName(),
- "doc.add.log.step=3",
+ "content.source="+Reuters20ContentSource.class.getName(),
+ "content.source.log.step=3",
"ram.flush.mb=-1",
"max.buffered=3",
"doc.term.vector=false",
- "doc.maker.forever=false",
+ "content.source.forever=false",
"directory=RAMDirectory",
"merge.policy=org.apache.lucene.index.LogDocMergePolicy",
"doc.stored=false",
@@ -728,7 +732,7 @@
// 3. test number of docs in the index
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
- int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+ int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
@@ -780,10 +784,10 @@
String dis = disable ? "-" : "";
return new String[] {
"# ----- properties ",
- "doc.maker="+Reuters20DocMaker.class.getName(),
- "doc.add.log.step=30",
+ "content.source="+Reuters20ContentSource.class.getName(),
+ "content.source.log.step=30",
"doc.term.vector=false",
- "doc.maker.forever=false",
+ "content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"doc.tokenized=false",
Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java
===================================================================
--- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java (revision 784670)
+++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java (working copy)
@@ -111,35 +111,11 @@
doIndexAndSearchTest(file, false, null);
}
- public void testBZip2WithBzipCompressionDisabled() throws Exception {
- File file = new File(getWorkDir(), "one-line.bz2");
- createBZ2LineFile(file);
-
- try {
- doIndexAndSearchTest(file, true, "false");
- fail("Some exception should have been thrown !");
- } catch (Exception e) {
- // expected.
- }
- }
-
public void testRegularFile() throws Exception {
File file = new File(getWorkDir(), "one-line");
createRegularLineFile(file);
doIndexAndSearchTest(file, false, null);
}
-
- public void testRegularFileWithBZipCompressionEnabled() throws Exception {
- File file = new File(getWorkDir(), "one-line");
- createRegularLineFile(file);
-
- try {
- doIndexAndSearchTest(file, true, "true");
- fail("Some exception should have been thrown !");
- } catch (Exception e) {
- // expected.
- }
- }
public void testInvalidFormat() throws Exception {
String[] testCases = new String[] {
Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
===================================================================
--- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java (revision 0)
+++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java (revision 0)
@@ -0,0 +1,332 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.ParseException;
+import java.util.Date;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.DateTools;
+
+public class TrecContentSourceTest extends TestCase {
+
+ /** A TrecContentSource which works on a String and not files. */
+ private static class StringableTrecSource extends TrecContentSource {
+
+ private String docs = null;
+
+ public StringableTrecSource(String docs, boolean forever) {
+ this.docs = docs;
+ this.forever = forever;
+ }
+
+ protected void openNextFile() throws NoMoreDataException, IOException {
+ if (reader != null) {
+ if (!forever) {
+ throw new NoMoreDataException();
+ }
+ ++iteration;
+ }
+
+ reader = new BufferedReader(new StringReader(docs));
+ }
+
+ public void setConfig(Config config) {
+ htmlParser = new DemoHTMLParser();
+ }
+ }
+
+ private void assertDocData(DocData dd, String expName, String expTitle,
+ String expBody, Date expDate)
+ throws ParseException {
+ assertNotNull(dd);
+ assertEquals(expName, dd.getName());
+ assertEquals(expTitle, dd.getTitle());
+ assertTrue(dd.getBody().indexOf(expBody) != -1);
+ Date date = dd.getDate() != null ? DateTools.stringToDate(dd.getDate()) : null;
+ assertEquals(expDate, date);
+ }
+
+ private void assertNoMoreDataException(StringableTrecSource stdm) throws Exception {
+ boolean thrown = false;
+ try {
+ stdm.getNextDocData(null);
+ } catch (NoMoreDataException e) {
+ thrown = true;
+ }
+ assertTrue("Expecting NoMoreDataException", thrown);
+ }
+
+ public void testOneDocument() throws Exception {
+ String docs = "\r\n" +
+ "TEST-000 \r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 title\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ " ";
+ StringableTrecSource source = new StringableTrecSource(docs, false);
+ source.setConfig(null);
+
+ DocData dd = source.getNextDocData(new DocData());
+ assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
+ .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+
+ assertNoMoreDataException(source);
+ }
+
+ public void testTwoDocuments() throws Exception {
+ String docs = "\r\n" +
+ "TEST-000 \r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 title\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "TEST-001 \r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2008 08:01:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-001 title\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-001 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ " ";
+ StringableTrecSource source = new StringableTrecSource(docs, false);
+ source.setConfig(null);
+
+ DocData dd = source.getNextDocData(new DocData());
+ assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
+ .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+
+ dd = source.getNextDocData(dd);
+ assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
+ .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
+
+ assertNoMoreDataException(source);
+ }
+
+ // If a Date: attribute is missing, make sure the document is not skipped, but
+ // rather that a null Date is assigned.
+ public void testMissingDate() throws Exception {
+ String docs = "\r\n" +
+ "TEST-000 \r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 title\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "TEST-001 \r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-001 title\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-001 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ " ";
+ StringableTrecSource source = new StringableTrecSource(docs, false);
+ source.setConfig(null);
+
+ DocData dd = source.getNextDocData(new DocData());
+ assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
+
+ dd = source.getNextDocData(dd);
+ assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
+ .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
+
+ assertNoMoreDataException(source);
+ }
+
+ // When a 'bad date' is input (unparsable date), make sure the DocData date is
+ // assigned null.
+ public void testBadDate() throws Exception {
+ String docs = "\r\n" +
+ "TEST-000 \r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Date: Bad Date\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 title\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ " ";
+ StringableTrecSource source = new StringableTrecSource(docs, false);
+ source.setConfig(null);
+
+ DocData dd = source.getNextDocData(new DocData());
+ assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
+
+ assertNoMoreDataException(source);
+ }
+
+ public void testForever() throws Exception {
+ String docs = "\r\n" +
+ "TEST-000 \r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 title\r\n" +
+ " \r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ " ";
+ StringableTrecSource source = new StringableTrecSource(docs, true);
+ source.setConfig(null);
+
+ DocData dd = source.getNextDocData(new DocData());
+ assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
+ .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+
+ // same document, but the second iteration changes the name.
+ dd = source.getNextDocData(dd);
+ assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", source
+ .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+
+ // Don't test that NoMoreDataException is thrown, since the forever flag is
+ // turned on.
+ }
+
+}
Property changes on: contrib\benchmark\src\test\org\apache\lucene\benchmark\byTask\feeds\TrecContentSourceTest.java
___________________________________________________________________
Added: svn:eol-style
+ native
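The tests above also demonstrate the new pull model: callers hand the same DocData back to getNextDocData() for reuse and stop on NoMoreDataException, which is thrown once the feed is exhausted and content.source.forever=false. A condensed sketch of that consumption loop (the helper class below is illustrative):

  import java.io.IOException;

  import org.apache.lucene.benchmark.byTask.feeds.ContentSource;
  import org.apache.lucene.benchmark.byTask.feeds.DocData;
  import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;

  // Illustrative consumption loop: the same DocData instance is passed back
  // to the source on every call so it can be reused, not reallocated.
  public class ContentSourceDrainSketch {
    public static int drain(ContentSource source) throws IOException {
      DocData dd = new DocData();
      int count = 0;
      try {
        while (true) {
          dd = source.getNextDocData(dd);
          count++;
        }
      } catch (NoMoreDataException e) {
        // expected once the feed is exhausted (content.source.forever=false)
      }
      return count;
    }
  }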
Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java
===================================================================
--- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java (revision 784670)
+++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java (working copy)
@@ -1,321 +0,0 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.BufferedReader;
-import java.io.StringReader;
-import java.util.Date;
-
-import junit.framework.TestCase;
-
-public class TrecDocMakerTest extends TestCase {
-
- /** A TrecDocMaker which works on a String and not files. */
- private static class StringableTrecDocMaker extends TrecDocMaker {
-
- private String docs = null;
-
- public StringableTrecDocMaker(String docs, boolean forever) {
- this.docs = docs;
- this.forever = forever;
- }
-
- protected void openNextFile() throws NoMoreDataException, Exception {
- if (reader != null) {
- if (!forever) {
- throw new NoMoreDataException();
- }
- ++iteration;
- }
-
- reader = new BufferedReader(new StringReader(docs));
- }
-
- }
-
- private void assertDocData(DocData dd, String expName, String expTitle, String expBody, Date expDate) {
- assertNotNull(dd);
- assertEquals(expName, dd.getName());
- assertEquals(expTitle, dd.getTitle());
- assertTrue(dd.getBody().indexOf(expBody) != -1);
- assertEquals(expDate, dd.getDate());
- }
-
- private void assertNoMoreDataException(StringableTrecDocMaker stdm) throws Exception {
- boolean thrown = false;
- try {
- stdm.getNextDocData();
- } catch (NoMoreDataException e) {
- thrown = true;
- }
- assertTrue("Expecting NoMoreDataException", thrown);
- }
-
- public void testOneDocument() throws Exception {
- String docs = "\r\n" +
- "TEST-000 \r\n" +
- "\r\n" +
- "http://lucene.apache.org.trecdocmaker.test\r\n" +
- "HTTP/1.1 200 OK\r\n" +
- "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
- "Server: Apache/1.3.27 (Unix)\r\n" +
- "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
- "Content-Length: 614\r\n" +
- "Connection: close\r\n" +
- "Content-Type: text/html\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-000 title\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-000 text\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- " ";
- StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
- stdm.setHTMLParser(new DemoHTMLParser());
-
- DocData dd = stdm.getNextDocData();
- assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
- .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
-
- assertNoMoreDataException(stdm);
- }
-
- public void testTwoDocuments() throws Exception {
- String docs = "\r\n" +
- "TEST-000 \r\n" +
- "\r\n" +
- "http://lucene.apache.org.trecdocmaker.test\r\n" +
- "HTTP/1.1 200 OK\r\n" +
- "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
- "Server: Apache/1.3.27 (Unix)\r\n" +
- "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
- "Content-Length: 614\r\n" +
- "Connection: close\r\n" +
- "Content-Type: text/html\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-000 title\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-000 text\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- " \r\n" +
- "\r\n" +
- "TEST-001 \r\n" +
- "\r\n" +
- "http://lucene.apache.org.trecdocmaker.test\r\n" +
- "HTTP/1.1 200 OK\r\n" +
- "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
- "Server: Apache/1.3.27 (Unix)\r\n" +
- "Last-Modified: Sun, 11 Jan 2008 08:01:00 GMT\r\n" +
- "Content-Length: 614\r\n" +
- "Connection: close\r\n" +
- "Content-Type: text/html\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-001 title\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-001 text\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- " ";
- StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
- stdm.setHTMLParser(new DemoHTMLParser());
-
- DocData dd = stdm.getNextDocData();
- assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
- .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
-
- dd = stdm.getNextDocData();
- assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm
- .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
-
- assertNoMoreDataException(stdm);
- }
-
- // If a Date: attribute is missing, make sure the document is not skipped, but
- // rather that a null Date is assigned.
- public void testMissingDate() throws Exception {
- String docs = "\r\n" +
- "TEST-000 \r\n" +
- "\r\n" +
- "http://lucene.apache.org.trecdocmaker.test\r\n" +
- "HTTP/1.1 200 OK\r\n" +
- "Server: Apache/1.3.27 (Unix)\r\n" +
- "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
- "Content-Length: 614\r\n" +
- "Connection: close\r\n" +
- "Content-Type: text/html\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-000 title\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-000 text\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- " \r\n" +
- "\r\n" +
- "TEST-001 \r\n" +
- "\r\n" +
- "http://lucene.apache.org.trecdocmaker.test\r\n" +
- "HTTP/1.1 200 OK\r\n" +
- "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
- "Server: Apache/1.3.27 (Unix)\r\n" +
- "Last-Modified: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
- "Content-Length: 614\r\n" +
- "Connection: close\r\n" +
- "Content-Type: text/html\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-001 title\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-001 text\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- " ";
- StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
- stdm.setHTMLParser(new DemoHTMLParser());
-
- DocData dd = stdm.getNextDocData();
- assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
-
- dd = stdm.getNextDocData();
- assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm
- .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
-
- assertNoMoreDataException(stdm);
- }
-
- // When a 'bad date' is input (unparsable date), make sure the DocData date is
- // assigned null.
- public void testBadDate() throws Exception {
- String docs = "\r\n" +
- "TEST-000 \r\n" +
- "\r\n" +
- "http://lucene.apache.org.trecdocmaker.test\r\n" +
- "HTTP/1.1 200 OK\r\n" +
- "Date: Bad Date\r\n" +
- "Server: Apache/1.3.27 (Unix)\r\n" +
- "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
- "Content-Length: 614\r\n" +
- "Connection: close\r\n" +
- "Content-Type: text/html\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-000 title\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-000 text\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- " ";
- StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
- stdm.setHTMLParser(new DemoHTMLParser());
-
- DocData dd = stdm.getNextDocData();
- assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
-
- assertNoMoreDataException(stdm);
- }
-
- public void testForever() throws Exception {
- String docs = "\r\n" +
- "TEST-000 \r\n" +
- "\r\n" +
- "http://lucene.apache.org.trecdocmaker.test\r\n" +
- "HTTP/1.1 200 OK\r\n" +
- "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
- "Server: Apache/1.3.27 (Unix)\r\n" +
- "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
- "Content-Length: 614\r\n" +
- "Connection: close\r\n" +
- "Content-Type: text/html\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-000 title\r\n" +
- " \r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- "TEST-000 text\r\n" +
- "\r\n" +
- "\r\n" +
- "\r\n" +
- " ";
- StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, true);
- stdm.setHTMLParser(new DemoHTMLParser());
-
- DocData dd = stdm.getNextDocData();
- assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
- .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
-
- // same document, but the second iteration changes the name.
- dd = stdm.getNextDocData();
- assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", stdm
- .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
-
- // Don't test that NoMoreDataException is thrown, since the forever flag is
- // turned on.
- }
-
-}
Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
===================================================================
--- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java (revision 784670)
+++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java (working copy)
@@ -27,8 +27,8 @@
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
@@ -40,7 +40,7 @@
public class WriteLineDocTaskTest extends BenchmarkTestCase {
// class has to be public so that Class.forName.newInstance() will work
- public static final class WriteLineDocMaker extends BasicDocMaker {
+ public static final class WriteLineDocMaker extends DocMaker {
protected DocData getNextDocData() throws NoMoreDataException, Exception {
throw new UnsupportedOperationException("not implemented");
Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java
===================================================================
--- contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java (revision 784670)
+++ contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java (working copy)
@@ -23,7 +23,7 @@
import java.io.PrintWriter;
import org.apache.lucene.benchmark.byTask.TestPerfTasksLogic;
-import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
+import org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource;
import org.apache.lucene.benchmark.quality.Judge;
import org.apache.lucene.benchmark.quality.QualityQuery;
import org.apache.lucene.benchmark.quality.QualityQueryParser;
@@ -155,10 +155,10 @@
// 1. alg definition
String algLines[] = {
"# ----- properties ",
- "doc.maker="+ReutersDocMaker.class.getName(),
- "doc.add.log.step=2500",
+ "content.source="+ReutersContentSource.class.getName(),
+ "content.source.log.step=2500",
"doc.term.vector=false",
- "doc.maker.forever=false",
+ "content.source.forever=false",
"directory=FSDirectory",
"doc.stored=true",
"doc.tokenized=true",
Index: src/java/org/apache/lucene/document/Document.java
===================================================================
--- src/java/org/apache/lucene/document/Document.java (revision 784670)
+++ src/java/org/apache/lucene/document/Document.java (working copy)
@@ -88,7 +88,12 @@
public final void add(Fieldable field) {
fields.add(field);
}
-
+
+ /** Removes all fields from the document. */
+ public final void clear() {
+ fields.clear();
+ }
+
/**
* Removes field with the specified name from the document.
* If multiple fields exist with this name, this method removes the first field that has been added.
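The new clear() is what makes field reuse practical: a single Document instance can be emptied and repopulated for each record instead of being reallocated. A minimal sketch of that pattern (field names and flags below are illustrative):

  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;

  // Illustrative reuse pattern built on Document.clear(): one Document is
  // recycled across records rather than constructed per record.
  public class DocumentReuseSketch {
    public static Document refill(Document doc, String id, String body) {
      doc.clear(); // drop all fields left over from the previous record
      doc.add(new Field("docid", id, Field.Store.YES, Field.Index.NOT_ANALYZED));
      doc.add(new Field("body", body, Field.Store.NO, Field.Index.ANALYZED));
      return doc;
    }
  }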