Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 522262)
+++ CHANGES.txt (working copy)
@@ -4,6 +4,13 @@
$Id:$
+3/25/07
+
+LUCENE-849:
+1. The HTML parser to use is now configurable via the html.parser property.
+2. External classes added to classpath with -Dbenchmark.ext.classpath=path.
+3. '*' as repeating number now means "exhaust doc maker - no repetitions".
+
3/22/07
-Moved withRetrieve() call out of the loop in ReadTask
Index: src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
===================================================================
--- src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (revision 522262)
+++ src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (working copy)
@@ -81,6 +81,49 @@
assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
}
+ /**
+ * Test Exhausting Doc Maker logic
+ */
+ public void testExhaustDocMaker() throws Exception {
+ // 1. alg definition (required in every "logic" test)
+ String algLines[] = {
+ "# ----- properties ",
+ "doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker",
+ "doc.add.log.step=1",
+ "doc.term.vector=false",
+ "doc.maker.forever=false",
+ "directory=RAMDirectory",
+ "doc.stored=false",
+ "doc.tokenized=false",
+ "# ----- alg ",
+ "CreateIndex",
+ "{ AddDoc } : * ",
+ "Optimize",
+ "CloseIndex",
+ "OpenReader",
+ "{ CountingSearchTest } : 100",
+ "CloseReader",
+ "[ CountingSearchTest > : 30",
+ "[ CountingSearchTest > : 9",
+ };
+
+ // 2. we test this value later
+ CountingSearchTestTask.numSearches = 0;
+
+ // 3. execute the algorithm (required in every "logic" test)
+ Benchmark benchmark = execBenchmark(algLines);
+
+ // 4. test specific checks after the benchmark run completed.
+ assertEquals("TestSearchTask was supposed to be called!",139,CountingSearchTestTask.numSearches);
+ assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
+ // now we should be able to open the index for write.
+ IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false);
+ iw.close();
+ IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
+ assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs());
+ }
+
+
// create the benchmark and execute it.
private Benchmark execBenchmark(String[] algLines) throws Exception {
String algText = algLinesToText(algLines);
Index: src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java
===================================================================
--- src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java (revision 522262)
+++ src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java (working copy)
@@ -117,8 +117,12 @@
colonOk = false;
// get repetitions number
stok.nextToken();
- if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString());
- ((TaskSequence)prevTask).setRepetitions((int)stok.nval);
+ if ((char)stok.ttype == '*') {
+ ((TaskSequence)prevTask).setRepetitions(TaskSequence.REPEAT_EXHAUST);
+ } else {
+ if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString());
+ ((TaskSequence)prevTask).setRepetitions((int)stok.nval);
+ }
// check for rate specification (ops/min)
stok.nextToken();
if (stok.ttype!=':') {
Index: src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
===================================================================
--- src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (revision 522262)
+++ src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (working copy)
@@ -21,11 +21,13 @@
import java.util.Iterator;
import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
/**
* Sequence of parallel or sequential tasks.
*/
public class TaskSequence extends PerfTask {
+ public static int REPEAT_EXHAUST = -2;
private ArrayList tasks;
private int repetitions = 1;
private boolean parallel;
@@ -61,9 +63,13 @@
/**
* @param repetitions The repetitions to set.
+ * @throws Exception
*/
- public void setRepetitions(int repetitions) {
+ public void setRepetitions(int repetitions) throws Exception {
this.repetitions = repetitions;
+ if (repetitions==REPEAT_EXHAUST && isParallel()) {
+ throw new Exception("REPEAT_EXHAUST is not allowed for parallel tasks");
+ }
setSequenceName();
}
@@ -88,10 +94,15 @@
}
int count = 0;
- for (int k=0; k1) {
sb.append(" * " + repetitions);
}
+ if (repetitions==REPEAT_EXHAUST) {
+ sb.append(" * EXHAUST");
+ }
if (rate>0) {
sb.append(", rate: " + rate+"/"+(perMin?"min":"sec"));
}
@@ -237,7 +256,9 @@
private void setSequenceName() {
seqName = super.getName();
- if (repetitions>1) {
+ if (repetitions==REPEAT_EXHAUST) {
+ seqName += "_Exhaust";
+ } else if (repetitions>1) {
seqName += "_"+repetitions;
}
if (rate>0) {
Index: src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java
===================================================================
--- src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java (revision 522262)
+++ src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java (working copy)
@@ -25,6 +25,7 @@
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
+import java.util.Date;
import java.util.Locale;
@@ -66,13 +67,16 @@
File f = null;
String name = null;
synchronized (this) {
- f = (File) inputFiles.get(nextFile++);
- name = f.getCanonicalPath()+"_"+iteration;
if (nextFile >= inputFiles.size()) {
- // exhausted files, start a new round
+ // exhausted files, start a new round, unless forever set to false.
+ if (!forever) {
+ throw new NoMoreDataException();
+ }
nextFile = 0;
iteration++;
}
+ f = (File) inputFiles.get(nextFile++);
+ name = f.getCanonicalPath()+"_"+iteration;
}
BufferedReader reader = new BufferedReader(new FileReader(f));
@@ -90,13 +94,9 @@
addBytes(f.length());
- DocData dd = new DocData();
- dd.date = dateFormat.parse(dateStr.trim());
- dd.name = name;
- dd.title = title;
- dd.body = bodyBuf.toString();
- return dd;
+ Date date = dateFormat.parse(dateStr.trim());
+ return new DocData(name, bodyBuf.toString(), title, null, date);
}
Index: src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java
===================================================================
--- src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java (revision 522262)
+++ src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java (working copy)
@@ -18,7 +18,7 @@
*/
/**
- * Create documents for the test
+ * Create documents for the test.
*/
public class SimpleDocMaker extends BasicDocMaker {
@@ -58,12 +58,12 @@
return 0; // not applicable
}
- protected DocData getNextDocData() {
- DocData dd = new DocData();
- dd.body = DOC_TEXT;
- dd.name = "doc"+newdocid();
+ protected DocData getNextDocData() throws NoMoreDataException {
+ if (docID>0 && !forever) {
+ throw new NoMoreDataException();
+ }
addBytes(DOC_TEXT.length());
- return dd;
+ return new DocData("doc"+newdocid(),DOC_TEXT, null, null, null);
}
}
Index: src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java
===================================================================
--- src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java (revision 0)
+++ src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java (revision 0)
@@ -0,0 +1,113 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Date;
+import java.util.Properties;
+
+/**
+ * Output of parsing (e.g. HTML parsing) of an input document.
+ */
+
+public class DocData {
+
+ private String name;
+ private String body;
+ private String title;
+ private Date date;
+ private Properties props;
+
+ public DocData(String name, String body, String title, Properties props, Date date) {
+ this.name = name;
+ this.body = body;
+ this.title = title;
+ this.date = date;
+ this.props = props;
+ }
+
+ /**
+ * @return Returns the name.
+ */
+ public String getName() {
+ return name;
+ }
+
+ /**
+ * @param name The name to set.
+ */
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ /**
+ * @return Returns the props.
+ */
+ public Properties getProps() {
+ return props;
+ }
+
+ /**
+ * @param props The props to set.
+ */
+ public void setProps(Properties props) {
+ this.props = props;
+ }
+
+ /**
+ * @return Returns the body.
+ */
+ public String getBody() {
+ return body;
+ }
+
+ /**
+ * @param body The body to set.
+ */
+ public void setBody(String body) {
+ this.body = body;
+ }
+
+ /**
+ * @return Returns the title.
+ */
+ public String getTitle() {
+ return title;
+ }
+
+ /**
+ * @param title The title to set.
+ */
+ public void setTitle(String title) {
+ this.title = title;
+ }
+
+ /**
+ * @return Returns the date.
+ */
+ public Date getDate() {
+ return date;
+ }
+
+ /**
+ * @param date The date to set.
+ */
+ public void setDate(Date date) {
+ this.date = date;
+ }
+
+}
Property changes on: src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java
___________________________________________________________________
Name: svn:executable
+ *
Name: svn:eol-style
+ native
Index: src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java
===================================================================
--- src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java (revision 522262)
+++ src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java (working copy)
@@ -23,19 +23,15 @@
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
-import java.io.Reader;
-import java.io.StringReader;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
-import java.util.Properties;
import java.util.zip.GZIPInputStream;
import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.demo.html.HTMLParser;
/**
@@ -45,7 +41,7 @@
private static final String newline = System.getProperty("line.separator");
- private DateFormat dateFormat;
+ private DateFormat dateFormat [];
private File dataDir = null;
private ArrayList inputFiles = new ArrayList();
private int nextFile = 0;
@@ -53,6 +49,13 @@
private BufferedReader reader;
private GZIPInputStream zis;
+ private static final String DATE_FORMATS [] = {
+ "EEE, dd MMM yyyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
+ "EEE MMM dd kk:mm:ss yyyy z", //Tue Dec 09 16:45:08 2003 EST
+ "EEE, dd-MMM-':'y kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
+ "EEE, dd-MMM-yyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
+ };
+
/* (non-Javadoc)
* @see SimpleDocMaker#setConfig(java.util.Properties)
*/
@@ -65,34 +68,44 @@
throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
}
// date format: 30-MAR-1987 14:22:36.87
- dateFormat = new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ",Locale.US); //Tue, 09 Dec 2003 22:39:08 GMT
- dateFormat.setLenient(true);
- }
+ dateFormat = new SimpleDateFormat[DATE_FORMATS.length];
+ for (int i = 0; i < dateFormat.length; i++) {
+ dateFormat[i] = new SimpleDateFormat(DATE_FORMATS[i],Locale.US);
+ dateFormat[i].setLenient(true);
+ }
+ }
- private void openNextFile() throws Exception {
+ private void openNextFile() throws NoMoreDataException, Exception {
closeInputs();
int retries = 0;
- while (retries<20) {
+ while (true) {
File f = null;
synchronized (this) {
- f = (File) inputFiles.get(nextFile++);
if (nextFile >= inputFiles.size()) {
- // exhausted files, start a new round
+ // exhausted files, start a new round, unless forever set to false.
+ if (!forever) {
+ throw new NoMoreDataException();
+ }
nextFile = 0;
iteration++;
}
+ f = (File) inputFiles.get(nextFile++);
}
System.out.println("opening: "+f+" length: "+f.length());
try {
zis = new GZIPInputStream(new BufferedInputStream(new FileInputStream(f)));
- break;
+ reader = new BufferedReader(new InputStreamReader(zis));
+ return;
} catch (Exception e) {
retries++;
- System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+" #retries="+retries);
- continue;
+ if (retries<20) {
+ System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+" #retries="+retries);
+ continue;
+ } else {
+ throw new NoMoreDataException();
+ }
}
}
- reader = new BufferedReader(new InputStreamReader(zis));
}
private void closeInputs() {
@@ -142,7 +155,7 @@
return sb;
}
- protected DocData getNextDocData() throws Exception {
+ protected DocData getNextDocData() throws NoMoreDataException, Exception {
if (reader==null) {
openNextFile();
}
@@ -162,39 +175,27 @@
// 6. collect until end of doc
sb = read("",null,false,true);
// this is the next document, so parse it
- // TODO use a more robust html parser (current one aborts parsing quite easily).
- HTMLParser p = new HTMLParser(new StringReader(sb.toString()));
- // title
- String title = p.getTitle();
- // properties
- Properties props = p.getMetaTags();
- // body
- Reader r = p.getReader();
- char c[] = new char[1024];
- StringBuffer bodyBuf = new StringBuffer();
- int n;
- while ((n = r.read(c)) >= 0) {
- if (n>0) {
- bodyBuf.append(c,0,n);
- }
- }
- r.close();
- addBytes(bodyBuf.length());
+ Date date = parseDate(dateStr);
+ HTMLParser p = getHtmlParser();
+ DocData docData = p.parse(name, date, sb, dateFormat[0]);
+ addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text).
- DocData dd = new DocData();
+ return docData;
+ }
- try {
- dd.date = dateFormat.parse(dateStr.trim());
- } catch (ParseException e) {
- // do not fail test just because a date could not be parsed
- System.out.println("ignoring date parse exception (assigning 'now') for: "+dateStr);
- dd.date = new Date(); // now
+ private Date parseDate(String dateStr) {
+ Date date = null;
+ for (int i=0; i= 0) {
+ if (n>0) {
+ bodyBuf.append(c,0,n);
+ }
+ }
+ r.close();
+ if (date == null && props.getProperty("date")!=null) {
+ try {
+ date = dateFormat.parse(props.getProperty("date").trim());
+ } catch (ParseException e) {
+ // do not fail test just because a date could not be parsed
+ System.out.println("ignoring date parse exception (assigning 'now') for: "+props.getProperty("date"));
+ date = new Date(); // now
+ }
+ }
+
+ return new DocData(name, bodyBuf.toString(), title, props, date);
+ }
+
+ public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException {
+ // TODO Auto-generated method stub
+ return parse(name, date, new StringReader(inputText.toString()), dateFormat);
+ }
+
+}
Property changes on: src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
___________________________________________________________________
Name: svn:executable
+ *
Name: svn:eol-style
+ native
Index: src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
===================================================================
--- src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java (revision 522262)
+++ src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java (working copy)
@@ -26,9 +26,7 @@
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
-import java.util.Date;
import java.util.Iterator;
-import java.util.Properties;
/**
@@ -47,15 +45,8 @@
private int numDocsCreated = 0;
private boolean storeBytes = false;
+ protected boolean forever;
- static class DocData {
- String name;
- Date date;
- String title;
- String body;
- Properties props;
- }
-
private static class LeftOver {
private DocData docdata;
private int cnt;
@@ -80,10 +71,14 @@
/**
* Return the data of the next document.
+ * All current implementations can create docs forever.
+ * When the input data is exhausted, input files are iterated.
+ * This re-iteration can be avoided by setting doc.maker.forever to false (default is true).
* @return data of the next document.
* @exception if cannot create the next doc data
+ * @exception NoMoreDataException if data is exhausted (and 'forever' set to false).
*/
- protected abstract DocData getNextDocData() throws Exception;
+ protected abstract DocData getNextDocData() throws NoMoreDataException, Exception;
/*
* (non-Javadoc)
@@ -103,32 +98,32 @@
int docid = incrNumDocsCreated();
Document doc = new Document();
doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal));
- if (docData.name!=null) {
- String name = (cnt<0 ? docData.name : docData.name+"_"+cnt);
+ if (docData.getName()!=null) {
+ String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt);
doc.add(new Field("docname", name, storeVal, indexVal, termVecVal));
}
- if (docData.date!=null) {
- String dateStr = DateTools.dateToString(docData.date, DateTools.Resolution.SECOND);
+ if (docData.getDate()!=null) {
+ String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);
doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal));
}
- if (docData.title!=null) {
- doc.add(new Field("doctitle", docData.title, storeVal, indexVal, termVecVal));
+ if (docData.getTitle()!=null) {
+ doc.add(new Field("doctitle", docData.getTitle(), storeVal, indexVal, termVecVal));
}
- if (docData.body!=null && docData.body.length()>0) {
+ if (docData.getBody()!=null && docData.getBody().length()>0) {
String bdy;
- if (size<=0 || size>=docData.body.length()) {
- bdy = docData.body; // use all
- docData.body = ""; // nothing left
+ if (size<=0 || size>=docData.getBody().length()) {
+ bdy = docData.getBody(); // use all
+ docData.setBody(""); // nothing left
} else {
// attempt not to break words - if whitespace found within next 20 chars...
- for (int n=size-1; n
+
+External classes: It is sometimes useful to invoke the benchmark
+package with your external alg file that configures the use of your own
+doc/query maker and/or HTML parser. You can do this without
+modifying the benchmark package code, by passing your class path
+with the benchmark.ext.classpath property:
+
+ - ant run-task -Dtask.alg=[full-path-to-your-alg-file]
+ -Dbenchmark.ext.classpath=/mydir/classes
+ -Dtask.mem=512M
+
+
+
Benchmark "algorithm"
@@ -198,6 +211,14 @@
30 times in a row.
Example - { AddDoc AddDoc } : 30 - would do
addDoc 60 times in a row.
+
Exhaustive repeating: use * instead of
+ a number to repeat until the doc maker is exhausted.
+ This is sometimes useful for adding as many files as a doc maker can create,
+ without iterating over the same files again, especially when the exact
+ number of files is not known in advance. For instance, TREC files extracted
+ from a zip file.
+
Example - { AddDoc } : * - would add docs
+ until the doc maker is "exhausted".
Command parameter: a command can optionally take a single parameter.
@@ -487,6 +508,8 @@
Docs and queries creation:
- analyzer
- doc.maker
+
- doc.maker.forever
+
- html.parser
- doc.stored
- doc.tokenized
- doc.term.vector
Index: build.xml
===================================================================
--- build.xml (revision 522262)
+++ build.xml (working copy)
@@ -97,6 +97,7 @@
+