Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 522262) +++ CHANGES.txt (working copy) @@ -4,6 +4,13 @@ $Id:$ +3/25/07 + +LUCENE-849: +1. which HTML Parser is used is configurable with html.parser property. +2. External classes added to classpath with -Dbenchmark.ext.classpath=path. +3. '*' as repeating number now means "exhaust doc maker - no repetitions". + 3/22/07 -Moved withRetrieve() call out of the loop in ReadTask Index: src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java =================================================================== --- src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (revision 522262) +++ src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (working copy) @@ -81,6 +81,49 @@ assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs()); } + /** + * Test Exhasting Doc Maker logic + */ + public void testExhaustDocMaker() throws Exception { + // 1. alg definition (required in every "logic" test) + String algLines[] = { + "# ----- properties ", + "doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker", + "doc.add.log.step=1", + "doc.term.vector=false", + "doc.maker.forever=false", + "directory=RAMDirectory", + "doc.stored=false", + "doc.tokenized=false", + "# ----- alg ", + "CreateIndex", + "{ AddDoc } : * ", + "Optimize", + "CloseIndex", + "OpenReader", + "{ CountingSearchTest } : 100", + "CloseReader", + "[ CountingSearchTest > : 30", + "[ CountingSearchTest > : 9", + }; + + // 2. we test this value later + CountingSearchTestTask.numSearches = 0; + + // 3. execute the algorithm (required in every "logic" test) + Benchmark benchmark = execBenchmark(algLines); + + // 4. test specific checks after the benchmark run completed. 
+ assertEquals("TestSearchTask was supposed to be called!",139,CountingSearchTestTask.numSearches); + assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory())); + // now we should be able to open the index for write. + IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false); + iw.close(); + IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); + assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs()); + } + + // create the benchmark and execute it. private Benchmark execBenchmark(String[] algLines) throws Exception { String algText = algLinesToText(algLines); Index: src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java =================================================================== --- src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java (revision 522262) +++ src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java (working copy) @@ -117,8 +117,12 @@ colonOk = false; // get repetitions number stok.nextToken(); - if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString()); - ((TaskSequence)prevTask).setRepetitions((int)stok.nval); + if ((char)stok.ttype == '*') { + ((TaskSequence)prevTask).setRepetitions(TaskSequence.REPEAT_EXHAUST); + } else { + if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString()); + ((TaskSequence)prevTask).setRepetitions((int)stok.nval); + } // check for rate specification (ops/min) stok.nextToken(); if (stok.ttype!=':') { Index: src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java =================================================================== --- src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (revision 522262) +++ src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (working copy) @@ -21,11 +21,13 @@ import 
java.util.Iterator; import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; /** * Sequence of parallel or sequential tasks. */ public class TaskSequence extends PerfTask { + public static int REPEAT_EXHAUST = -2; private ArrayList tasks; private int repetitions = 1; private boolean parallel; @@ -61,9 +63,13 @@ /** * @param repetitions The repetitions to set. + * @throws Exception */ - public void setRepetitions(int repetitions) { + public void setRepetitions(int repetitions) throws Exception { this.repetitions = repetitions; + if (repetitions==REPEAT_EXHAUST && isParallel()) { + throw new Exception("REPEAT_EXHAUST is not allowed for parallel tasks"); + } setSequenceName(); } @@ -88,10 +94,15 @@ } int count = 0; - for (int k=0; k1) { sb.append(" * " + repetitions); } + if (repetitions==REPEAT_EXHAUST) { + sb.append(" * EXHAUST"); + } if (rate>0) { sb.append(", rate: " + rate+"/"+(perMin?"min":"sec")); } @@ -237,7 +256,9 @@ private void setSequenceName() { seqName = super.getName(); - if (repetitions>1) { + if (repetitions==REPEAT_EXHAUST) { + seqName += "_Exhaust"; + } else if (repetitions>1) { seqName += "_"+repetitions; } if (rate>0) { Index: src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java =================================================================== --- src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java (revision 522262) +++ src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java (working copy) @@ -25,6 +25,7 @@ import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; +import java.util.Date; import java.util.Locale; @@ -66,13 +67,16 @@ File f = null; String name = null; synchronized (this) { - f = (File) inputFiles.get(nextFile++); - name = f.getCanonicalPath()+"_"+iteration; if (nextFile >= inputFiles.size()) { - // exhausted files, start a new round + // exhausted files, start a new round, unless 
forever set to false. + if (!forever) { + throw new NoMoreDataException(); + } nextFile = 0; iteration++; } + f = (File) inputFiles.get(nextFile++); + name = f.getCanonicalPath()+"_"+iteration; } BufferedReader reader = new BufferedReader(new FileReader(f)); @@ -90,13 +94,9 @@ addBytes(f.length()); - DocData dd = new DocData(); - dd.date = dateFormat.parse(dateStr.trim()); - dd.name = name; - dd.title = title; - dd.body = bodyBuf.toString(); - return dd; + Date date = dateFormat.parse(dateStr.trim()); + return new DocData(name, bodyBuf.toString(), title, null, date); } Index: src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java =================================================================== --- src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java (revision 522262) +++ src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java (working copy) @@ -18,7 +18,7 @@ */ /** - * Create documents for the test + * Create documents for the test. */ public class SimpleDocMaker extends BasicDocMaker { @@ -58,12 +58,12 @@ return 0; // not applicable } - protected DocData getNextDocData() { - DocData dd = new DocData(); - dd.body = DOC_TEXT; - dd.name = "doc"+newdocid(); + protected DocData getNextDocData() throws NoMoreDataException { + if (docID>0 && !forever) { + throw new NoMoreDataException(); + } addBytes(DOC_TEXT.length()); - return dd; + return new DocData("doc"+newdocid(),DOC_TEXT, null, null, null); } } Index: src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java =================================================================== --- src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java (revision 0) +++ src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java (revision 0) @@ -0,0 +1,113 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Date; +import java.util.Properties; + +/** + * Output of parsing (e.g. HTML parsing) of an input document. + */ + +public class DocData { + + private String name; + private String body; + private String title; + private Date date; + private Properties props; + + public DocData(String name, String body, String title, Properties props, Date date) { + this.name = name; + this.body = body; + this.title = title; + this.date = date; + this.props = props; + } + + /** + * @return Returns the name. + */ + public String getName() { + return name; + } + + /** + * @param name The name to set. + */ + public void setName(String name) { + this.name = name; + } + + /** + * @return Returns the props. + */ + public Properties getProps() { + return props; + } + + /** + * @param props The props to set. + */ + public void setProps(Properties props) { + this.props = props; + } + + /** + * @return Returns the body. + */ + public String getBody() { + return body; + } + + /** + * @param body The body to set. + */ + public void setBody(String body) { + this.body = body; + } + + /** + * @return Returns the title. + */ + public String getTitle() { + return title; + } + + /** + * @param title The title to set. 
+ */ + public void setTitle(String title) { + this.title = title; + } + + /** + * @return Returns the date. + */ + public Date getDate() { + return date; + } + + /** + * @param date The date to set. + */ + public void setDate(Date date) { + this.date = date; + } + +} Property changes on: src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java =================================================================== --- src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java (revision 522262) +++ src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java (working copy) @@ -23,19 +23,15 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; -import java.io.Reader; -import java.io.StringReader; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.Locale; -import java.util.Properties; import java.util.zip.GZIPInputStream; import org.apache.lucene.benchmark.byTask.utils.Config; -import org.apache.lucene.demo.html.HTMLParser; /** @@ -45,7 +41,7 @@ private static final String newline = System.getProperty("line.separator"); - private DateFormat dateFormat; + private DateFormat dateFormat []; private File dataDir = null; private ArrayList inputFiles = new ArrayList(); private int nextFile = 0; @@ -53,6 +49,13 @@ private BufferedReader reader; private GZIPInputStream zis; + private static final String DATE_FORMATS [] = { + "EEE, dd MMM yyyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT + "EEE MMM dd kk:mm:ss yyyy z", //Tue Dec 09 16:45:08 2003 EST + "EEE, dd-MMM-':'y kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT + "EEE, dd-MMM-yyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT + }; + /* (non-Javadoc) * @see 
SimpleDocMaker#setConfig(java.util.Properties) */ @@ -65,34 +68,44 @@ throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath()); } // date format: 30-MAR-1987 14:22:36.87 - dateFormat = new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ",Locale.US); //Tue, 09 Dec 2003 22:39:08 GMT - dateFormat.setLenient(true); - } + dateFormat = new SimpleDateFormat[DATE_FORMATS.length]; + for (int i = 0; i < dateFormat.length; i++) { + dateFormat[i] = new SimpleDateFormat(DATE_FORMATS[i],Locale.US); + dateFormat[i].setLenient(true); + } + } - private void openNextFile() throws Exception { + private void openNextFile() throws NoMoreDataException, Exception { closeInputs(); int retries = 0; - while (retries<20) { + while (true) { File f = null; synchronized (this) { - f = (File) inputFiles.get(nextFile++); if (nextFile >= inputFiles.size()) { - // exhausted files, start a new round + // exhausted files, start a new round, unless forever set to false. + if (!forever) { + throw new NoMoreDataException(); + } nextFile = 0; iteration++; } + f = (File) inputFiles.get(nextFile++); } System.out.println("opening: "+f+" length: "+f.length()); try { zis = new GZIPInputStream(new BufferedInputStream(new FileInputStream(f))); - break; + reader = new BufferedReader(new InputStreamReader(zis)); + return; } catch (Exception e) { retries++; - System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+" #retries="+retries); - continue; + if (retries<20) { + System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+" #retries="+retries); + continue; + } else { + throw new NoMoreDataException(); + } } } - reader = new BufferedReader(new InputStreamReader(zis)); } private void closeInputs() { @@ -142,7 +155,7 @@ return sb; } - protected DocData getNextDocData() throws Exception { + protected DocData getNextDocData() throws NoMoreDataException, Exception { if (reader==null) { openNextFile(); } @@ -162,39 +175,27 @@ // 6. 
collect until end of doc sb = read("",null,false,true); // this is the next document, so parse it - // TODO use a more robust html parser (current one aborts parsing quite easily). - HTMLParser p = new HTMLParser(new StringReader(sb.toString())); - // title - String title = p.getTitle(); - // properties - Properties props = p.getMetaTags(); - // body - Reader r = p.getReader(); - char c[] = new char[1024]; - StringBuffer bodyBuf = new StringBuffer(); - int n; - while ((n = r.read(c)) >= 0) { - if (n>0) { - bodyBuf.append(c,0,n); - } - } - r.close(); - addBytes(bodyBuf.length()); + Date date = parseDate(dateStr); + HTMLParser p = getHtmlParser(); + DocData docData = p.parse(name, date, sb, dateFormat[0]); + addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text). - DocData dd = new DocData(); + return docData; + } - try { - dd.date = dateFormat.parse(dateStr.trim()); - } catch (ParseException e) { - // do not fail test just because a date could not be parsed - System.out.println("ignoring date parse exception (assigning 'now') for: "+dateStr); - dd.date = new Date(); // now + private Date parseDate(String dateStr) { + Date date = null; + for (int i=0; i= 0) { + if (n>0) { + bodyBuf.append(c,0,n); + } + } + r.close(); + if (date == null && props.getProperty("date")!=null) { + try { + date = dateFormat.parse(props.getProperty("date").trim()); + } catch (ParseException e) { + // do not fail test just because a date could not be parsed + System.out.println("ignoring date parse exception (assigning 'now') for: "+props.getProperty("date")); + date = new Date(); // now + } + } + + return new DocData(name, bodyBuf.toString(), title, props, date); + } + + public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException { + // TODO Auto-generated method stub + return parse(name, date, new StringReader(inputText.toString()), dateFormat); + } + +} Property changes 
on: src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java =================================================================== --- src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java (revision 522262) +++ src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java (working copy) @@ -26,9 +26,7 @@ import java.io.File; import java.io.UnsupportedEncodingException; import java.util.ArrayList; -import java.util.Date; import java.util.Iterator; -import java.util.Properties; /** @@ -47,15 +45,8 @@ private int numDocsCreated = 0; private boolean storeBytes = false; + protected boolean forever; - static class DocData { - String name; - Date date; - String title; - String body; - Properties props; - } - private static class LeftOver { private DocData docdata; private int cnt; @@ -80,10 +71,14 @@ /** * Return the data of the next document. + * All current implementations can create docs forever. + * When the input data is exhausted, input files are iterated. + * This re-iteration can be avoided by setting doc.maker.forever to false (default is true). * @return data of the next document. * @exception if cannot create the next doc data + * @exception NoMoreDataException if data is exhausted (and 'forever' set to false). */ - protected abstract DocData getNextDocData() throws Exception; + protected abstract DocData getNextDocData() throws NoMoreDataException, Exception; /* * (non-Javadoc) @@ -103,32 +98,32 @@ int docid = incrNumDocsCreated(); Document doc = new Document(); doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal)); - if (docData.name!=null) { - String name = (cnt<0 ? docData.name : docData.name+"_"+cnt); + if (docData.getName()!=null) { + String name = (cnt<0 ? 
docData.getName() : docData.getName()+"_"+cnt); doc.add(new Field("docname", name, storeVal, indexVal, termVecVal)); } - if (docData.date!=null) { - String dateStr = DateTools.dateToString(docData.date, DateTools.Resolution.SECOND); + if (docData.getDate()!=null) { + String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND); doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal)); } - if (docData.title!=null) { - doc.add(new Field("doctitle", docData.title, storeVal, indexVal, termVecVal)); + if (docData.getTitle()!=null) { + doc.add(new Field("doctitle", docData.getTitle(), storeVal, indexVal, termVecVal)); } - if (docData.body!=null && docData.body.length()>0) { + if (docData.getBody()!=null && docData.getBody().length()>0) { String bdy; - if (size<=0 || size>=docData.body.length()) { - bdy = docData.body; // use all - docData.body = ""; // nothing left + if (size<=0 || size>=docData.getBody().length()) { + bdy = docData.getBody(); // use all + docData.setBody(""); // nothing left } else { // attempt not to break words - if whitespace found within next 20 chars... - for (int n=size-1; n +

+External classes: It is sometimes useful to invoke the benchmark +package with your external alg file that configures the use of your own +doc/query maker and/or html parser. You can do this without +modifying the benchmark package code, by passing your class path +with the benchmark.ext.classpath property: +

    +
  • ant run-task -Dtask.alg=[full-path-to-your-alg-file] + -Dbenchmark.ext.classpath=/mydir/classes + -Dtask.mem=512M
  • +
+

+

Benchmark "algorithm"

@@ -198,6 +211,14 @@ 30 times in a row.
Example - { AddDoc AddDoc } : 30 - would do addDoc 60 times in a row. +
Exhaustive repeating: use * instead of + a number to repeat exhaustively. + This is sometimes useful for adding as many files as a doc maker can create, + without iterating over the same files again, especially when the exact + number of files is not known in advance. For instance, TREC files extracted + from a zip file. +
Example - { AddDoc } : * - would add docs + until the doc maker is "exhausted".
  • Command parameter: a command can optionally take a single parameter. @@ -487,6 +508,8 @@
  • Docs and queries creation:
    • analyzer
    • doc.maker +
    • doc.maker.forever +
    • html.parser
    • doc.stored
    • doc.tokenized
    • doc.term.vector Index: build.xml =================================================================== --- build.xml (revision 522262) +++ build.xml (working copy) @@ -97,6 +97,7 @@ +