Index: contrib/benchmark/build.xml
===================================================================
--- contrib/benchmark/build.xml (revision 765035)
+++ contrib/benchmark/build.xml (working copy)
@@ -100,23 +100,14 @@
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (revision 765035)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (working copy)
@@ -17,49 +17,73 @@
* limitations under the License.
*/
-import org.xml.sax.XMLReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.Document;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
-import java.io.IOException;
-import java.io.FileInputStream;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.benchmark.byTask.utils.Config;
-
/**
- * A LineDocMaker which reads the uncompressed english wikipedia dump.
- *
+ * A {@link LineDocMaker} which reads the uncompressed English Wikipedia dump.
* Config properties:
- * keep.image.only.docs=false|true
- *
- * Plus those available in LineDocMaker
- *
- *
+ *
+ * - keep.image.only.docs=false|true
+ *
+ * - [those available in {@link LineDocMaker}]
+ *
+ *
* @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
*/
public class EnwikiDocMaker extends LineDocMaker {
- protected boolean keepImages = true;
+
+ private static final Map ELEMENTS = new HashMap();
+
static final int TITLE = 0;
- static final int DATE = TITLE+1;
- static final int BODY = DATE+1;
+ static final int DATE = TITLE + 1;
+ static final int BODY = DATE + 1;
static final int ID = BODY + 1;
- static final int LENGTH = ID+1;
-
+ static final int LENGTH = ID + 1;
+  // LENGTH is used as the size of the tuple, so any constants that should not
+  // be part of the tuple must be defined after LENGTH.
+ static final int PAGE = LENGTH + 1;
+
static final String[] months = {"JAN", "FEB", "MAR", "APR",
"MAY", "JUN", "JUL", "AUG",
"SEP", "OCT", "NOV", "DEC"};
+ static {
+ ELEMENTS.put("page", new Integer(PAGE));
+ ELEMENTS.put("text", new Integer(BODY));
+ ELEMENTS.put("timestamp", new Integer(DATE));
+ ELEMENTS.put("title", new Integer(TITLE));
+ ELEMENTS.put("id", new Integer(ID));
+ }
+
+ /**
+   * Returns the type of the element if defined, otherwise returns -1. This
+   * method is useful in startElement and endElement, since it saves comparing
+   * the element's qualified name over and over.
+ */
+ private final static int getElementType(String elem) {
+ Integer val = (Integer) ELEMENTS.get(elem);
+ return val == null ? -1 : val.intValue();
+ }
+
+ protected boolean keepImages = true;
+
public void setConfig(Config config) {
super.setConfig(config);
keepImages = config.get("keep.image.only.docs", true);
}
class Parser extends DefaultHandler implements Runnable {
-
Thread t;
boolean threadDone;
@@ -71,7 +95,7 @@
reader.setContentHandler(this);
reader.setErrorHandler(this);
while(true){
- final FileInputStream localFileIS = fileIS;
+ final InputStream localFileIS = fileIS;
try {
InputSource is = new InputSource(localFileIS);
reader.parse(is);
@@ -133,12 +157,13 @@
t = null;
throw nmde;
}
- if (t != null && threadDone)
+ if (t != null && threadDone) {
// The thread has exited yet did not hit end of
// data, so this means it hit an exception. We
// throw NoMorDataException here to force
// benchmark to stop the current alg:
throw new NoMoreDataException();
+ }
result = tuple;
tuple = null;
notify();
@@ -157,25 +182,27 @@
String time;
String id;
-
-
public void startElement(String namespace,
String simple,
String qualified,
Attributes attributes) {
- if (qualified.equals("page")) {
- title = null;
- body = null;
- time = null;
- id = null;
- } else if (qualified.equals("text")) {
- contents.setLength(0);
- } else if (qualified.equals("timestamp")) {
- contents.setLength(0);
- } else if (qualified.equals("title")) {
- contents.setLength(0);
- } else if (qualified.equals("id")) {
- contents.setLength(0);
+ int elemType = getElementType(qualified);
+ switch (elemType) {
+ case PAGE:
+ title = null;
+ body = null;
+ time = null;
+ id = null;
+ break;
+        // BODY, DATE, TITLE and ID intentionally fall through to the same handling.
+ case BODY:
+ case DATE:
+ case TITLE:
+ case ID:
+ contents.setLength(0);
+ break;
+ default:
+ // this element should be discarded.
}
}
@@ -214,26 +241,35 @@
public void endElement(String namespace, String simple, String qualified)
throws SAXException {
- if (qualified.equals("title")) {
- title = contents.toString();
- } else if (qualified.equals("text")) {
- body = contents.toString();
- //workaround that startswith doesn't have an ignore case option, get at least 20 chars.
- String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
- if (startsWith.startsWith("#redirect")) {
- body = null;
- }
- } else if (qualified.equals("timestamp")) {
- time = time(contents.toString());
- } else if (qualified.equals("id") && id == null) {//just get the first id
- id = contents.toString();
+ int elemType = getElementType(qualified);
+ switch (elemType) {
+ case PAGE:
+          // the body must not be null, and either we keep image-only docs or
+          // the title must not start with Image:
+ if (body != null && (keepImages || !title.startsWith("Image:"))) {
+ create(title, time, body, id);
+ }
+ break;
+ case BODY:
+ body = contents.toString();
+          // startsWith() has no ignore-case option, so lower-case the first chars (at most 10) before comparing.
+ String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
+ if (startsWith.startsWith("#redirect")) {
+ body = null;
+ }
+ break;
+ case DATE:
+ time = time(contents.toString());
+ break;
+ case TITLE:
+ title = contents.toString();
+ break;
+ case ID:
+ id = contents.toString();
+ break;
+ default:
+ // this element should be discarded.
}
- else if (qualified.equals("page")) {
- //the body must be null and we either are keeping image docs or the title does not start with Image:
- if (body != null && (keepImages == true || title.startsWith("Image:") == false)) {
- create(title, time, body, id);
- }
- }
}
}
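
For reference, a self-contained sketch of the map-plus-switch SAX dispatch the reworked EnwikiDocMaker parser uses, exercised against a tiny in-memory page snippet. The class and names below are illustrative only and not part of the patch.

    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;

    import org.xml.sax.Attributes;
    import org.xml.sax.InputSource;
    import org.xml.sax.XMLReader;
    import org.xml.sax.helpers.DefaultHandler;
    import org.xml.sax.helpers.XMLReaderFactory;

    public class ElementDispatchSketch extends DefaultHandler {

      static final int TITLE = 0, DATE = 1, BODY = 2, ID = 3, PAGE = 4;

      // Map element names to int constants once, so the SAX callbacks can
      // switch on an int instead of comparing qualified names repeatedly.
      private static final Map ELEMENTS = new HashMap();
      static {
        ELEMENTS.put("page", new Integer(PAGE));
        ELEMENTS.put("text", new Integer(BODY));
        ELEMENTS.put("timestamp", new Integer(DATE));
        ELEMENTS.put("title", new Integer(TITLE));
        ELEMENTS.put("id", new Integer(ID));
      }

      private static int getElementType(String elem) {
        Integer val = (Integer) ELEMENTS.get(elem);
        return val == null ? -1 : val.intValue();
      }

      private final StringBuffer contents = new StringBuffer();

      public void startElement(String ns, String simple, String qualified, Attributes atts) {
        switch (getElementType(qualified)) {
          case PAGE:
            System.out.println("new page");
            break;
          case BODY: case DATE: case TITLE: case ID:
            contents.setLength(0);
            break;
          default:
            // any other element is discarded
        }
      }

      public void characters(char[] ch, int start, int length) {
        contents.append(ch, start, length);
      }

      public void endElement(String ns, String simple, String qualified) {
        if (getElementType(qualified) == TITLE) {
          System.out.println("title = " + contents);
        }
      }

      public static void main(String[] args) throws Exception {
        String xml = "<page><title>Lucene</title><text>some body text</text></page>";
        XMLReader reader = XMLReaderFactory.createXMLReader();
        ElementDispatchSketch handler = new ElementDispatchSketch();
        reader.setContentHandler(handler);
        reader.parse(new InputSource(new StringReader(xml)));
      }
    }
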
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (revision 765035)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (working copy)
@@ -17,38 +17,44 @@
* limitations under the License.
*/
-import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-
+import java.io.BufferedInputStream;
import java.io.BufferedReader;
+import java.io.FileInputStream;
import java.io.IOException;
-import java.io.FileInputStream;
+import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Random;
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
/**
- * A DocMaker reading one line at a time as a Document from
- * a single file. This saves IO cost (over DirDocMaker) of
- * recursing through a directory and opening a new file for
- * every document. It also re-uses its Document and Field
- * instance to improve indexing speed.
- *
+ * A DocMaker reading one line at a time as a Document from a single file. This
+ * saves IO cost (over DirDocMaker) of recursing through a directory and opening
+ * a new file for every document. It also re-uses its Document and Field
+ * instances to improve indexing speed.
+ * The expected format of each line is (arguments are separated by <TAB>):
+ * title, date, body. If a line is read in a different format, a
+ * {@link RuntimeException} will be thrown. In general, you should use this doc
+ * maker with files that were created with {@link WriteLineDocTask}.
+ *
* Config properties:
- * docs.file=<path to the file%gt;
- * doc.reuse.fields=true|false (default true)
- * doc.random.id.limit=N (default -1) -- create random
- * docid in the range 0..N; this is useful
- * with UpdateDoc to test updating random documents; if
- * this is unspecified or -1, then docid is sequentially
- * assigned
+ *
+ * - docs.file=<path to the file>
+ *
+ * - doc.reuse.fields=true|false (default true)
+ *
+ * - bzip.compression=true|false (default false)
+ *
+ * - doc.random.id.limit=N (default -1) -- create random docid in the range
+ * 0..N; this is useful with UpdateDoc to test updating random documents; if
+ * this is unspecified or -1, then docid is sequentially assigned
+ *
*/
public class LineDocMaker extends BasicDocMaker {
- FileInputStream fileIS;
+ InputStream fileIS;
BufferedReader fileIn;
ThreadLocal docState = new ThreadLocal();
private String fileName;
@@ -57,9 +63,12 @@
private final DocState localDocState = new DocState();
private boolean doReuseFields = true;
+ private boolean bzipCompressionEnabled = false;
private Random r;
private int numDocs;
+ private CompressorStreamFactory csFactory = new CompressorStreamFactory();
+
class DocState {
Document doc;
Field bodyField;
@@ -93,7 +102,7 @@
doc.add(idField);
}
- final static String SEP = WriteLineDocTask.SEP;
+ final static char SEP = WriteLineDocTask.SEP;
private int numDocsCreated;
private synchronized int incrNumDocsCreated() {
@@ -101,27 +110,20 @@
}
public Document setFields(String line) {
+    // A line must be in the following format; if it's not, fail.
// title date body
- final String title, date, body;
-
int spot = line.indexOf(SEP);
- if (spot != -1) {
- title = line.substring(0, spot);
- int spot2 = line.indexOf(SEP, 1+spot);
- if (spot2 != -1) {
- date = line.substring(1+spot, spot2);
- body = line.substring(1+spot2, line.length());
- } else
- date = body = "";
- } else
- title = date = body = "";
-
- final String docID;
- if (r != null) {
- docID = "doc" + r.nextInt(numDocs);
- } else {
- docID = "doc" + incrNumDocsCreated();
+ if (spot == -1) {
+ throw new RuntimeException("line: [" + line + "] is in an invalid format !");
}
+ int spot2 = line.indexOf(SEP, 1 + spot);
+ if (spot2 == -1) {
+ throw new RuntimeException("line: [" + line + "] is in an invalid format !");
+ }
+ final String title = line.substring(0, spot);
+ final String date = line.substring(1+spot, spot2);
+ final String body = line.substring(1+spot2, line.length());
+ final String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated());
if (doReuseFields) {
idField.setValue(docID);
@@ -130,7 +132,10 @@
bodyField.setValue(body);
return doc;
} else {
- Field localIDField = new Field(BasicDocMaker.ID_FIELD, docID, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
+ Field localIDField = new Field(BasicDocMaker.ID_FIELD,
+ docID,
+ Field.Store.YES,
+ Field.Index.NOT_ANALYZED_NO_NORMS);
Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD,
title,
@@ -174,16 +179,14 @@
String line;
synchronized(this) {
- while(true) {
- line = fileIn.readLine();
- if (line == null) {
- // Reset the file
- openFile();
- if (!forever)
- throw new NoMoreDataException();
- } else {
- break;
+ line = fileIn.readLine();
+ if (line == null) {
+ if (!forever) {
+ throw new NoMoreDataException();
}
+ // Reset the file
+ openFile();
+ return makeDocument();
}
}
@@ -199,15 +202,24 @@
public synchronized void resetInputs() {
super.resetInputs();
- fileName = config.get("docs.file", null);
- if (fileName == null)
- throw new RuntimeException("docs.file must be set");
openFile();
}
public void setConfig(Config config) {
super.setConfig(config);
+ fileName = config.get("docs.file", null);
+ if (fileName == null) {
+ throw new IllegalArgumentException("docs.file must be set");
+ }
doReuseFields = config.get("doc.reuse.fields", true);
+ String doBZCompress = config.get("bzip.compression", null);
+ if (doBZCompress != null) {
+ // Property was set, use the value.
+ bzipCompressionEnabled = Boolean.valueOf(doBZCompress).booleanValue();
+ } else {
+ // Property was not set, attempt to detect based on file's extension
+ bzipCompressionEnabled = fileName.endsWith("bz2");
+ }
numDocs = config.get("doc.random.id.limit", -1);
if (numDocs != -1) {
r = new Random(179);
@@ -216,16 +228,35 @@
synchronized void openFile() {
try {
- if (fileIn != null)
+ if (fileIn != null) {
fileIn.close();
+ }
fileIS = new FileInputStream(fileName);
- fileIn = new BufferedReader(new InputStreamReader(fileIS,"UTF-8"), READER_BUFFER_BYTES);
+ if (bzipCompressionEnabled) {
+        // BZip2CompressorInputStream reads from the underlying stream using
+        // single-byte read() calls (starting with the two file header chars
+        // 'B' and 'Z'), so we only need to wrap the underlying stream with a
+        // BufferedInputStream to avoid per-byte file I/O.
+ fileIS = new BufferedInputStream(fileIS, READER_BUFFER_BYTES);
+ fileIS = csFactory.createCompressorInputStream("bzip2", fileIS);
+ }
+      // Wrap the stream with a BufferedReader for two reasons:
+      // 1. We need the readLine() method.
+      // 2. Even when bzip.compression is enabled and the stream is already
+      // wrapped with a BufferedInputStream, the extra buffer still improves
+      // performance: the BIS buffer is used to read bytes from the compressed
+      // stream, while the BR buffer holds the characters read from the
+      // decompressed stream.
+ fileIn = new BufferedReader(new InputStreamReader(fileIS, "UTF-8"), READER_BUFFER_BYTES);
} catch (IOException e) {
throw new RuntimeException(e);
+ } catch (CompressorException e) {
+ throw new RuntimeException(e);
}
}
public int numUniqueTexts() {
return -1;
}
+
}
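
For reference, a minimal, self-contained sketch of the stream wrapping openFile() now performs, assuming commons-compress is on the classpath; the class name and file path used below are illustrative, not part of the patch.

    import java.io.BufferedInputStream;
    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.InputStream;
    import java.io.InputStreamReader;

    import org.apache.commons.compress.compressors.CompressorStreamFactory;

    public class OpenLineFileSketch {
      public static void main(String[] args) throws Exception {
        String fileName = args.length > 0 ? args[0] : "work/enwiki.txt.bz2"; // illustrative path
        InputStream in = new FileInputStream(fileName);
        if (fileName.endsWith("bz2")) { // same extension-based detection as the patch
          // Buffer the compressed bytes: the bzip2 decompressor reads the
          // underlying stream with single-byte read() calls.
          in = new BufferedInputStream(in, 1 << 16);
          in = new CompressorStreamFactory().createCompressorInputStream("bzip2", in);
        }
        // BufferedReader supplies readLine() and buffers the decompressed characters.
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"), 1 << 16);
        String line = reader.readLine(); // expected: title <TAB> date <TAB> body
        System.out.println(line);
        reader.close();
      }
    }
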
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (revision 765035)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (working copy)
@@ -17,18 +17,39 @@
* limitations under the License.
*/
+import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
+import java.io.OutputStream;
import java.io.OutputStreamWriter;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
-import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-
+/**
+ * A task which writes documents, one line per document. Each line is in the
+ * following format: title <TAB> date <TAB> body. The output of this
+ * task can be consumed by
+ * {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended
+ * to save the IO overhead of opening a file per document to be indexed.
+ *
+ * Supports the following parameters:
+ *
+ * - line.file.out - the name of the file to write the output to. That
+ * parameter is mandatory. NOTE: the file is re-created.
+ *
+ * - bzip.compression - whether the output should be bzip-compressed. This is
+ * recommended when the output file is expected to be large. (optional, default:
+ * false).
+ *
+ * - doc.writeline.log.step - controls how many records to process before
+ * logging the status of the task. NOTE: to disable logging, set this
+ * value to 0 or negative. (optional, default:1000).
+ *
+ */
public class WriteLineDocTask extends PerfTask {
/**
@@ -36,33 +57,48 @@
* an "added N docs" message should be logged.
*/
public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000;
+ public final static char SEP = '\t';
- public WriteLineDocTask(PerfRunData runData) {
- super(runData);
- }
-
private int logStep = -1;
private int docSize = 0;
int count = 0;
- private BufferedWriter lineFileOut=null;
+ private BufferedWriter lineFileOut = null;
private DocMaker docMaker;
- public final static String SEP = "\t";
-
- /*
- * (non-Javadoc)
- * @see PerfTask#setup()
- */
- public void setup() throws Exception {
- super.setup();
- if (lineFileOut==null) {
- Config config = getRunData().getConfig();
- String fileName = config.get("line.file.out", null);
- if (fileName == null)
- throw new Exception("line.file.out must be set");
- lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8"));
+ public WriteLineDocTask(PerfRunData runData) throws Exception {
+ super(runData);
+ Config config = runData.getConfig();
+ String fileName = config.get("line.file.out", null);
+ if (fileName == null) {
+ throw new IllegalArgumentException("line.file.out must be set");
}
- docMaker = getRunData().getDocMaker();
+
+ OutputStream out = new FileOutputStream(fileName);
+ boolean doBzipCompression = false;
+ String doBZCompress = config.get("bzip.compression", null);
+ if (doBZCompress != null) {
+ // Property was set, use the value.
+ doBzipCompression = Boolean.valueOf(doBZCompress).booleanValue();
+ } else {
+ // Property was not set, attempt to detect based on file's extension
+ doBzipCompression = fileName.endsWith("bz2");
+ }
+
+ if (doBzipCompression) {
+      // Wrap with a BufferedOutputStream since BZip2CompressorOutputStream
+      // calls out.write(int) and does not use the write(byte[]) version. This
+      // proved to speed up the compression process by 70%!
+ out = new BufferedOutputStream(out, 1 << 16);
+ out = new CompressorStreamFactory().createCompressorOutputStream("bzip2", out);
+ }
+ lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16);
+ docMaker = runData.getDocMaker();
+ logStep = config.get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
+    // A non-positive value effectively turns logging off: setting logStep to
+    // Integer.MAX_VALUE avoids an 'if (logStep > 0)' check in log().
+ if (logStep <= 0) {
+ logStep = Integer.MAX_VALUE;
+ }
}
public void tearDown() throws Exception {
@@ -71,61 +107,52 @@
}
public int doLogic() throws Exception {
- Document doc;
- if (docSize > 0) {
- doc = docMaker.makeDocument(docSize);
- } else {
- doc = docMaker.makeDocument();
- }
+ Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();
Field f = doc.getField(BasicDocMaker.BODY_FIELD);
-
- String body, title, date;
- if (f != null)
- body = f.stringValue().replace('\t', ' ');
- else
- body = null;
+ String body = f != null ? f.stringValue().replace('\t', ' ') : null;
- f = doc.getField(BasicDocMaker.TITLE_FIELD);
- if (f != null)
- title = f.stringValue().replace('\t', ' ');
- else
- title = "";
-
- f = doc.getField(BasicDocMaker.DATE_FIELD);
- if (f != null)
- date = f.stringValue().replace('\t', ' ');
- else
- date = "";
-
if (body != null) {
+ f = doc.getField(BasicDocMaker.TITLE_FIELD);
+ String title = f != null ? f.stringValue().replace('\t', ' ') : "";
+
+ f = doc.getField(BasicDocMaker.DATE_FIELD);
+ String date = f != null ? f.stringValue().replace('\t', ' ') : "";
+
lineFileOut.write(title, 0, title.length());
lineFileOut.write(SEP);
lineFileOut.write(date, 0, date.length());
lineFileOut.write(SEP);
lineFileOut.write(body, 0, body.length());
lineFileOut.newLine();
- lineFileOut.flush();
}
return 1;
}
- private void log (int count) {
- if (logStep<0) {
- // init once per instance
- logStep = getRunData().getConfig().get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
+ private void log(int count) {
+ // logStep is initialized in the ctor to a positive value. If the config
+    // file indicates no logging, or contains an invalid value, logStep is set
+ // to Integer.MAX_VALUE, so that logging will not occur (at least for the
+ // first Integer.MAX_VALUE records).
+ if (count % logStep == 0) {
+ System.out.println("--> " + Thread.currentThread().getName()
+ + " processed (write line) " + count + " docs");
}
- if (logStep>0 && (count%logStep)==0) {
- System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs");
- }
}
+ public void close() throws Exception {
+ lineFileOut.close();
+ super.close();
+ }
+
/**
* Set the params (docSize only)
* @param params docSize, or 0 for no limit.
*/
public void setParams(String params) {
- super.setParams(params);
+ if (super.supportsParams()) {
+ super.setParams(params);
+ }
docSize = (int) Float.parseFloat(params);
}
@@ -135,4 +162,5 @@
public boolean supportsParams() {
return true;
}
+
}
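
Similarly, a small sketch of the output-side wrapping that WriteLineDocTask's constructor sets up, writing one document in the title <TAB> date <TAB> body format; the class name, file name and field values below are illustrative only.

    import java.io.BufferedOutputStream;
    import java.io.BufferedWriter;
    import java.io.FileOutputStream;
    import java.io.OutputStream;
    import java.io.OutputStreamWriter;

    import org.apache.commons.compress.compressors.CompressorStreamFactory;

    public class WriteLineSketch {
      public static void main(String[] args) throws Exception {
        String fileName = "work/one-line.bz2"; // illustrative path
        OutputStream out = new FileOutputStream(fileName);
        if (fileName.endsWith("bz2")) { // same extension-based detection as the task
          // Buffer first: BZip2CompressorOutputStream writes one byte at a time.
          out = new BufferedOutputStream(out, 1 << 16);
          out = new CompressorStreamFactory().createCompressorOutputStream("bzip2", out);
        }
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16);
        // One document per line; tabs inside field values must be replaced beforehand.
        writer.write("title" + '\t' + "date" + '\t' + "body text");
        writer.newLine();
        writer.close(); // flushes and finalizes the bzip2 stream
      }
    }
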
Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/BenchmarkTestCase.java
===================================================================
--- contrib/benchmark/src/test/org/apache/lucene/benchmark/BenchmarkTestCase.java (revision 0)
+++ contrib/benchmark/src/test/org/apache/lucene/benchmark/BenchmarkTestCase.java (revision 0)
@@ -0,0 +1,38 @@
+package org.apache.lucene.benchmark;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+
+import junit.framework.TestCase;
+
+/** Base class for all Benchmark unit tests. */
+public class BenchmarkTestCase extends TestCase {
+
+ private static final File workDir;
+
+ static {
+ workDir = new File(System.getProperty("benchmark.work.dir", "test/benchmark")).getAbsoluteFile();
+ workDir.mkdirs();
+ }
+
+ public File getWorkDir() {
+ return workDir;
+ }
+
+}
Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksParse.java
===================================================================
--- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksParse.java (revision 765035)
+++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksParse.java (working copy)
@@ -17,188 +17,33 @@
package org.apache.lucene.benchmark.byTask;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Iterator;
+
import junit.framework.TestCase;
+
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
import org.apache.lucene.benchmark.byTask.utils.Algorithm;
-import java.io.File;
-import java.io.StringReader;
-import java.lang.reflect.Modifier;
-import java.util.ArrayList;
-import java.util.Iterator;
-
-/**
- * Test very simply that perf tasks are parses as expected.
- */
+/** Simple test that perf tasks are parsed as expected. */
public class TestPerfTasksParse extends TestCase {
- private static final boolean DEBUG = false;
static final String NEW_LINE = System.getProperty("line.separator");
static final String INDENT = " ";
// properties in effect in all tests here
static final String propPart =
- INDENT+"directory=RAMDirectory" + NEW_LINE +
- INDENT+"print.props=false" + NEW_LINE
+ INDENT + "directory=RAMDirectory" + NEW_LINE +
+ INDENT + "print.props=false" + NEW_LINE
;
- /*
- * All known tasks.
- * As new tasks are added, add them here.
- * It would be nice to do that automatically, unfortunately
- * Java does not provide a "get all classes in package" or
- * "get all sub-classes" functionality.
- */
- static String singleTaskAlgs [];
-
- /* (non-Javadoc)
- * @see junit.framework.TestCase#setUp()
- */
- protected void setUp() throws Exception {
- super.setUp();
- if (singleTaskAlgs==null) {
- singleTaskAlgs = findTasks();
- }
- }
-
- // one time initialization
- static String [] findTasks () throws Exception {
- ArrayList tsks = new ArrayList();
- // init with tasks we know about
- tsks.add( " AddDoc " );
- tsks.add( " AddDoc(1000.0) " );
- tsks.add( " ClearStats " );
- tsks.add( " CloseIndex " );
- tsks.add( " CloseReader " );
- tsks.add( " CreateIndex " );
- tsks.add( " DeleteDoc " );
- tsks.add( " DeleteDoc(500.0) " );
- tsks.add( " NewRound " );
- tsks.add( " OpenIndex " );
- tsks.add( " OpenReader " );
- tsks.add( " Optimize " );
- tsks.add( " RepAll " );
- tsks.add( " RepSelectByPref prefix " );
- tsks.add( " RepSumByNameRound " );
- tsks.add( " RepSumByName " );
- tsks.add( " RepSumByPrefRound prefix " );
- tsks.add( " RepSumByPref prefix " );
- tsks.add( " ResetInputs " );
- tsks.add( " ResetSystemErase " );
- tsks.add( " ResetSystemSoft " );
- tsks.add( " Search " );
- tsks.add( " SearchTravRet " );
- tsks.add( " SearchTravRet(100.0) " );
- tsks.add( " SearchTrav " );
- tsks.add( " SearchTrav(50.0) " );
- tsks.add( " SetProp " );
- tsks.add( " SetProp(name,value) " );
- tsks.add( " Warm " );
- tsks.add( "SearchTravRetLoadFieldSelector");
- tsks.add("SearchTravRetLoadFieldSelector(body,title)");
-
- // if tasks.dir property is defined, look for additional tasks.
- // this somewhat covers tasks that would be added in the future, in case
- // the list above is not updated to cover them.
- // some tasks would be tested more than once this way, but that's ok.
- String tasksDir = System.getProperty("tasks.dir");
- if (tasksDir !=null) {
- String pkgPrefix = PerfTask.class.getPackage().getName()+".";
- String taskNames[] = new File(tasksDir).list();
- for (int i = 0; i < taskNames.length; i++) {
- String name = taskNames[i].trim();
- if (!name.endsWith("Task.class"))
- continue; // Task class file only
- name = name.substring(0,name.length()-6);
- Class cls = Class.forName(pkgPrefix+name);
- if (Modifier.isAbstract(cls.getModifiers()) || Modifier.isInterface(cls.getModifiers()))
- continue; // skip sbstract classes
- if (!PerfTask.class.isAssignableFrom(cls))
- continue; // not a task
- name = name.substring(0,name.length()-4);
- if (name.startsWith("Rep") && name.indexOf("Pref")>=0)
- name += " prefix";
- tsks.add(" "+name+" ");
- }
- }
- return (String[]) tsks.toArray(new String[0]);
- }
-
-
- /**
- * @param name test name
- */
public TestPerfTasksParse(String name) {
super(name);
}
- /**
- * Test the parsing of very simple tasks, for all tasks
- */
- public void testAllTasksSimpleParse() {
- doTestAllTasksSimpleParse(false,false);
- }
-
- /**
- * Test the parsing of simple sequential sequences, for all tasks
- */
- public void testAllTasksSimpleParseSequntial() {
- doTestAllTasksSimpleParse(true,false);
- }
-
- /**
- * Test the parsing of simple parallel sequences, for all tasks
- */
- public void testAllTasksSimpleParseParallel() {
- doTestAllTasksSimpleParse(true,true);
- }
-
- // utility for simple parsing testing of all tasks.
- private void doTestAllTasksSimpleParse(boolean parOrSeq, boolean par) {
- for (int i = 0; i < singleTaskAlgs.length; i++) {
- String testedTask = singleTaskAlgs[i];
- if (parOrSeq) {
- if (par) {
- testedTask = "[ " + testedTask + " ] : 2";
- } else {
- testedTask = "{ " + testedTask + " } : 3";
- }
- }
- try {
- String algText = propPart+INDENT+testedTask;
- logTstParsing(algText);
- Benchmark benchmark = new Benchmark(new StringReader(algText));
- Algorithm alg = benchmark.getAlgorithm();
- ArrayList algTasks = alg.extractTasks();
- // must find a task with this name in the algorithm
- boolean foundName = false;
- boolean foundPar = false;
- String theTask = singleTaskAlgs[i].replaceAll(" +"," ").trim();
- for (Iterator iter = algTasks.iterator(); iter.hasNext();) {
- PerfTask task = (PerfTask) iter.next();
- foundName |= (task.toString().indexOf(theTask)>=0);
- foundPar |= (task instanceof TaskSequence && ((TaskSequence)task).isParallel());
- }
- assertTrue("Task "+testedTask+" was not found in "+alg.toString(),foundName);
- if (parOrSeq) {
- if (par) {
- assertTrue("Task "+testedTask+" was supposed to be parallel in "+alg.toString(),foundPar);
- } else {
- assertFalse("Task "+testedTask+" was not supposed to be parallel in "+alg.toString(),foundPar);
- }
- }
- } catch (Exception e) {
- System.out.flush();
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Test the repetiotion parsing for parallel tasks
- */
+  /** Test the repetition parsing for parallel tasks */
public void testParseParallelTaskSequenceRepetition() throws Exception {
String taskStr = "AddDoc";
String parsedTasks = "[ "+taskStr+" ] : 1000";
@@ -219,9 +64,7 @@
}
}
- /**
- * Test the repetiotion parsing for sequential tasks
- */
+  /** Test the repetition parsing for sequential tasks */
public void testParseTaskSequenceRepetition() throws Exception {
String taskStr = "AddDoc";
String parsedTasks = "{ "+taskStr+" } : 1000";
@@ -242,11 +85,4 @@
}
}
- private void logTstParsing (String txt) {
- if (!DEBUG)
- return;
- System.out.println("Test parsing of");
- System.out.println(txt);
- }
-
}
Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java
===================================================================
--- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java (revision 0)
+++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java (revision 0)
@@ -0,0 +1,169 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.util.Properties;
+
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.benchmark.BenchmarkTestCase;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.tasks.AddDocTask;
+import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask;
+import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask;
+import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
+import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+
+/** Tests the functionality of {@link LineDocMaker}. */
+public class LineDocMakerTest extends BenchmarkTestCase {
+
+ private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
+
+ private void createBZ2LineFile(File file) throws Exception {
+ OutputStream out = new FileOutputStream(file);
+ out = csFactory.createCompressorOutputStream("bzip2", out);
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
+ StringBuffer doc = new StringBuffer();
+ doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
+ writer.write(doc.toString());
+ writer.newLine();
+ writer.close();
+ }
+
+ private void createRegularLineFile(File file) throws Exception {
+ OutputStream out = new FileOutputStream(file);
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
+ StringBuffer doc = new StringBuffer();
+ doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
+ writer.write(doc.toString());
+ writer.newLine();
+ writer.close();
+ }
+
+ private void doIndexAndSearchTest(File file, boolean setBZCompress,
+ String bz2CompressVal) throws Exception {
+
+ Properties props = new Properties();
+
+ // LineDocMaker specific settings.
+ props.setProperty("docs.file", file.getAbsolutePath());
+ if (setBZCompress) {
+ props.setProperty("bzip.compression", bz2CompressVal);
+ }
+
+ // Indexing configuration.
+ props.setProperty("analyzer", SimpleAnalyzer.class.getName());
+ props.setProperty("doc.maker", LineDocMaker.class.getName());
+ props.setProperty("directory", "RAMDirectory");
+
+ // Create PerfRunData
+ Config config = new Config(props);
+ PerfRunData runData = new PerfRunData(config);
+
+ TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false);
+ tasks.addTask(new CreateIndexTask(runData));
+ tasks.addTask(new AddDocTask(runData));
+ tasks.addTask(new CloseIndexTask(runData));
+ tasks.doLogic();
+
+ IndexSearcher searcher = new IndexSearcher(runData.getDirectory());
+ TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
+ assertEquals(1, td.totalHits);
+ assertNotNull(td.scoreDocs[0]);
+ searcher.close();
+ }
+
+ /* Tests LineDocMaker with a bzip2 input stream. */
+ public void testBZip2() throws Exception {
+ File file = new File(getWorkDir(), "one-line.bz2");
+ createBZ2LineFile(file);
+ doIndexAndSearchTest(file, true, "true");
+ }
+
+ public void testBZip2AutoDetect() throws Exception {
+ File file = new File(getWorkDir(), "one-line.bz2");
+ createBZ2LineFile(file);
+ doIndexAndSearchTest(file, false, null);
+ }
+
+ public void testBZip2WithBzipCompressionDisabled() throws Exception {
+ File file = new File(getWorkDir(), "one-line.bz2");
+ createBZ2LineFile(file);
+
+ try {
+ doIndexAndSearchTest(file, true, "false");
+ fail("Some exception should have been thrown !");
+ } catch (Exception e) {
+ // expected.
+ }
+ }
+
+ public void testRegularFile() throws Exception {
+ File file = new File(getWorkDir(), "one-line");
+ createRegularLineFile(file);
+ doIndexAndSearchTest(file, false, null);
+ }
+
+ public void testRegularFileWithBZipCompressionEnabled() throws Exception {
+ File file = new File(getWorkDir(), "one-line");
+ createRegularLineFile(file);
+
+ try {
+ doIndexAndSearchTest(file, true, "true");
+ fail("Some exception should have been thrown !");
+ } catch (Exception e) {
+ // expected.
+ }
+ }
+
+ public void testInvalidFormat() throws Exception {
+ String[] testCases = new String[] {
+ "", // empty line
+ "title", // just title
+ "title" + WriteLineDocTask.SEP, // title + SEP
+ "title" + WriteLineDocTask.SEP + "body", // title + SEP + body
+ // note that title + SEP + body + SEP is a valid line, which results in an
+ // empty body
+ };
+
+ for (int i = 0; i < testCases.length; i++) {
+ File file = new File(getWorkDir(), "one-line");
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8"));
+ writer.write(testCases[i]);
+ writer.newLine();
+ writer.close();
+ try {
+ doIndexAndSearchTest(file, false, null);
+ fail("Some exception should have been thrown for: [" + testCases[i] + "]");
+ } catch (Exception e) {
+ // expected.
+ }
+ }
+ }
+
+}
Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
===================================================================
--- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java (revision 0)
+++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java (revision 0)
@@ -0,0 +1,134 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Properties;
+
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.lucene.benchmark.BenchmarkTestCase;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
+import org.apache.lucene.benchmark.byTask.feeds.DocData;
+import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+
+/** Tests the functionality of {@link WriteLineDocTask}. */
+public class WriteLineDocTaskTest extends BenchmarkTestCase {
+
+ // class has to be public so that Class.forName.newInstance() will work
+ public static final class WriteLineDocMaker extends BasicDocMaker {
+
+ protected DocData getNextDocData() throws NoMoreDataException, Exception {
+ throw new UnsupportedOperationException("not implemented");
+ }
+
+ public Document makeDocument() throws Exception {
+ Document doc = new Document();
+ doc.add(new Field(BODY_FIELD, "body", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+ doc.add(new Field(TITLE_FIELD, "title", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+ doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+ return doc;
+ }
+
+ public int numUniqueTexts() {
+ return 0;
+ }
+
+ }
+
+ private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
+
+ private PerfRunData createPerfRunData(File file, boolean setBZCompress, String bz2CompressVal) throws Exception {
+ Properties props = new Properties();
+ props.setProperty("doc.maker", WriteLineDocMaker.class.getName());
+ props.setProperty("line.file.out", file.getAbsolutePath());
+ if (setBZCompress) {
+ props.setProperty("bzip.compression", bz2CompressVal);
+ }
+ props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
+ Config config = new Config(props);
+ return new PerfRunData(config);
+ }
+
+ private void doReadTest(File file, boolean bz2File) throws Exception {
+ InputStream in = new FileInputStream(file);
+ if (bz2File) {
+ in = csFactory.createCompressorInputStream("bzip2", in);
+ }
+ BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
+ try {
+ String line = br.readLine();
+ assertNotNull(line);
+ String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
+ assertEquals(3, parts.length);
+ assertEquals("title", parts[0]);
+ assertEquals("date", parts[1]);
+ assertEquals("body", parts[2]);
+ assertNull(br.readLine());
+ } finally {
+ br.close();
+ }
+ }
+
+ /* Tests WriteLineDocTask with a bzip2 format. */
+ public void testBZip2() throws Exception {
+
+ // Create a document in bz2 format.
+ File file = new File(getWorkDir(), "one-line.bz2");
+ PerfRunData runData = createPerfRunData(file, true, "true");
+ WriteLineDocTask wldt = new WriteLineDocTask(runData);
+ wldt.doLogic();
+ wldt.close();
+
+ doReadTest(file, true);
+ }
+
+ public void testBZip2AutoDetect() throws Exception {
+
+ // Create a document in bz2 format.
+ File file = new File(getWorkDir(), "one-line.bz2");
+ PerfRunData runData = createPerfRunData(file, false, null);
+ WriteLineDocTask wldt = new WriteLineDocTask(runData);
+ wldt.doLogic();
+ wldt.close();
+
+ doReadTest(file, true);
+ }
+
+ public void testRegularFile() throws Exception {
+
+ // Create a document in regular format.
+ File file = new File(getWorkDir(), "one-line");
+ PerfRunData runData = createPerfRunData(file, true, "false");
+ WriteLineDocTask wldt = new WriteLineDocTask(runData);
+ wldt.doLogic();
+ wldt.close();
+
+ doReadTest(file, false);
+ }
+
+}