Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (revision 764188)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (working copy)
@@ -17,28 +17,26 @@
* limitations under the License.
*/
-import org.xml.sax.XMLReader;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.Document;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
-import java.io.IOException;
-import java.io.FileInputStream;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.benchmark.byTask.utils.Config;
-
/**
- * A LineDocMaker which reads the uncompressed english wikipedia dump.
- *
+ * A {@link LineDocMaker} which reads the uncompressed english wikipedia dump.
* Config properties:
- * keep.image.only.docs=false|true
- *
- * Plus those available in LineDocMaker
- *
- *
+ *
+ * - keep.image.only.docs=false|true
+ *
- [those available in {@link LineDocMaker}]
+ *
+ *
* @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
*/
public class EnwikiDocMaker extends LineDocMaker {
@@ -71,7 +69,7 @@
reader.setContentHandler(this);
reader.setErrorHandler(this);
while(true){
- final FileInputStream localFileIS = fileIS;
+ final InputStream localFileIS = fileIS;
try {
InputSource is = new InputSource(localFileIS);
reader.parse(is);
@@ -157,8 +155,6 @@
String time;
String id;
-
-
public void startElement(String namespace,
String simple,
String qualified,
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (revision 764188)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (working copy)
@@ -17,38 +17,39 @@
* limitations under the License.
*/
-import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-
+import java.io.BufferedInputStream;
import java.io.BufferedReader;
+import java.io.FileInputStream;
import java.io.IOException;
-import java.io.FileInputStream;
+import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Random;
+import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.tools.bzip2.CBZip2InputStream;
/**
- * A DocMaker reading one line at a time as a Document from
- * a single file. This saves IO cost (over DirDocMaker) of
- * recursing through a directory and opening a new file for
- * every document. It also re-uses its Document and Field
- * instance to improve indexing speed.
- *
+ * A DocMaker reading one line at a time as a Document from a single file. This
+ * saves IO cost (over DirDocMaker) of recursing through a directory and opening
+ * a new file for every document. It also re-uses its Document and Field
+ * instance to improve indexing speed.
+ *
* Config properties:
- * docs.file=<path to the file%gt;
- * doc.reuse.fields=true|false (default true)
- * doc.random.id.limit=N (default -1) -- create random
- * docid in the range 0..N; this is useful
- * with UpdateDoc to test updating random documents; if
- * this is unspecified or -1, then docid is sequentially
- * assigned
+ *
+ * - docs.file=<path to the file%gt;
+ *
- doc.reuse.fields=true|false (default true)
+ *
- bzip.compression=true|false (default false)
+ *
- doc.random.id.limit=N (default -1) -- create random docid in the range
+ * 0..N; this is useful with UpdateDoc to test updating random documents; if
+ * this is unspecified or -1, then docid is sequentially assigned
+ *
*/
public class LineDocMaker extends BasicDocMaker {
- FileInputStream fileIS;
+ InputStream fileIS;
BufferedReader fileIn;
ThreadLocal docState = new ThreadLocal();
private String fileName;
@@ -57,6 +58,7 @@
private final DocState localDocState = new DocState();
private boolean doReuseFields = true;
+ private boolean doBzipCompression = false;
private Random r;
private int numDocs;
@@ -93,7 +95,7 @@
doc.add(idField);
}
- final static String SEP = WriteLineDocTask.SEP;
+ final static char SEP = WriteLineDocTask.SEP;
private int numDocsCreated;
private synchronized int incrNumDocsCreated() {
@@ -130,7 +132,10 @@
bodyField.setValue(body);
return doc;
} else {
- Field localIDField = new Field(BasicDocMaker.ID_FIELD, docID, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
+ Field localIDField = new Field(BasicDocMaker.ID_FIELD,
+ docID,
+ Field.Store.YES,
+ Field.Index.NOT_ANALYZED_NO_NORMS);
Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD,
title,
@@ -174,16 +179,14 @@
String line;
synchronized(this) {
- while(true) {
- line = fileIn.readLine();
- if (line == null) {
- // Reset the file
- openFile();
- if (!forever)
- throw new NoMoreDataException();
- } else {
- break;
+ line = fileIn.readLine();
+ if (line == null) {
+ if (!forever) {
+ throw new NoMoreDataException();
}
+ // Reset the file
+ openFile();
+ return makeDocument();
}
}
@@ -199,15 +202,17 @@
public synchronized void resetInputs() {
super.resetInputs();
- fileName = config.get("docs.file", null);
- if (fileName == null)
- throw new RuntimeException("docs.file must be set");
openFile();
}
public void setConfig(Config config) {
super.setConfig(config);
+ fileName = config.get("docs.file", null);
+ if (fileName == null) {
+ throw new IllegalArgumentException("docs.file must be set");
+ }
doReuseFields = config.get("doc.reuse.fields", true);
+ doBzipCompression = config.get("bzip.compression", false);
numDocs = config.get("doc.random.id.limit", -1);
if (numDocs != -1) {
r = new Random(179);
@@ -216,9 +221,26 @@
synchronized void openFile() {
try {
- if (fileIn != null)
+ if (fileIn != null) {
fileIn.close();
+ }
fileIS = new FileInputStream(fileName);
+ if (doBzipCompression) {
+ // According to CBZip2InputStream's documentation, we should first
+ // consume the first two file header chars ('B' and 'Z'), as well as
+ // wrap the unerlying stream with a BufferedInputStream, since CBZip2IS
+ // uses the read() method exclusively.
+ fileIS = new BufferedInputStream(fileIS, READER_BUFFER_BYTES);
+ fileIS.read(); fileIS.read();
+ fileIS = new CBZip2InputStream(fileIS);
+ }
+ // Wrap the stream with a BufferedReader for several reasons:
+ // 1. We need the readLine() method.
+ // 2. Even if bzip.compression is enabled, and is wrapped with
+ // BufferedInputStream, wrapping with a buffer can still improve
+ // performance, since the BIS buffer will be used to read from the
+ // compressed stream, while the BR buffer will be used to read from the
+ // uncompressed stream.
fileIn = new BufferedReader(new InputStreamReader(fileIS,"UTF-8"), READER_BUFFER_BYTES);
} catch (IOException e) {
throw new RuntimeException(e);
@@ -228,4 +250,5 @@
public int numUniqueTexts() {
return -1;
}
+
}
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (revision 764188)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (working copy)
@@ -19,16 +19,36 @@
import java.io.BufferedWriter;
import java.io.FileOutputStream;
+import java.io.OutputStream;
import java.io.OutputStreamWriter;
import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
-import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.tools.bzip2.CBZip2OutputStream;
-
+/**
+ * A task which writes documents, one line per document. Each line is in the
+ * following format: title <TAB> date <TAB>. The output of this
+ * taske can be consumed by
+ * {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended
+ * to save the IO overhead of opening a file per doument to be indexed.
+ *
+ * Supports the following parameters:
+ *
+ * - line.file.out - the name of the file to write the output to. That
+ * parameter is mandatory. NOTE: the file is re-created.
+ *
- bzip.compression - whether the output should be bzip-compressed. This is
+ * recommended when the output file is expected to be large. (optional, default:
+ * false).
+ *
- doc.writeline.log.step - controls how many records to process before
+ * logging the status of the task. NOTE: to disable logging, set this
+ * value to 0 or negative. (optional, default:1000).
+ *
+ */
public class WriteLineDocTask extends PerfTask {
/**
@@ -36,33 +56,35 @@
* an "added N docs" message should be logged.
*/
public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000;
+ public final static char SEP = '\t';
- public WriteLineDocTask(PerfRunData runData) {
- super(runData);
- }
-
private int logStep = -1;
private int docSize = 0;
int count = 0;
private BufferedWriter lineFileOut=null;
private DocMaker docMaker;
- public final static String SEP = "\t";
- /*
- * (non-Javadoc)
- * @see PerfTask#setup()
- */
- public void setup() throws Exception {
- super.setup();
- if (lineFileOut==null) {
- Config config = getRunData().getConfig();
- String fileName = config.get("line.file.out", null);
- if (fileName == null)
- throw new Exception("line.file.out must be set");
- lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8"));
+ public WriteLineDocTask(PerfRunData runData) throws Exception {
+ super(runData);
+ Config config = runData.getConfig();
+ String fileName = config.get("line.file.out", null);
+ if (fileName == null) {
+ throw new IllegalArgumentException("line.file.out must be set");
}
- docMaker = getRunData().getDocMaker();
+ OutputStream out = new FileOutputStream(fileName);
+ if (config.get("bzip.compression", false)) {
+ // Write the file header chars. This must happen before CBZip2OS is instantiated.
+ out.write('B'); out.write('Z');
+ out = new CBZip2OutputStream(out);
+ }
+ lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16);
+ docMaker = runData.getDocMaker();
+ logStep = config.get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
+ // To avoid the check 'if (logStep > 0)' in log().
+ if (logStep <= 0) {
+ logStep = Integer.MAX_VALUE;
+ }
}
public void tearDown() throws Exception {
@@ -71,34 +93,18 @@
}
public int doLogic() throws Exception {
- Document doc;
- if (docSize > 0) {
- doc = docMaker.makeDocument(docSize);
- } else {
- doc = docMaker.makeDocument();
- }
+ Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();
Field f = doc.getField(BasicDocMaker.BODY_FIELD);
-
- String body, title, date;
- if (f != null)
- body = f.stringValue().replace('\t', ' ');
- else
- body = null;
+ String body = f != null ? f.stringValue().replace('\t', ' ') : null;
- f = doc.getField(BasicDocMaker.TITLE_FIELD);
- if (f != null)
- title = f.stringValue().replace('\t', ' ');
- else
- title = "";
-
- f = doc.getField(BasicDocMaker.DATE_FIELD);
- if (f != null)
- date = f.stringValue().replace('\t', ' ');
- else
- date = "";
-
if (body != null) {
+ f = doc.getField(BasicDocMaker.TITLE_FIELD);
+ String title = f != null ? f.stringValue().replace('\t', ' ') : "";
+
+ f = doc.getField(BasicDocMaker.DATE_FIELD);
+ String date = f != null ? f.stringValue().replace('\t', ' ') : "";
+
lineFileOut.write(title, 0, title.length());
lineFileOut.write(SEP);
lineFileOut.write(date, 0, date.length());
@@ -111,21 +117,29 @@
}
private void log (int count) {
- if (logStep<0) {
- // init once per instance
- logStep = getRunData().getConfig().get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
+ // logStep is initialized in the ctor to a positive value. If the config
+ // file indicates no logging, or contains an invalid value, logStep is init
+ // to Integer.MAX_VALUE, so that logging will not occur (at least for the
+ // first Integer.MAX_VALUE records).
+ if (count % logStep == 0) {
+ System.out.println("--> " + Thread.currentThread().getName()
+ + " processed (write line) " + count + " docs");
}
- if (logStep>0 && (count%logStep)==0) {
- System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs");
- }
}
+ public void close() throws Exception {
+ lineFileOut.close();
+ super.close();
+ }
+
/**
* Set the params (docSize only)
* @param params docSize, or 0 for no limit.
*/
public void setParams(String params) {
- super.setParams(params);
+ if (super.supportsParams()) {
+ super.setParams(params);
+ }
docSize = (int) Float.parseFloat(params);
}
@@ -135,4 +149,5 @@
public boolean supportsParams() {
return true;
}
+
}