Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (revision 764188) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (working copy) @@ -17,28 +17,26 @@ * limitations under the License. */ -import org.xml.sax.XMLReader; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.document.Document; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; -import java.io.IOException; -import java.io.FileInputStream; - -import org.apache.lucene.document.Document; -import org.apache.lucene.benchmark.byTask.utils.Config; - /** - * A LineDocMaker which reads the uncompressed english wikipedia dump. - * + * A {@link LineDocMaker} which reads the uncompressed english wikipedia dump. * Config properties: - * keep.image.only.docs=false|true - *
- * Plus those available in LineDocMaker - * - * + * + * * @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker */ public class EnwikiDocMaker extends LineDocMaker { @@ -71,7 +69,7 @@ reader.setContentHandler(this); reader.setErrorHandler(this); while(true){ - final FileInputStream localFileIS = fileIS; + final InputStream localFileIS = fileIS; try { InputSource is = new InputSource(localFileIS); reader.parse(is); @@ -157,8 +155,6 @@ String time; String id; - - public void startElement(String namespace, String simple, String qualified, Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (revision 764188) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (working copy) @@ -17,38 +17,39 @@ * limitations under the License. */ -import org.apache.lucene.benchmark.byTask.utils.Config; -import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; - -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; - +import java.io.BufferedInputStream; import java.io.BufferedReader; +import java.io.FileInputStream; import java.io.IOException; -import java.io.FileInputStream; +import java.io.InputStream; import java.io.InputStreamReader; import java.util.Random; +import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.tools.bzip2.CBZip2InputStream; /** - * A DocMaker reading one line at a time as a Document from - * a single file. This saves IO cost (over DirDocMaker) of - * recursing through a directory and opening a new file for - * every document. It also re-uses its Document and Field - * instance to improve indexing speed. - * + * A DocMaker reading one line at a time as a Document from a single file. This + * saves IO cost (over DirDocMaker) of recursing through a directory and opening + * a new file for every document. It also re-uses its Document and Field + * instance to improve indexing speed.
+ * * Config properties: - * docs.file=<path to the file%gt; - * doc.reuse.fields=true|false (default true) - * doc.random.id.limit=N (default -1) -- create random - * docid in the range 0..N; this is useful - * with UpdateDoc to test updating random documents; if - * this is unspecified or -1, then docid is sequentially - * assigned + * */ public class LineDocMaker extends BasicDocMaker { - FileInputStream fileIS; + InputStream fileIS; BufferedReader fileIn; ThreadLocal docState = new ThreadLocal(); private String fileName; @@ -57,6 +58,7 @@ private final DocState localDocState = new DocState(); private boolean doReuseFields = true; + private boolean doBzipCompression = false; private Random r; private int numDocs; @@ -93,7 +95,7 @@ doc.add(idField); } - final static String SEP = WriteLineDocTask.SEP; + final static char SEP = WriteLineDocTask.SEP; private int numDocsCreated; private synchronized int incrNumDocsCreated() { @@ -130,7 +132,10 @@ bodyField.setValue(body); return doc; } else { - Field localIDField = new Field(BasicDocMaker.ID_FIELD, docID, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); + Field localIDField = new Field(BasicDocMaker.ID_FIELD, + docID, + Field.Store.YES, + Field.Index.NOT_ANALYZED_NO_NORMS); Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD, title, @@ -174,16 +179,14 @@ String line; synchronized(this) { - while(true) { - line = fileIn.readLine(); - if (line == null) { - // Reset the file - openFile(); - if (!forever) - throw new NoMoreDataException(); - } else { - break; + line = fileIn.readLine(); + if (line == null) { + if (!forever) { + throw new NoMoreDataException(); } + // Reset the file + openFile(); + return makeDocument(); } } @@ -199,15 +202,17 @@ public synchronized void resetInputs() { super.resetInputs(); - fileName = config.get("docs.file", null); - if (fileName == null) - throw new RuntimeException("docs.file must be set"); openFile(); } public void setConfig(Config config) { super.setConfig(config); + fileName = config.get("docs.file", null); + if (fileName == null) { + throw new IllegalArgumentException("docs.file must be set"); + } doReuseFields = config.get("doc.reuse.fields", true); + doBzipCompression = config.get("bzip.compression", false); numDocs = config.get("doc.random.id.limit", -1); if (numDocs != -1) { r = new Random(179); @@ -216,9 +221,26 @@ synchronized void openFile() { try { - if (fileIn != null) + if (fileIn != null) { fileIn.close(); + } fileIS = new FileInputStream(fileName); + if (doBzipCompression) { + // According to CBZip2InputStream's documentation, we should first + // consume the first two file header chars ('B' and 'Z'), as well as + // wrap the unerlying stream with a BufferedInputStream, since CBZip2IS + // uses the read() method exclusively. + fileIS = new BufferedInputStream(fileIS, READER_BUFFER_BYTES); + fileIS.read(); fileIS.read(); + fileIS = new CBZip2InputStream(fileIS); + } + // Wrap the stream with a BufferedReader for several reasons: + // 1. We need the readLine() method. + // 2. Even if bzip.compression is enabled, and is wrapped with + // BufferedInputStream, wrapping with a buffer can still improve + // performance, since the BIS buffer will be used to read from the + // compressed stream, while the BR buffer will be used to read from the + // uncompressed stream. fileIn = new BufferedReader(new InputStreamReader(fileIS,"UTF-8"), READER_BUFFER_BYTES); } catch (IOException e) { throw new RuntimeException(e); @@ -228,4 +250,5 @@ public int numUniqueTexts() { return -1; } + } Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (revision 764188) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (working copy) @@ -19,16 +19,36 @@ import java.io.BufferedWriter; import java.io.FileOutputStream; +import java.io.OutputStream; import java.io.OutputStreamWriter; import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; -import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.tools.bzip2.CBZip2OutputStream; - +/** + * A task which writes documents, one line per document. Each line is in the + * following format: title <TAB> date <TAB>. The output of this + * taske can be consumed by + * {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended + * to save the IO overhead of opening a file per doument to be indexed.
+ * + * Supports the following parameters: + * + */ public class WriteLineDocTask extends PerfTask { /** @@ -36,33 +56,35 @@ * an "added N docs" message should be logged. */ public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000; + public final static char SEP = '\t'; - public WriteLineDocTask(PerfRunData runData) { - super(runData); - } - private int logStep = -1; private int docSize = 0; int count = 0; private BufferedWriter lineFileOut=null; private DocMaker docMaker; - public final static String SEP = "\t"; - /* - * (non-Javadoc) - * @see PerfTask#setup() - */ - public void setup() throws Exception { - super.setup(); - if (lineFileOut==null) { - Config config = getRunData().getConfig(); - String fileName = config.get("line.file.out", null); - if (fileName == null) - throw new Exception("line.file.out must be set"); - lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8")); + public WriteLineDocTask(PerfRunData runData) throws Exception { + super(runData); + Config config = runData.getConfig(); + String fileName = config.get("line.file.out", null); + if (fileName == null) { + throw new IllegalArgumentException("line.file.out must be set"); } - docMaker = getRunData().getDocMaker(); + OutputStream out = new FileOutputStream(fileName); + if (config.get("bzip.compression", false)) { + // Write the file header chars. This must happen before CBZip2OS is instantiated. + out.write('B'); out.write('Z'); + out = new CBZip2OutputStream(out); + } + lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16); + docMaker = runData.getDocMaker(); + logStep = config.get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP); + // To avoid the check 'if (logStep > 0)' in log(). + if (logStep <= 0) { + logStep = Integer.MAX_VALUE; + } } public void tearDown() throws Exception { @@ -71,34 +93,18 @@ } public int doLogic() throws Exception { - Document doc; - if (docSize > 0) { - doc = docMaker.makeDocument(docSize); - } else { - doc = docMaker.makeDocument(); - } + Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument(); Field f = doc.getField(BasicDocMaker.BODY_FIELD); - - String body, title, date; - if (f != null) - body = f.stringValue().replace('\t', ' '); - else - body = null; + String body = f != null ? f.stringValue().replace('\t', ' ') : null; - f = doc.getField(BasicDocMaker.TITLE_FIELD); - if (f != null) - title = f.stringValue().replace('\t', ' '); - else - title = ""; - - f = doc.getField(BasicDocMaker.DATE_FIELD); - if (f != null) - date = f.stringValue().replace('\t', ' '); - else - date = ""; - if (body != null) { + f = doc.getField(BasicDocMaker.TITLE_FIELD); + String title = f != null ? f.stringValue().replace('\t', ' ') : ""; + + f = doc.getField(BasicDocMaker.DATE_FIELD); + String date = f != null ? f.stringValue().replace('\t', ' ') : ""; + lineFileOut.write(title, 0, title.length()); lineFileOut.write(SEP); lineFileOut.write(date, 0, date.length()); @@ -111,21 +117,29 @@ } private void log (int count) { - if (logStep<0) { - // init once per instance - logStep = getRunData().getConfig().get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP); + // logStep is initialized in the ctor to a positive value. If the config + // file indicates no logging, or contains an invalid value, logStep is init + // to Integer.MAX_VALUE, so that logging will not occur (at least for the + // first Integer.MAX_VALUE records). + if (count % logStep == 0) { + System.out.println("--> " + Thread.currentThread().getName() + + " processed (write line) " + count + " docs"); } - if (logStep>0 && (count%logStep)==0) { - System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs"); - } } + public void close() throws Exception { + lineFileOut.close(); + super.close(); + } + /** * Set the params (docSize only) * @param params docSize, or 0 for no limit. */ public void setParams(String params) { - super.setParams(params); + if (super.supportsParams()) { + super.setParams(params); + } docSize = (int) Float.parseFloat(params); } @@ -135,4 +149,5 @@ public boolean supportsParams() { return true; } + }