Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (revision 764281) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (working copy) @@ -17,49 +17,73 @@ * limitations under the License. */ -import org.xml.sax.XMLReader; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.document.Document; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; -import java.io.IOException; -import java.io.FileInputStream; - -import org.apache.lucene.document.Document; -import org.apache.lucene.benchmark.byTask.utils.Config; - /** - * A LineDocMaker which reads the uncompressed english wikipedia dump. - * + * A {@link LineDocMaker} which reads the uncompressed english wikipedia dump. * Config properties: - * keep.image.only.docs=false|true - *
- * Plus those available in LineDocMaker - * - * + * + * * @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker */ public class EnwikiDocMaker extends LineDocMaker { - protected boolean keepImages = true; + + private static final Map ELEMENTS = new HashMap(); + static final int TITLE = 0; - static final int DATE = TITLE+1; - static final int BODY = DATE+1; + static final int DATE = TITLE + 1; + static final int BODY = DATE + 1; static final int ID = BODY + 1; - static final int LENGTH = ID+1; - + static final int LENGTH = ID + 1; + // LENGTH is used as the size of the tuple, so whatever constants we need that + // should not be part of the tuple, we should define them after LENGTH. + static final int PAGE = LENGTH + 1; + static final String[] months = {"JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"}; + static { + ELEMENTS.put("page", new Integer(PAGE)); + ELEMENTS.put("text", new Integer(BODY)); + ELEMENTS.put("timestamp", new Integer(DATE)); + ELEMENTS.put("title", new Integer(TITLE)); + ELEMENTS.put("id", new Integer(ID)); + } + + /** + * Returns the type of the element if defined, otherwise returns -1. This + * method is useful in startElement and endElement, by not needing to compare + * the element qualified name over and over. + */ + private final static int getElementType(String elem) { + Integer val = (Integer) ELEMENTS.get(elem); + return val == null ? -1 : val.intValue(); + } + + protected boolean keepImages = true; + public void setConfig(Config config) { super.setConfig(config); keepImages = config.get("keep.image.only.docs", true); } class Parser extends DefaultHandler implements Runnable { - Thread t; boolean threadDone; @@ -71,7 +95,7 @@ reader.setContentHandler(this); reader.setErrorHandler(this); while(true){ - final FileInputStream localFileIS = fileIS; + final InputStream localFileIS = fileIS; try { InputSource is = new InputSource(localFileIS); reader.parse(is); @@ -133,12 +157,13 @@ t = null; throw nmde; } - if (t != null && threadDone) + if (t != null && threadDone) { // The thread has exited yet did not hit end of // data, so this means it hit an exception. We // throw NoMorDataException here to force // benchmark to stop the current alg: throw new NoMoreDataException(); + } result = tuple; tuple = null; notify(); @@ -157,25 +182,27 @@ String time; String id; - - public void startElement(String namespace, String simple, String qualified, Attributes attributes) { - if (qualified.equals("page")) { - title = null; - body = null; - time = null; - id = null; - } else if (qualified.equals("text")) { - contents.setLength(0); - } else if (qualified.equals("timestamp")) { - contents.setLength(0); - } else if (qualified.equals("title")) { - contents.setLength(0); - } else if (qualified.equals("id")) { - contents.setLength(0); + int elemType = getElementType(qualified); + switch (elemType) { + case PAGE: + title = null; + body = null; + time = null; + id = null; + break; + // intentional fall-through. + case BODY: + case DATE: + case TITLE: + case ID: + contents.setLength(0); + break; + default: + // this element should be discarded. } } @@ -214,26 +241,35 @@ public void endElement(String namespace, String simple, String qualified) throws SAXException { - if (qualified.equals("title")) { - title = contents.toString(); - } else if (qualified.equals("text")) { - body = contents.toString(); - //workaround that startswith doesn't have an ignore case option, get at least 20 chars. - String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(); - if (startsWith.startsWith("#redirect")) { - body = null; - } - } else if (qualified.equals("timestamp")) { - time = time(contents.toString()); - } else if (qualified.equals("id") && id == null) {//just get the first id - id = contents.toString(); + int elemType = getElementType(qualified); + switch (elemType) { + case PAGE: + // the body must be null and we either are keeping image docs or the + // title does not start with Image: + if (body != null && (keepImages || !title.startsWith("Image:"))) { + create(title, time, body, id); + } + break; + case BODY: + body = contents.toString(); + //workaround that startswith doesn't have an ignore case option, get at least 20 chars. + String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(); + if (startsWith.startsWith("#redirect")) { + body = null; + } + break; + case DATE: + time = time(contents.toString()); + break; + case TITLE: + title = contents.toString(); + break; + case ID: + id = contents.toString(); + break; + default: + // this element should be discarded. } - else if (qualified.equals("page")) { - //the body must be null and we either are keeping image docs or the title does not start with Image: - if (body != null && (keepImages == true || title.startsWith("Image:") == false)) { - create(title, time, body, id); - } - } } } Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (revision 764281) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (working copy) @@ -17,38 +17,39 @@ * limitations under the License. */ -import org.apache.lucene.benchmark.byTask.utils.Config; -import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; - -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; - +import java.io.BufferedInputStream; import java.io.BufferedReader; +import java.io.FileInputStream; import java.io.IOException; -import java.io.FileInputStream; +import java.io.InputStream; import java.io.InputStreamReader; import java.util.Random; +import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.tools.bzip2.CBZip2InputStream; /** - * A DocMaker reading one line at a time as a Document from - * a single file. This saves IO cost (over DirDocMaker) of - * recursing through a directory and opening a new file for - * every document. It also re-uses its Document and Field - * instance to improve indexing speed. - * + * A DocMaker reading one line at a time as a Document from a single file. This + * saves IO cost (over DirDocMaker) of recursing through a directory and opening + * a new file for every document. It also re-uses its Document and Field + * instance to improve indexing speed.
+ * * Config properties: - * docs.file=<path to the file%gt; - * doc.reuse.fields=true|false (default true) - * doc.random.id.limit=N (default -1) -- create random - * docid in the range 0..N; this is useful - * with UpdateDoc to test updating random documents; if - * this is unspecified or -1, then docid is sequentially - * assigned + * */ public class LineDocMaker extends BasicDocMaker { - FileInputStream fileIS; + InputStream fileIS; BufferedReader fileIn; ThreadLocal docState = new ThreadLocal(); private String fileName; @@ -57,6 +58,7 @@ private final DocState localDocState = new DocState(); private boolean doReuseFields = true; + private boolean doBzipCompression = false; private Random r; private int numDocs; @@ -93,7 +95,7 @@ doc.add(idField); } - final static String SEP = WriteLineDocTask.SEP; + final static char SEP = WriteLineDocTask.SEP; private int numDocsCreated; private synchronized int incrNumDocsCreated() { @@ -130,7 +132,10 @@ bodyField.setValue(body); return doc; } else { - Field localIDField = new Field(BasicDocMaker.ID_FIELD, docID, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); + Field localIDField = new Field(BasicDocMaker.ID_FIELD, + docID, + Field.Store.YES, + Field.Index.NOT_ANALYZED_NO_NORMS); Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD, title, @@ -174,16 +179,14 @@ String line; synchronized(this) { - while(true) { - line = fileIn.readLine(); - if (line == null) { - // Reset the file - openFile(); - if (!forever) - throw new NoMoreDataException(); - } else { - break; + line = fileIn.readLine(); + if (line == null) { + if (!forever) { + throw new NoMoreDataException(); } + // Reset the file + openFile(); + return makeDocument(); } } @@ -199,15 +202,17 @@ public synchronized void resetInputs() { super.resetInputs(); - fileName = config.get("docs.file", null); - if (fileName == null) - throw new RuntimeException("docs.file must be set"); openFile(); } public void setConfig(Config config) { super.setConfig(config); + fileName = config.get("docs.file", null); + if (fileName == null) { + throw new IllegalArgumentException("docs.file must be set"); + } doReuseFields = config.get("doc.reuse.fields", true); + doBzipCompression = config.get("bzip.compression", false); numDocs = config.get("doc.random.id.limit", -1); if (numDocs != -1) { r = new Random(179); @@ -216,9 +221,26 @@ synchronized void openFile() { try { - if (fileIn != null) + if (fileIn != null) { fileIn.close(); + } fileIS = new FileInputStream(fileName); + if (doBzipCompression) { + // According to CBZip2InputStream's documentation, we should first + // consume the first two file header chars ('B' and 'Z'), as well as + // wrap the underlying stream with a BufferedInputStream, since CBZip2IS + // uses the read() method exclusively. + fileIS = new BufferedInputStream(fileIS, READER_BUFFER_BYTES); + fileIS.read(); fileIS.read(); + fileIS = new CBZip2InputStream(fileIS); + } + // Wrap the stream with a BufferedReader for several reasons: + // 1. We need the readLine() method. + // 2. Even if bzip.compression is enabled, and is wrapped with + // BufferedInputStream, wrapping with a buffer can still improve + // performance, since the BIS buffer will be used to read from the + // compressed stream, while the BR buffer will be used to read from the + // uncompressed stream. fileIn = new BufferedReader(new InputStreamReader(fileIS,"UTF-8"), READER_BUFFER_BYTES); } catch (IOException e) { throw new RuntimeException(e); @@ -228,4 +250,5 @@ public int numUniqueTexts() { return -1; } + } Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (revision 764281) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (working copy) @@ -17,18 +17,39 @@ * limitations under the License. */ +import java.io.BufferedOutputStream; import java.io.BufferedWriter; import java.io.FileOutputStream; +import java.io.OutputStream; import java.io.OutputStreamWriter; import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; -import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.tools.bzip2.CBZip2OutputStream; - +/** + * A task which writes documents, one line per document. Each line is in the + * following format: title <TAB> date <TAB>. The output of this + * taske can be consumed by + * {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended + * to save the IO overhead of opening a file per doument to be indexed.
+ * + * Supports the following parameters: + * + */ public class WriteLineDocTask extends PerfTask { /** @@ -36,33 +57,40 @@ * an "added N docs" message should be logged. */ public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000; + public final static char SEP = '\t'; - public WriteLineDocTask(PerfRunData runData) { - super(runData); - } - private int logStep = -1; private int docSize = 0; int count = 0; private BufferedWriter lineFileOut=null; private DocMaker docMaker; - public final static String SEP = "\t"; - /* - * (non-Javadoc) - * @see PerfTask#setup() - */ - public void setup() throws Exception { - super.setup(); - if (lineFileOut==null) { - Config config = getRunData().getConfig(); - String fileName = config.get("line.file.out", null); - if (fileName == null) - throw new Exception("line.file.out must be set"); - lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8")); + public WriteLineDocTask(PerfRunData runData) throws Exception { + super(runData); + Config config = runData.getConfig(); + String fileName = config.get("line.file.out", null); + if (fileName == null) { + throw new IllegalArgumentException("line.file.out must be set"); } - docMaker = getRunData().getDocMaker(); + OutputStream out = new FileOutputStream(fileName); + if (config.get("bzip.compression", false)) { + // Wrap with BOS since CBZip2OutputStream calls out.write(int) and does + // not use the write(byte[]) version. This proved to speed the compression + // process by 70% ! + out = new BufferedOutputStream(out, 1 << 16); + // Write the file header chars. This must happen before CBZip2OS is instantiated. + out.write('B'); out.write('Z'); + out = new CBZip2OutputStream(out); + } + lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16); + docMaker = runData.getDocMaker(); + logStep = config.get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP); + // To avoid the check 'if (logStep > 0)' in log(). This effectively turns + // logging off. + if (logStep <= 0) { + logStep = Integer.MAX_VALUE; + } } public void tearDown() throws Exception { @@ -71,61 +99,52 @@ } public int doLogic() throws Exception { - Document doc; - if (docSize > 0) { - doc = docMaker.makeDocument(docSize); - } else { - doc = docMaker.makeDocument(); - } + Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument(); Field f = doc.getField(BasicDocMaker.BODY_FIELD); - - String body, title, date; - if (f != null) - body = f.stringValue().replace('\t', ' '); - else - body = null; + String body = f != null ? f.stringValue().replace('\t', ' ') : null; - f = doc.getField(BasicDocMaker.TITLE_FIELD); - if (f != null) - title = f.stringValue().replace('\t', ' '); - else - title = ""; - - f = doc.getField(BasicDocMaker.DATE_FIELD); - if (f != null) - date = f.stringValue().replace('\t', ' '); - else - date = ""; - if (body != null) { + f = doc.getField(BasicDocMaker.TITLE_FIELD); + String title = f != null ? f.stringValue().replace('\t', ' ') : ""; + + f = doc.getField(BasicDocMaker.DATE_FIELD); + String date = f != null ? f.stringValue().replace('\t', ' ') : ""; + lineFileOut.write(title, 0, title.length()); lineFileOut.write(SEP); lineFileOut.write(date, 0, date.length()); lineFileOut.write(SEP); lineFileOut.write(body, 0, body.length()); lineFileOut.newLine(); - lineFileOut.flush(); } return 1; } - private void log (int count) { - if (logStep<0) { - // init once per instance - logStep = getRunData().getConfig().get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP); + private void log(int count) { + // logStep is initialized in the ctor to a positive value. If the config + // file indicates no logging, or contains an invalid value, logStep is init + // to Integer.MAX_VALUE, so that logging will not occur (at least for the + // first Integer.MAX_VALUE records). + if (count % logStep == 0) { + System.out.println("--> " + Thread.currentThread().getName() + + " processed (write line) " + count + " docs"); } - if (logStep>0 && (count%logStep)==0) { - System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs"); - } } + public void close() throws Exception { + lineFileOut.close(); + super.close(); + } + /** * Set the params (docSize only) * @param params docSize, or 0 for no limit. */ public void setParams(String params) { - super.setParams(params); + if (super.supportsParams()) { + super.setParams(params); + } docSize = (int) Float.parseFloat(params); } @@ -135,4 +154,5 @@ public boolean supportsParams() { return true; } + }