Index: contrib/benchmark/build.xml =================================================================== --- contrib/benchmark/build.xml (revision 765035) +++ contrib/benchmark/build.xml (working copy) @@ -100,23 +100,14 @@ - - - - - - - - - - - - + + + Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (revision 765035) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (working copy) @@ -17,49 +17,73 @@ * limitations under the License. */ -import org.xml.sax.XMLReader; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.document.Document; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; -import java.io.IOException; -import java.io.FileInputStream; - -import org.apache.lucene.document.Document; -import org.apache.lucene.benchmark.byTask.utils.Config; - /** - * A LineDocMaker which reads the uncompressed english wikipedia dump. - * + * A {@link LineDocMaker} which reads the uncompressed english wikipedia dump. * Config properties: - * keep.image.only.docs=false|true - *
- * Plus those available in LineDocMaker
- *
- *
+ * <ul>
+ * <li>keep.image.only.docs=false|true
+ * <li>[those available in {@link LineDocMaker}]
+ * </ul>
+ * * @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker */ public class EnwikiDocMaker extends LineDocMaker { - protected boolean keepImages = true; + + private static final Map ELEMENTS = new HashMap(); + static final int TITLE = 0; - static final int DATE = TITLE+1; - static final int BODY = DATE+1; + static final int DATE = TITLE + 1; + static final int BODY = DATE + 1; static final int ID = BODY + 1; - static final int LENGTH = ID+1; - + static final int LENGTH = ID + 1; + // LENGTH is used as the size of the tuple, so whatever constants we need that + // should not be part of the tuple, we should define them after LENGTH. + static final int PAGE = LENGTH + 1; + static final String[] months = {"JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"}; + static { + ELEMENTS.put("page", new Integer(PAGE)); + ELEMENTS.put("text", new Integer(BODY)); + ELEMENTS.put("timestamp", new Integer(DATE)); + ELEMENTS.put("title", new Integer(TITLE)); + ELEMENTS.put("id", new Integer(ID)); + } + + /** + * Returns the type of the element if defined, otherwise returns -1. This + * method is useful in startElement and endElement, by not needing to compare + * the element qualified name over and over. + */ + private final static int getElementType(String elem) { + Integer val = (Integer) ELEMENTS.get(elem); + return val == null ? -1 : val.intValue(); + } + + protected boolean keepImages = true; + public void setConfig(Config config) { super.setConfig(config); keepImages = config.get("keep.image.only.docs", true); } class Parser extends DefaultHandler implements Runnable { - Thread t; boolean threadDone; @@ -71,7 +95,7 @@ reader.setContentHandler(this); reader.setErrorHandler(this); while(true){ - final FileInputStream localFileIS = fileIS; + final InputStream localFileIS = fileIS; try { InputSource is = new InputSource(localFileIS); reader.parse(is); @@ -133,12 +157,13 @@ t = null; throw nmde; } - if (t != null && threadDone) + if (t != null && threadDone) { // The thread has exited yet did not hit end of // data, so this means it hit an exception. We // throw NoMorDataException here to force // benchmark to stop the current alg: throw new NoMoreDataException(); + } result = tuple; tuple = null; notify(); @@ -157,25 +182,27 @@ String time; String id; - - public void startElement(String namespace, String simple, String qualified, Attributes attributes) { - if (qualified.equals("page")) { - title = null; - body = null; - time = null; - id = null; - } else if (qualified.equals("text")) { - contents.setLength(0); - } else if (qualified.equals("timestamp")) { - contents.setLength(0); - } else if (qualified.equals("title")) { - contents.setLength(0); - } else if (qualified.equals("id")) { - contents.setLength(0); + int elemType = getElementType(qualified); + switch (elemType) { + case PAGE: + title = null; + body = null; + time = null; + id = null; + break; + // intentional fall-through. + case BODY: + case DATE: + case TITLE: + case ID: + contents.setLength(0); + break; + default: + // this element should be discarded. } } @@ -214,26 +241,35 @@ public void endElement(String namespace, String simple, String qualified) throws SAXException { - if (qualified.equals("title")) { - title = contents.toString(); - } else if (qualified.equals("text")) { - body = contents.toString(); - //workaround that startswith doesn't have an ignore case option, get at least 20 chars. 
- String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(); - if (startsWith.startsWith("#redirect")) { - body = null; - } - } else if (qualified.equals("timestamp")) { - time = time(contents.toString()); - } else if (qualified.equals("id") && id == null) {//just get the first id - id = contents.toString(); + int elemType = getElementType(qualified); + switch (elemType) { + case PAGE: + // the body must be null and we either are keeping image docs or the + // title does not start with Image: + if (body != null && (keepImages || !title.startsWith("Image:"))) { + create(title, time, body, id); + } + break; + case BODY: + body = contents.toString(); + //workaround that startswith doesn't have an ignore case option, get at least 20 chars. + String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(); + if (startsWith.startsWith("#redirect")) { + body = null; + } + break; + case DATE: + time = time(contents.toString()); + break; + case TITLE: + title = contents.toString(); + break; + case ID: + id = contents.toString(); + break; + default: + // this element should be discarded. } - else if (qualified.equals("page")) { - //the body must be null and we either are keeping image docs or the title does not start with Image: - if (body != null && (keepImages == true || title.startsWith("Image:") == false)) { - create(title, time, body, id); - } - } } } Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (revision 765035) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (working copy) @@ -17,38 +17,44 @@ * limitations under the License. */ -import org.apache.lucene.benchmark.byTask.utils.Config; -import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; - -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; - +import java.io.BufferedInputStream; import java.io.BufferedReader; +import java.io.FileInputStream; import java.io.IOException; -import java.io.FileInputStream; +import java.io.InputStream; import java.io.InputStreamReader; import java.util.Random; +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; /** - * A DocMaker reading one line at a time as a Document from - * a single file. This saves IO cost (over DirDocMaker) of - * recursing through a directory and opening a new file for - * every document. It also re-uses its Document and Field - * instance to improve indexing speed. - * + * A DocMaker reading one line at a time as a Document from a single file. This + * saves IO cost (over DirDocMaker) of recursing through a directory and opening + * a new file for every document. It also re-uses its Document and Field + * instance to improve indexing speed.
+ * The expected format of each line is (arguments are separated by <TAB>):
+ * title, date, body. If a line is read in a different format, a
+ * {@link RuntimeException} will be thrown. In general, you should use this doc
+ * maker with files that were created with {@link WriteLineDocTask}.
+ *
  * Config properties:
- * docs.file=<path to the file%gt;
- * doc.reuse.fields=true|false (default true)
- * doc.random.id.limit=N (default -1) -- create random
- * docid in the range 0..N; this is useful
- * with UpdateDoc to test updating random documents; if
- * this is unspecified or -1, then docid is sequentially
- * assigned
+ * <ul>
+ * <li>docs.file=<path to the file>
+ * <li>doc.reuse.fields=true|false (default true)
+ * <li>bzip.compression=true|false (default false)
+ * <li>doc.random.id.limit=N (default -1) -- create random docid in the range
+ * 0..N; this is useful with UpdateDoc to test updating random documents; if
+ * this is unspecified or -1, then docid is sequentially assigned
+ * </ul>
*/ public class LineDocMaker extends BasicDocMaker { - FileInputStream fileIS; + InputStream fileIS; BufferedReader fileIn; ThreadLocal docState = new ThreadLocal(); private String fileName; @@ -57,9 +63,12 @@ private final DocState localDocState = new DocState(); private boolean doReuseFields = true; + private boolean bzipCompressionEnabled = false; private Random r; private int numDocs; + private CompressorStreamFactory csFactory = new CompressorStreamFactory(); + class DocState { Document doc; Field bodyField; @@ -93,7 +102,7 @@ doc.add(idField); } - final static String SEP = WriteLineDocTask.SEP; + final static char SEP = WriteLineDocTask.SEP; private int numDocsCreated; private synchronized int incrNumDocsCreated() { @@ -101,27 +110,20 @@ } public Document setFields(String line) { + // A line must be in the following format. If it's not, fail ! // title date body - final String title, date, body; - int spot = line.indexOf(SEP); - if (spot != -1) { - title = line.substring(0, spot); - int spot2 = line.indexOf(SEP, 1+spot); - if (spot2 != -1) { - date = line.substring(1+spot, spot2); - body = line.substring(1+spot2, line.length()); - } else - date = body = ""; - } else - title = date = body = ""; - - final String docID; - if (r != null) { - docID = "doc" + r.nextInt(numDocs); - } else { - docID = "doc" + incrNumDocsCreated(); + if (spot == -1) { + throw new RuntimeException("line: [" + line + "] is in an invalid format !"); } + int spot2 = line.indexOf(SEP, 1 + spot); + if (spot2 == -1) { + throw new RuntimeException("line: [" + line + "] is in an invalid format !"); + } + final String title = line.substring(0, spot); + final String date = line.substring(1+spot, spot2); + final String body = line.substring(1+spot2, line.length()); + final String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated()); if (doReuseFields) { idField.setValue(docID); @@ -130,7 +132,10 @@ bodyField.setValue(body); return doc; } else { - Field localIDField = new Field(BasicDocMaker.ID_FIELD, docID, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); + Field localIDField = new Field(BasicDocMaker.ID_FIELD, + docID, + Field.Store.YES, + Field.Index.NOT_ANALYZED_NO_NORMS); Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD, title, @@ -174,16 +179,14 @@ String line; synchronized(this) { - while(true) { - line = fileIn.readLine(); - if (line == null) { - // Reset the file - openFile(); - if (!forever) - throw new NoMoreDataException(); - } else { - break; + line = fileIn.readLine(); + if (line == null) { + if (!forever) { + throw new NoMoreDataException(); } + // Reset the file + openFile(); + return makeDocument(); } } @@ -199,15 +202,24 @@ public synchronized void resetInputs() { super.resetInputs(); - fileName = config.get("docs.file", null); - if (fileName == null) - throw new RuntimeException("docs.file must be set"); openFile(); } public void setConfig(Config config) { super.setConfig(config); + fileName = config.get("docs.file", null); + if (fileName == null) { + throw new IllegalArgumentException("docs.file must be set"); + } doReuseFields = config.get("doc.reuse.fields", true); + String doBZCompress = config.get("bzip.compression", null); + if (doBZCompress != null) { + // Property was set, use the value. 
+ bzipCompressionEnabled = Boolean.valueOf(doBZCompress).booleanValue(); + } else { + // Property was not set, attempt to detect based on file's extension + bzipCompressionEnabled = fileName.endsWith("bz2"); + } numDocs = config.get("doc.random.id.limit", -1); if (numDocs != -1) { r = new Random(179); @@ -216,16 +228,35 @@ synchronized void openFile() { try { - if (fileIn != null) + if (fileIn != null) { fileIn.close(); + } fileIS = new FileInputStream(fileName); - fileIn = new BufferedReader(new InputStreamReader(fileIS,"UTF-8"), READER_BUFFER_BYTES); + if (bzipCompressionEnabled) { + // According to BZip2CompressorInputStream's code, it reads the first + // two file header chars ('B' and 'Z'). We only need to wrap the + // underlying stream with a BufferedInputStream, since the code uses + // the read() method exclusively. + fileIS = new BufferedInputStream(fileIS, READER_BUFFER_BYTES); + fileIS = csFactory.createCompressorInputStream("bzip2", fileIS); + } + // Wrap the stream with a BufferedReader for several reasons: + // 1. We need the readLine() method. + // 2. Even if bzip.compression is enabled, and is wrapped with + // BufferedInputStream, wrapping with a buffer can still improve + // performance, since the BIS buffer will be used to read from the + // compressed stream, while the BR buffer will be used to read from the + // uncompressed stream. + fileIn = new BufferedReader(new InputStreamReader(fileIS, "UTF-8"), READER_BUFFER_BYTES); } catch (IOException e) { throw new RuntimeException(e); + } catch (CompressorException e) { + throw new RuntimeException(e); } } public int numUniqueTexts() { return -1; } + } Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (revision 765035) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (working copy) @@ -17,18 +17,39 @@ * limitations under the License. */ +import java.io.BufferedOutputStream; import java.io.BufferedWriter; import java.io.FileOutputStream; +import java.io.OutputStream; import java.io.OutputStreamWriter; +import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; -import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; - +/** + * A task which writes documents, one line per document. Each line is in the + * following format: title <TAB> date <TAB> body. The output of this + * taske can be consumed by + * {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended + * to save the IO overhead of opening a file per doument to be indexed.
+ *
+ * Supports the following parameters:
+ * <ul>
+ * <li>line.file.out - the name of the file to write the output to. This
+ * parameter is mandatory. NOTE: the file is re-created.
+ * <li>bzip.compression - whether the output should be bzip-compressed. This is
+ * recommended when the output file is expected to be large. (optional, default:
+ * false).
+ * <li>doc.writeline.log.step - controls how many records to process before
+ * logging the status of the task. NOTE: to disable logging, set this
+ * value to 0 or negative. (optional, default: 1000).
+ * </ul>
+ */ public class WriteLineDocTask extends PerfTask { /** @@ -36,33 +57,48 @@ * an "added N docs" message should be logged. */ public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000; + public final static char SEP = '\t'; - public WriteLineDocTask(PerfRunData runData) { - super(runData); - } - private int logStep = -1; private int docSize = 0; int count = 0; - private BufferedWriter lineFileOut=null; + private BufferedWriter lineFileOut = null; private DocMaker docMaker; - public final static String SEP = "\t"; - - /* - * (non-Javadoc) - * @see PerfTask#setup() - */ - public void setup() throws Exception { - super.setup(); - if (lineFileOut==null) { - Config config = getRunData().getConfig(); - String fileName = config.get("line.file.out", null); - if (fileName == null) - throw new Exception("line.file.out must be set"); - lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8")); + public WriteLineDocTask(PerfRunData runData) throws Exception { + super(runData); + Config config = runData.getConfig(); + String fileName = config.get("line.file.out", null); + if (fileName == null) { + throw new IllegalArgumentException("line.file.out must be set"); } - docMaker = getRunData().getDocMaker(); + + OutputStream out = new FileOutputStream(fileName); + boolean doBzipCompression = false; + String doBZCompress = config.get("bzip.compression", null); + if (doBZCompress != null) { + // Property was set, use the value. + doBzipCompression = Boolean.valueOf(doBZCompress).booleanValue(); + } else { + // Property was not set, attempt to detect based on file's extension + doBzipCompression = fileName.endsWith("bz2"); + } + + if (doBzipCompression) { + // Wrap with BOS since BZip2CompressorOutputStream calls out.write(int) + // and does not use the write(byte[]) version. This proved to speed the + // compression process by 70% ! + out = new BufferedOutputStream(out, 1 << 16); + out = new CompressorStreamFactory().createCompressorOutputStream("bzip2", out); + } + lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16); + docMaker = runData.getDocMaker(); + logStep = config.get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP); + // To avoid the check 'if (logStep > 0)' in log(). This effectively turns + // logging off. + if (logStep <= 0) { + logStep = Integer.MAX_VALUE; + } } public void tearDown() throws Exception { @@ -71,61 +107,52 @@ } public int doLogic() throws Exception { - Document doc; - if (docSize > 0) { - doc = docMaker.makeDocument(docSize); - } else { - doc = docMaker.makeDocument(); - } + Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument(); Field f = doc.getField(BasicDocMaker.BODY_FIELD); - - String body, title, date; - if (f != null) - body = f.stringValue().replace('\t', ' '); - else - body = null; + String body = f != null ? f.stringValue().replace('\t', ' ') : null; - f = doc.getField(BasicDocMaker.TITLE_FIELD); - if (f != null) - title = f.stringValue().replace('\t', ' '); - else - title = ""; - - f = doc.getField(BasicDocMaker.DATE_FIELD); - if (f != null) - date = f.stringValue().replace('\t', ' '); - else - date = ""; - if (body != null) { + f = doc.getField(BasicDocMaker.TITLE_FIELD); + String title = f != null ? f.stringValue().replace('\t', ' ') : ""; + + f = doc.getField(BasicDocMaker.DATE_FIELD); + String date = f != null ? 
f.stringValue().replace('\t', ' ') : ""; + lineFileOut.write(title, 0, title.length()); lineFileOut.write(SEP); lineFileOut.write(date, 0, date.length()); lineFileOut.write(SEP); lineFileOut.write(body, 0, body.length()); lineFileOut.newLine(); - lineFileOut.flush(); } return 1; } - private void log (int count) { - if (logStep<0) { - // init once per instance - logStep = getRunData().getConfig().get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP); + private void log(int count) { + // logStep is initialized in the ctor to a positive value. If the config + // file indicates no logging, or contains an invalid value, logStep is init + // to Integer.MAX_VALUE, so that logging will not occur (at least for the + // first Integer.MAX_VALUE records). + if (count % logStep == 0) { + System.out.println("--> " + Thread.currentThread().getName() + + " processed (write line) " + count + " docs"); } - if (logStep>0 && (count%logStep)==0) { - System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs"); - } } + public void close() throws Exception { + lineFileOut.close(); + super.close(); + } + /** * Set the params (docSize only) * @param params docSize, or 0 for no limit. */ public void setParams(String params) { - super.setParams(params); + if (super.supportsParams()) { + super.setParams(params); + } docSize = (int) Float.parseFloat(params); } @@ -135,4 +162,5 @@ public boolean supportsParams() { return true; } + } Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/BenchmarkTestCase.java =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/BenchmarkTestCase.java (revision 0) +++ contrib/benchmark/src/test/org/apache/lucene/benchmark/BenchmarkTestCase.java (revision 0) @@ -0,0 +1,38 @@ +package org.apache.lucene.benchmark; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; + +import junit.framework.TestCase; + +/** Base class for all Benchmark unit tests. 
*/ +public class BenchmarkTestCase extends TestCase { + + private static final File workDir; + + static { + workDir = new File(System.getProperty("benchmark.work.dir", "test/benchmark")).getAbsoluteFile(); + workDir.mkdirs(); + } + + public File getWorkDir() { + return workDir; + } + +} Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksParse.java =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksParse.java (revision 765035) +++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksParse.java (working copy) @@ -17,188 +17,33 @@ package org.apache.lucene.benchmark.byTask; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Iterator; + import junit.framework.TestCase; + import org.apache.lucene.benchmark.byTask.tasks.PerfTask; import org.apache.lucene.benchmark.byTask.tasks.TaskSequence; import org.apache.lucene.benchmark.byTask.utils.Algorithm; -import java.io.File; -import java.io.StringReader; -import java.lang.reflect.Modifier; -import java.util.ArrayList; -import java.util.Iterator; - -/** - * Test very simply that perf tasks are parses as expected. - */ +/** Test very simply that perf tasks are parses as expected. */ public class TestPerfTasksParse extends TestCase { - private static final boolean DEBUG = false; static final String NEW_LINE = System.getProperty("line.separator"); static final String INDENT = " "; // properties in effect in all tests here static final String propPart = - INDENT+"directory=RAMDirectory" + NEW_LINE + - INDENT+"print.props=false" + NEW_LINE + INDENT + "directory=RAMDirectory" + NEW_LINE + + INDENT + "print.props=false" + NEW_LINE ; - /* - * All known tasks. - * As new tasks are added, add them here. - * It would be nice to do that automatically, unfortunately - * Java does not provide a "get all classes in package" or - * "get all sub-classes" functionality. - */ - static String singleTaskAlgs []; - - /* (non-Javadoc) - * @see junit.framework.TestCase#setUp() - */ - protected void setUp() throws Exception { - super.setUp(); - if (singleTaskAlgs==null) { - singleTaskAlgs = findTasks(); - } - } - - // one time initialization - static String [] findTasks () throws Exception { - ArrayList tsks = new ArrayList(); - // init with tasks we know about - tsks.add( " AddDoc " ); - tsks.add( " AddDoc(1000.0) " ); - tsks.add( " ClearStats " ); - tsks.add( " CloseIndex " ); - tsks.add( " CloseReader " ); - tsks.add( " CreateIndex " ); - tsks.add( " DeleteDoc " ); - tsks.add( " DeleteDoc(500.0) " ); - tsks.add( " NewRound " ); - tsks.add( " OpenIndex " ); - tsks.add( " OpenReader " ); - tsks.add( " Optimize " ); - tsks.add( " RepAll " ); - tsks.add( " RepSelectByPref prefix " ); - tsks.add( " RepSumByNameRound " ); - tsks.add( " RepSumByName " ); - tsks.add( " RepSumByPrefRound prefix " ); - tsks.add( " RepSumByPref prefix " ); - tsks.add( " ResetInputs " ); - tsks.add( " ResetSystemErase " ); - tsks.add( " ResetSystemSoft " ); - tsks.add( " Search " ); - tsks.add( " SearchTravRet " ); - tsks.add( " SearchTravRet(100.0) " ); - tsks.add( " SearchTrav " ); - tsks.add( " SearchTrav(50.0) " ); - tsks.add( " SetProp " ); - tsks.add( " SetProp(name,value) " ); - tsks.add( " Warm " ); - tsks.add( "SearchTravRetLoadFieldSelector"); - tsks.add("SearchTravRetLoadFieldSelector(body,title)"); - - // if tasks.dir property is defined, look for additional tasks. 
- // this somewhat covers tasks that would be added in the future, in case - // the list above is not updated to cover them. - // some tasks would be tested more than once this way, but that's ok. - String tasksDir = System.getProperty("tasks.dir"); - if (tasksDir !=null) { - String pkgPrefix = PerfTask.class.getPackage().getName()+"."; - String taskNames[] = new File(tasksDir).list(); - for (int i = 0; i < taskNames.length; i++) { - String name = taskNames[i].trim(); - if (!name.endsWith("Task.class")) - continue; // Task class file only - name = name.substring(0,name.length()-6); - Class cls = Class.forName(pkgPrefix+name); - if (Modifier.isAbstract(cls.getModifiers()) || Modifier.isInterface(cls.getModifiers())) - continue; // skip sbstract classes - if (!PerfTask.class.isAssignableFrom(cls)) - continue; // not a task - name = name.substring(0,name.length()-4); - if (name.startsWith("Rep") && name.indexOf("Pref")>=0) - name += " prefix"; - tsks.add(" "+name+" "); - } - } - return (String[]) tsks.toArray(new String[0]); - } - - - /** - * @param name test name - */ public TestPerfTasksParse(String name) { super(name); } - /** - * Test the parsing of very simple tasks, for all tasks - */ - public void testAllTasksSimpleParse() { - doTestAllTasksSimpleParse(false,false); - } - - /** - * Test the parsing of simple sequential sequences, for all tasks - */ - public void testAllTasksSimpleParseSequntial() { - doTestAllTasksSimpleParse(true,false); - } - - /** - * Test the parsing of simple parallel sequences, for all tasks - */ - public void testAllTasksSimpleParseParallel() { - doTestAllTasksSimpleParse(true,true); - } - - // utility for simple parsing testing of all tasks. - private void doTestAllTasksSimpleParse(boolean parOrSeq, boolean par) { - for (int i = 0; i < singleTaskAlgs.length; i++) { - String testedTask = singleTaskAlgs[i]; - if (parOrSeq) { - if (par) { - testedTask = "[ " + testedTask + " ] : 2"; - } else { - testedTask = "{ " + testedTask + " } : 3"; - } - } - try { - String algText = propPart+INDENT+testedTask; - logTstParsing(algText); - Benchmark benchmark = new Benchmark(new StringReader(algText)); - Algorithm alg = benchmark.getAlgorithm(); - ArrayList algTasks = alg.extractTasks(); - // must find a task with this name in the algorithm - boolean foundName = false; - boolean foundPar = false; - String theTask = singleTaskAlgs[i].replaceAll(" +"," ").trim(); - for (Iterator iter = algTasks.iterator(); iter.hasNext();) { - PerfTask task = (PerfTask) iter.next(); - foundName |= (task.toString().indexOf(theTask)>=0); - foundPar |= (task instanceof TaskSequence && ((TaskSequence)task).isParallel()); - } - assertTrue("Task "+testedTask+" was not found in "+alg.toString(),foundName); - if (parOrSeq) { - if (par) { - assertTrue("Task "+testedTask+" was supposed to be parallel in "+alg.toString(),foundPar); - } else { - assertFalse("Task "+testedTask+" was not supposed to be parallel in "+alg.toString(),foundPar); - } - } - } catch (Exception e) { - System.out.flush(); - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Test the repetiotion parsing for parallel tasks - */ + /** Test the repetiotion parsing for parallel tasks */ public void testParseParallelTaskSequenceRepetition() throws Exception { String taskStr = "AddDoc"; String parsedTasks = "[ "+taskStr+" ] : 1000"; @@ -219,9 +64,7 @@ } } - /** - * Test the repetiotion parsing for sequential tasks - */ + /** Test the repetiotion parsing for sequential tasks */ public void testParseTaskSequenceRepetition() 
throws Exception { String taskStr = "AddDoc"; String parsedTasks = "{ "+taskStr+" } : 1000"; @@ -242,11 +85,4 @@ } } - private void logTstParsing (String txt) { - if (!DEBUG) - return; - System.out.println("Test parsing of"); - System.out.println(txt); - } - } Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java (revision 0) +++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java (revision 0) @@ -0,0 +1,169 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileOutputStream; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.util.Properties; + +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.benchmark.BenchmarkTestCase; +import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.benchmark.byTask.tasks.AddDocTask; +import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask; +import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask; +import org.apache.lucene.benchmark.byTask.tasks.TaskSequence; +import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; + +/** Tests the functionality of {@link LineDocMaker}. 
*/ +public class LineDocMakerTest extends BenchmarkTestCase { + + private static final CompressorStreamFactory csFactory = new CompressorStreamFactory(); + + private void createBZ2LineFile(File file) throws Exception { + OutputStream out = new FileOutputStream(file); + out = csFactory.createCompressorOutputStream("bzip2", out); + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8")); + StringBuffer doc = new StringBuffer(); + doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body"); + writer.write(doc.toString()); + writer.newLine(); + writer.close(); + } + + private void createRegularLineFile(File file) throws Exception { + OutputStream out = new FileOutputStream(file); + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8")); + StringBuffer doc = new StringBuffer(); + doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body"); + writer.write(doc.toString()); + writer.newLine(); + writer.close(); + } + + private void doIndexAndSearchTest(File file, boolean setBZCompress, + String bz2CompressVal) throws Exception { + + Properties props = new Properties(); + + // LineDocMaker specific settings. + props.setProperty("docs.file", file.getAbsolutePath()); + if (setBZCompress) { + props.setProperty("bzip.compression", bz2CompressVal); + } + + // Indexing configuration. + props.setProperty("analyzer", SimpleAnalyzer.class.getName()); + props.setProperty("doc.maker", LineDocMaker.class.getName()); + props.setProperty("directory", "RAMDirectory"); + + // Create PerfRunData + Config config = new Config(props); + PerfRunData runData = new PerfRunData(config); + + TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false); + tasks.addTask(new CreateIndexTask(runData)); + tasks.addTask(new AddDocTask(runData)); + tasks.addTask(new CloseIndexTask(runData)); + tasks.doLogic(); + + IndexSearcher searcher = new IndexSearcher(runData.getDirectory()); + TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10); + assertEquals(1, td.totalHits); + assertNotNull(td.scoreDocs[0]); + searcher.close(); + } + + /* Tests LineDocMaker with a bzip2 input stream. */ + public void testBZip2() throws Exception { + File file = new File(getWorkDir(), "one-line.bz2"); + createBZ2LineFile(file); + doIndexAndSearchTest(file, true, "true"); + } + + public void testBZip2AutoDetect() throws Exception { + File file = new File(getWorkDir(), "one-line.bz2"); + createBZ2LineFile(file); + doIndexAndSearchTest(file, false, null); + } + + public void testBZip2WithBzipCompressionDisabled() throws Exception { + File file = new File(getWorkDir(), "one-line.bz2"); + createBZ2LineFile(file); + + try { + doIndexAndSearchTest(file, true, "false"); + fail("Some exception should have been thrown !"); + } catch (Exception e) { + // expected. + } + } + + public void testRegularFile() throws Exception { + File file = new File(getWorkDir(), "one-line"); + createRegularLineFile(file); + doIndexAndSearchTest(file, false, null); + } + + public void testRegularFileWithBZipCompressionEnabled() throws Exception { + File file = new File(getWorkDir(), "one-line"); + createRegularLineFile(file); + + try { + doIndexAndSearchTest(file, true, "true"); + fail("Some exception should have been thrown !"); + } catch (Exception e) { + // expected. 
+ } + } + + public void testInvalidFormat() throws Exception { + String[] testCases = new String[] { + "", // empty line + "title", // just title + "title" + WriteLineDocTask.SEP, // title + SEP + "title" + WriteLineDocTask.SEP + "body", // title + SEP + body + // note that title + SEP + body + SEP is a valid line, which results in an + // empty body + }; + + for (int i = 0; i < testCases.length; i++) { + File file = new File(getWorkDir(), "one-line"); + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8")); + writer.write(testCases[i]); + writer.newLine(); + writer.close(); + try { + doIndexAndSearchTest(file, false, null); + fail("Some exception should have been thrown for: [" + testCases[i] + "]"); + } catch (Exception e) { + // expected. + } + } + } + +} Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java (revision 0) +++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java (revision 0) @@ -0,0 +1,151 @@ +package org.apache.lucene.benchmark.byTask.tasks; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.Properties; + +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.apache.lucene.benchmark.BenchmarkTestCase; +import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker; +import org.apache.lucene.benchmark.byTask.feeds.DocData; +import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; + +/** Tests the functionality of {@link WriteLineDocTask}. 
*/ +public class WriteLineDocTaskTest extends BenchmarkTestCase { + + // class has to be public so that Class.forName.newInstance() will work + public static final class WriteLineDocMaker extends BasicDocMaker { + + protected DocData getNextDocData() throws NoMoreDataException, Exception { + throw new UnsupportedOperationException("not implemented"); + } + + public Document makeDocument() throws Exception { + Document doc = new Document(); + doc.add(new Field(BODY_FIELD, "body", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); + doc.add(new Field(TITLE_FIELD, "title", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); + doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); + return doc; + } + + public int numUniqueTexts() { + return 0; + } + + } + + private static final CompressorStreamFactory csFactory = new CompressorStreamFactory(); + + private PerfRunData createPerfRunData(File file, boolean setBZCompress, String bz2CompressVal) throws Exception { + Properties props = new Properties(); + props.setProperty("doc.maker", WriteLineDocMaker.class.getName()); + props.setProperty("line.file.out", file.getAbsolutePath()); + if (setBZCompress) { + props.setProperty("bzip.compression", bz2CompressVal); + } + props.setProperty("directory", "RAMDirectory"); // no accidental FS dir. + Config config = new Config(props); + return new PerfRunData(config); + } + + private void doReadTest(File file, boolean bz2File) throws Exception { + InputStream in = new FileInputStream(file); + if (bz2File) { + in = csFactory.createCompressorInputStream("bzip2", in); + } + BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8")); + try { + String line = br.readLine(); + assertNotNull(line); + String[] parts = line.split(Character.toString(WriteLineDocTask.SEP)); + assertEquals(3, parts.length); + assertEquals("title", parts[0]); + assertEquals("date", parts[1]); + assertEquals("body", parts[2]); + assertNull(br.readLine()); + } finally { + br.close(); + } + } + + /* Tests WriteLineDocTask with a bzip2 format. */ + public void testBZip2() throws Exception { + + // Create a document in bz2 format. + File file = new File(getWorkDir(), "one-line.bz2"); + PerfRunData runData = createPerfRunData(file, true, "true"); + WriteLineDocTask wldt = new WriteLineDocTask(runData); + wldt.doLogic(); + wldt.close(); + + doReadTest(file, true); + } + + public void testBZip2AutoDetect() throws Exception { + + // Create a document in bz2 format. + File file = new File(getWorkDir(), "one-line.bz2"); + PerfRunData runData = createPerfRunData(file, false, null); + WriteLineDocTask wldt = new WriteLineDocTask(runData); + wldt.doLogic(); + wldt.close(); + + doReadTest(file, true); + } + + public void testRegularFile() throws Exception { + + // Create a document in regular format. + File file = new File(getWorkDir(), "one-line"); + PerfRunData runData = createPerfRunData(file, true, "false"); + WriteLineDocTask wldt = new WriteLineDocTask(runData); + wldt.doLogic(); + wldt.close(); + + doReadTest(file, false); + } + + public void testRegularFileWithBZipCompressionEnabled() throws Exception { + + // Create a document in regular format and set bzip.compression to true. + File file = new File(getWorkDir(), "one-line"); + PerfRunData runData = createPerfRunData(file, true, "true"); + WriteLineDocTask wldt = new WriteLineDocTask(runData); + wldt.doLogic(); + wldt.close(); + + try { + doReadTest(file, false); + fail("Some exception was expected here !"); + } catch (Exception e) { + // expected. + } + } + +}
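For context, the bzip2 support added by this patch is intended to be used in two passes: one algorithm writes a compressed line file with WriteLineDocTask, and a second indexes that file with LineDocMaker. Below is a minimal sketch of the two .alg files; the file names, paths and repetition counts are illustrative assumptions only and are not part of this patch.

    # createLineFile.alg (sketch): write a bzip2-compressed line file
    doc.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker
    docs.file=temp/enwiki-latest-pages-articles.xml
    line.file.out=work/enwiki.txt.bz2
    bzip.compression=true

    { WriteLineDoc } : 10000

    # indexLineFile.alg (sketch): index the compressed line file
    doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
    docs.file=work/enwiki.txt.bz2
    # bzip.compression can be omitted here; the ".bz2" suffix is auto-detected
    directory=RAMDirectory

    CreateIndex
    { AddDoc } : 10000
    CloseIndex

Since both LineDocMaker and WriteLineDocTask fall back to checking for a "bz2" file name suffix when bzip.compression is not set, the property is only needed when the file name does not reflect its compression.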