Index: contrib/benchmark/CHANGES.txt =================================================================== --- contrib/benchmark/CHANGES.txt (revision 797694) +++ contrib/benchmark/CHANGES.txt (working copy) @@ -4,6 +4,16 @@ $Id:$ +7/24/2009 + LUCENE-1595: Deprecate LineDocMaker and EnwikiDocMaker in favor of + using DocMaker directly, with content.source = LineDocSource or + EnwikiContentSource. NOTE: with this change, the "id" field from + the Wikipedia XML export is now indexed as the "docname" field + (previously it was indexed as "docid"). Additionaly, the + SearchWithSort task now accepts all types that SortField can accept + and no longer falls back to SortField.AUTO, which has been + deprecated. (Mike McCandless) + 7/20/2009 LUCENE-1755: Fix WriteLineDocTask to output a document if it contains either a title or body (or both). (Shai Erera via Mark Miller) Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (revision 797694) +++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (working copy) @@ -299,7 +299,7 @@ } /** - * Test WriteLineDoc and LineDocMaker. + * Test WriteLineDoc and LineDocSource. */ public void testLineDocFile() throws Exception { File lineFile = new File(System.getProperty("tempDir"), "test.reuters.lines.txt"); @@ -334,7 +334,7 @@ String algLines2[] = { "# ----- properties ", "analyzer=org.apache.lucene.analysis.SimpleAnalyzer", - "doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker", + "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource", "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'), "content.source.forever=false", "doc.reuse.fields=false", @@ -355,7 +355,7 @@ iw.close(); IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); - assertEquals(numLines + " lines were were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs()); + assertEquals(numLines + " lines were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs()); ir.close(); lineFile.delete(); Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java (revision 797694) +++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java (working copy) @@ -172,7 +172,7 @@ public void testCharsReplace() throws Exception { // WriteLineDocTask replaced only \t characters w/ a space, since that's its // separator char. However, it didn't replace newline characters, which - // resulted in errors in LineDocMaker. + // resulted in errors in LineDocSource. File file = new File(getWorkDir(), "one-line"); PerfRunData runData = createPerfRunData(file, false, null, NewLinesDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java (revision 797694) +++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java (working copy) @@ -1,145 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileOutputStream; -import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.util.Properties; - -import org.apache.commons.compress.compressors.CompressorStreamFactory; -import org.apache.lucene.analysis.SimpleAnalyzer; -import org.apache.lucene.benchmark.BenchmarkTestCase; -import org.apache.lucene.benchmark.byTask.PerfRunData; -import org.apache.lucene.benchmark.byTask.tasks.AddDocTask; -import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask; -import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask; -import org.apache.lucene.benchmark.byTask.tasks.TaskSequence; -import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; -import org.apache.lucene.benchmark.byTask.utils.Config; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.TopDocs; - -/** Tests the functionality of {@link LineDocMaker}. */ -public class LineDocMakerTest extends BenchmarkTestCase { - - private static final CompressorStreamFactory csFactory = new CompressorStreamFactory(); - - private void createBZ2LineFile(File file) throws Exception { - OutputStream out = new FileOutputStream(file); - out = csFactory.createCompressorOutputStream("bzip2", out); - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8")); - StringBuffer doc = new StringBuffer(); - doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body"); - writer.write(doc.toString()); - writer.newLine(); - writer.close(); - } - - private void createRegularLineFile(File file) throws Exception { - OutputStream out = new FileOutputStream(file); - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8")); - StringBuffer doc = new StringBuffer(); - doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body"); - writer.write(doc.toString()); - writer.newLine(); - writer.close(); - } - - private void doIndexAndSearchTest(File file, boolean setBZCompress, - String bz2CompressVal) throws Exception { - - Properties props = new Properties(); - - // LineDocMaker specific settings. - props.setProperty("docs.file", file.getAbsolutePath()); - if (setBZCompress) { - props.setProperty("bzip.compression", bz2CompressVal); - } - - // Indexing configuration. - props.setProperty("analyzer", SimpleAnalyzer.class.getName()); - props.setProperty("doc.maker", LineDocMaker.class.getName()); - props.setProperty("directory", "RAMDirectory"); - - // Create PerfRunData - Config config = new Config(props); - PerfRunData runData = new PerfRunData(config); - - TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false); - tasks.addTask(new CreateIndexTask(runData)); - tasks.addTask(new AddDocTask(runData)); - tasks.addTask(new CloseIndexTask(runData)); - tasks.doLogic(); - - IndexSearcher searcher = new IndexSearcher(runData.getDirectory(), true); - TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10); - assertEquals(1, td.totalHits); - assertNotNull(td.scoreDocs[0]); - searcher.close(); - } - - /* Tests LineDocMaker with a bzip2 input stream. */ - public void testBZip2() throws Exception { - File file = new File(getWorkDir(), "one-line.bz2"); - createBZ2LineFile(file); - doIndexAndSearchTest(file, true, "true"); - } - - public void testBZip2AutoDetect() throws Exception { - File file = new File(getWorkDir(), "one-line.bz2"); - createBZ2LineFile(file); - doIndexAndSearchTest(file, false, null); - } - - public void testRegularFile() throws Exception { - File file = new File(getWorkDir(), "one-line"); - createRegularLineFile(file); - doIndexAndSearchTest(file, false, null); - } - - public void testInvalidFormat() throws Exception { - String[] testCases = new String[] { - "", // empty line - "title", // just title - "title" + WriteLineDocTask.SEP, // title + SEP - "title" + WriteLineDocTask.SEP + "body", // title + SEP + body - // note that title + SEP + body + SEP is a valid line, which results in an - // empty body - }; - - for (int i = 0; i < testCases.length; i++) { - File file = new File(getWorkDir(), "one-line"); - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8")); - writer.write(testCases[i]); - writer.newLine(); - writer.close(); - try { - doIndexAndSearchTest(file, false, null); - fail("Some exception should have been thrown for: [" + testCases[i] + "]"); - } catch (Exception e) { - // expected. - } - } - } - -} Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java (revision 797694) +++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java (working copy) @@ -39,8 +39,8 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; -/** Tests the functionality of {@link LineDocMaker}. */ -public class LineDocMakerTest extends BenchmarkTestCase { +/** Tests the functionality of {@link LineDocSource}. */ +public class LineDocSourceTest extends BenchmarkTestCase { private static final CompressorStreamFactory csFactory = new CompressorStreamFactory(); @@ -70,7 +70,7 @@ Properties props = new Properties(); - // LineDocMaker specific settings. + // LineDocSource specific settings. props.setProperty("docs.file", file.getAbsolutePath()); if (setBZCompress) { props.setProperty("bzip.compression", bz2CompressVal); @@ -78,7 +78,7 @@ // Indexing configuration. props.setProperty("analyzer", SimpleAnalyzer.class.getName()); - props.setProperty("doc.maker", LineDocMaker.class.getName()); + props.setProperty("content.source", LineDocSource.class.getName()); props.setProperty("directory", "RAMDirectory"); // Create PerfRunData @@ -98,7 +98,7 @@ searcher.close(); } - /* Tests LineDocMaker with a bzip2 input stream. */ + /* Tests LineDocSource with a bzip2 input stream. */ public void testBZip2() throws Exception { File file = new File(getWorkDir(), "one-line.bz2"); createBZ2LineFile(file); Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java (revision 797694) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java (working copy) @@ -56,6 +56,8 @@ SortField sortField0; if (field.equals("doc")) { sortField0 = SortField.FIELD_DOC; + } if (field.equals("score")) { + sortField0 = SortField.FIELD_SCORE; } else if (field.equals("noscore")) { doScore = false; continue; @@ -90,14 +92,22 @@ int type; if (typeString.equals("float")) { type = SortField.FLOAT; + } else if (typeString.equals("double")) { + type = SortField.DOUBLE; + } else if (typeString.equals("byte")) { + type = SortField.BYTE; + } else if (typeString.equals("short")) { + type = SortField.SHORT; } else if (typeString.equals("int")) { type = SortField.INT; + } else if (typeString.equals("long")) { + type = SortField.LONG; } else if (typeString.equals("string")) { type = SortField.STRING; } else if (typeString.equals("string_val")) { type = SortField.STRING_VAL; } else { - type = SortField.AUTO; + throw new RuntimeException("Unrecognized sort field type " + typeString); } return type; } Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (revision 797694) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (working copy) @@ -24,6 +24,7 @@ import java.util.Map; import java.util.Properties; import java.util.Map.Entry; +import java.util.Random; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.Format; @@ -60,6 +61,9 @@ *
  • doc.reuse.fields - specifies whether Field and Document objects * should be reused (default true). *
  • doc.index.props - specifies whether the properties returned by + *
  • doc.random.id.limit - if specified, docs will be assigned random + * IDs from 0 to this limit. This is useful with UpdateDoc + * for testing performance of IndexWriter.updateDocument. * {@link DocData#getProps()} will be indexed. (default false). * */ @@ -70,11 +74,14 @@ private int cnt; } + private Random r; + private int updateDocIDLimit; + static class DocState { - private Map fields; - private boolean reuseFields; - Document doc; + private final Map fields; + private final boolean reuseFields; + final Document doc; DocData docData = new DocData(); public DocState(boolean reuseFields, Store store, Index index, Index bodyIndex, TermVector termVector) { @@ -92,6 +99,9 @@ fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector)); doc = new Document(); + } else { + fields = null; + doc = null; } } @@ -150,14 +160,14 @@ // use only part of the body, modify it to keep the rest (or use all if size==0). // reset the docdata properties so they are not added more than once. private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException { - int docid = incrNumDocsCreated(); - DocState ds = reuseFields ? getDocState() : localDocState; - Document doc = reuseFields ? ds.doc : new Document(); + + final DocState ds = reuseFields ? getDocState() : localDocState; + final Document doc = reuseFields ? ds.doc : new Document(); doc.getFields().clear(); // Set ID_FIELD Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal); - idField.setValue("doc" + docid); + idField.setValue("doc" + (r != null ? r.nextInt(updateDocIDLimit) : incrNumDocsCreated())); doc.add(idField); // Set NAME_FIELD @@ -407,6 +417,11 @@ } indexProperties = config.get("doc.index.props", false); + + updateDocIDLimit = config.get("doc.random.id.limit", -1); + if (updateDocIDLimit != -1) { + r = new Random(179); + } } } Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (revision 797694) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (working copy) @@ -28,43 +28,14 @@ * A {@link DocMaker} which reads the English Wikipedia dump. Uses * {@link EnwikiContentSource} as its content source, regardless if a different * content source was defined in the configuration. + * @deprecated Please use {@link DocMaker} instead, with content.source=EnwikiContentSource */ public class EnwikiDocMaker extends DocMaker { - - public Document makeDocument() throws Exception { - DocState ds = reuseFields ? getDocState() : localDocState; - DocData dd = source.getNextDocData(ds.docData); - Document doc = reuseFields ? ds.doc : new Document(); - doc.getFields().clear(); - - Field body = ds.getField(BODY_FIELD, storeVal, bodyIndexVal, termVecVal); - body.setValue(dd.getBody()); - doc.add(body); - - Field title = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal); - title.setValue(dd.getTitle()); - doc.add(title); - - Field date = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal); - date.setValue(dd.getDate()); - doc.add(date); - - Field id = ds.getField(ID_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO); - id.setValue(dd.getName()); - doc.add(id); - - return doc; - } - - public Document makeDocument(int size) throws Exception { - throw new RuntimeException("cannot change document size with EnwikiDocMaker"); - } - public void setConfig(Config config) { super.setConfig(config); // Override whatever content source was set in the config source = new EnwikiContentSource(); source.setConfig(config); + System.out.println("NOTE: EnwikiDocMaker is deprecated; please use DocMaker instead with content.source=EnwikiContentSource"); } - -} \ No newline at end of file +} Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (revision 797694) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (working copy) @@ -42,51 +42,13 @@ * 0..N; this is useful with UpdateDoc to test updating random documents; if * this is unspecified or -1, then docid is sequentially assigned * + * @deprecated Please use {@link DocMaker} instead, with content.source=LineDocSource */ public class LineDocMaker extends DocMaker { - - private Random r; - private int numDocs; - - public Document makeDocument() throws Exception { - - DocState ds = reuseFields ? getDocState() : localDocState; - DocData dd = source.getNextDocData(ds.docData); - Document doc = reuseFields ? ds.doc : new Document(); - doc.getFields().clear(); - - Field body = ds.getField(BODY_FIELD, storeVal, bodyIndexVal, termVecVal); - body.setValue(dd.getBody()); - doc.add(body); - - Field title = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal); - title.setValue(dd.getTitle()); - doc.add(title); - - Field date = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal); - date.setValue(dd.getDate()); - doc.add(date); - - String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated()); - Field id = ds.getField(ID_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO); - id.setValue(docID); - doc.add(id); - - return doc; - } - - public Document makeDocument(int size) throws Exception { - throw new RuntimeException("cannot change document size with LineDocMaker"); - } - public void setConfig(Config config) { super.setConfig(config); source = new LineDocSource(); source.setConfig(config); - numDocs = config.get("doc.random.id.limit", -1); - if (numDocs != -1) { - r = new Random(179); - } + System.out.println("NOTE: LineDocMaker is deprecated; please use DocMaker instead with content.source=LineDocSource"); } - }