Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java (revision 0)
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java (revision 0)
@@ -0,0 +1,53 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Date;
+
+import org.apache.lucene.benchmark.byTask.feeds.TrecContentSource.TrecDocParser;
+
+/**
+ * Parser for the GOV2 collection format. Keeps the date line found by readTrecDoc in a
+ * field until trecDoc uses it, so an instance should not be shared between threads.
+ */
+public class TrecGov2Parser implements TrecDocParser {
+
+  private static final String DATE = "Date: ";
+  private static final String DOCHDR = "<DOCHDR>";
+  private static final String TERMINATING_DOCHDR = "</DOCHDR>";
+
+  /** Date text of the doc last read by readTrecDoc, null if its header had no date line. */
+  private String dateStr;
+
+  /** Parses the prepared text with htmlParser, passing the header date when one was found. */
+  public DocData trecDoc(DocData docData, String name, TrecContentSource trecSrc, Reader r, HTMLParser htmlParser) throws IOException, InterruptedException {
+    Date date = dateStr != null ? trecSrc.parseDate(dateStr) : null;
+    return htmlParser.parse(docData, name, date, null, r, null);
+  }
+
+  /**
+   * Skips to the doc header, records its date line (if any), then collects the doc body into docBuf.
+   */
+  public void readTrecDoc(TrecContentSource trecSrc, StringBuilder docBuf) throws IOException, NoMoreDataException {
+    // 1. skip until doc header
+    trecSrc.read(docBuf, false, false, null, DOCHDR);
+
+    // 2. date - look for the date only until </DOCHDR>
+    docBuf.setLength(0);
+    trecSrc.read(docBuf, true, false, TERMINATING_DOCHDR, DATE);
+    if (docBuf.length() == 0) {
+      // no date: read() stopped on the </DOCHDR> line itself, so the header is already consumed
+      dateStr = null;
+    } else {
+      dateStr = docBuf.substring(DATE.length());
+      // 3. date found - skip the remainder of the doc header
+      docBuf.setLength(0);
+      trecSrc.read(docBuf, false, false, null, TERMINATING_DOCHDR);
+    }
+
+    // 4. collect into buffer until end of doc
+    docBuf.setLength(0);
+    trecSrc.read(docBuf, false, true, null, TrecContentSource.TERMINATING_DOC);
+  }
+
+}
Property changes on: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java
___________________________________________________________________
Added: svn:executable
+ *
Added: svn:eol-style
+ native
Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (revision 1058759)
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (working copy)
@@ -19,8 +19,8 @@
import java.io.BufferedReader;
import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.text.DateFormat;
@@ -29,7 +29,6 @@
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
-import java.util.zip.GZIPInputStream;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader;
@@ -54,18 +53,27 @@
*/
public class TrecContentSource extends ContentSource {
+ /**
+ * Parser for trec doc content, invoked after seeing {@code <DOC>}, {@code <DOCNO>}, ... {@code </DOCNO>},
+ * responsible for reading until and including {@code </DOC>} and parsing it.
+ */
+ public interface TrecDocParser {
+ /** preparation step - read the text from input, after {@code </DOCNO>}, external synchronization assumed */
+ void readTrecDoc(TrecContentSource trecSrc, StringBuilder docBuf) throws IOException, NoMoreDataException;
+
+ /** finalization part - parse the prepared text and create the result DocData, no synchronization required */
+ DocData trecDoc(DocData docData, String name, TrecContentSource trecSrc, Reader r, HTMLParser htmlParser) throws IOException, InterruptedException;
+ }
+
private static final class DateFormatInfo {
DateFormat[] dfs;
ParsePosition pos;
}
- private static final String DATE = "Date: ";
- private static final String DOCHDR = "<DOCHDR>";
- private static final String TERMINATING_DOCHDR = "</DOCHDR>";
- private static final String DOCNO = "<DOCNO>";
- private static final String TERMINATING_DOCNO = "</DOCNO>";
- private static final String DOC = "<DOC>";
- private static final String TERMINATING_DOC = "</DOC>";
+ public static final String DOCNO = "<DOCNO>"; // starts the line holding the doc name
+ public static final String TERMINATING_DOCNO = "</DOCNO>"; // ends the doc name
+ public static final String DOC = "<DOC>"; // starts a document
+ public static final String TERMINATING_DOC = "</DOC>"; // ends a document
private static final String NEW_LINE = System.getProperty("line.separator");
@@ -80,10 +88,11 @@
private ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<DateFormatInfo>();
private ThreadLocal<StringBuilderReader> trecDocReader = new ThreadLocal<StringBuilderReader>();
private ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
+ private ThreadLocal<TrecDocParser> trecDocParser = new ThreadLocal<TrecDocParser>(); // per-thread parser, filled in by getTrecDocParser()
private File dataDir = null;
private ArrayList<File> inputFiles = new ArrayList<File>();
private int nextFile = 0;
- private int rawDocSize;
+ private int[] rawDocSize = new int[1];
// Use to synchronize threads on reading from the TREC documents.
private Object lock = new Object();
@@ -94,6 +103,15 @@
HTMLParser htmlParser;
private boolean excludeDocnameIteration;
+  /** Returns the calling thread's parser, creating a TrecGov2Parser on first use. */
+  private TrecDocParser getTrecDocParser() { //TODO need to set by file extension and/or by config property
+    TrecDocParser parser = trecDocParser.get();
+    if (parser == null) {
+      trecDocParser.set(parser = new TrecGov2Parser());
+    }
+    return parser;
+  }
+
private DateFormatInfo getDateFormatInfo() {
DateFormatInfo dfi = dateFormats.get();
if (dfi == null) {
@@ -129,10 +147,21 @@
return r;
}
- // read until finding a line that starts with the specified prefix, or a terminating tag has been found.
- private void read(StringBuilder buf, String prefix, boolean collectMatchLine,
- boolean collectAll, String terminatingTag)
- throws IOException, NoMoreDataException {
+ /**
+ * Read until finding a line that starts with one of specified prefixes,
+ * or the specified terminating tag has been found.
+ * @param buf buffer for collecting the data if so specified by collectAll
+ * or collectMatchLine.
+ * @param collectMatchLine whether to collect the matching line into buffer.
+ * @param collectAll whether to collect all lines into buffer.
+ * @param terminatingTag terminating tag to look for, can be null.
+ * @param prefixes prefixes to look for, can be empty. NOTE(review): a null array is not guarded against by the loop over prefixes.
+ * @return the prefix that matched or null if none.
+ * @throws IOException
+ * @throws NoMoreDataException
+ */
+ String read(StringBuilder buf, boolean collectMatchLine, boolean collectAll,
+ String terminatingTag, String... prefixes) throws IOException, NoMoreDataException {
String sep = "";
while (true) {
String line = reader.readLine();
@@ -142,14 +171,16 @@
continue;
}
- rawDocSize += line.length();
+ rawDocSize[0] += line.length();
- if (line.startsWith(prefix)) {
- if (collectMatchLine) {
- buf.append(sep).append(line);
- sep = NEW_LINE;
+ for (String prefix : prefixes) {
+ if (line.startsWith(prefix)) {
+ if (collectMatchLine) {
+ buf.append(sep).append(line);
+ sep = NEW_LINE;
+ }
+ return prefix;
}
- break;
}
if (terminatingTag != null && line.startsWith(terminatingTag)) {
@@ -157,7 +188,7 @@
// tag was found. set the length to 0 to signal no match was
// found.
buf.setLength(0);
- break;
+ return null;
}
if (collectAll) {
@@ -169,7 +200,6 @@
void openNextFile() throws NoMoreDataException, IOException {
close();
- int retries = 0;
while (true) {
if (nextFile >= inputFiles.size()) {
// exhausted files, start a new round, unless forever set to false.
@@ -184,13 +214,12 @@
System.out.println("opening: " + f + " length: " + f.length());
}
try {
- GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), BUFFER_SIZE);
- reader = new BufferedReader(new InputStreamReader(zis, encoding), BUFFER_SIZE);
- return;
+ InputStream inputStream = getInputStream(f); // support either gzip, bzip2, or regular text file, by extension
+ reader = new BufferedReader(new InputStreamReader(inputStream, encoding), BUFFER_SIZE);
+ return; // file opened - stop here, otherwise the enclosing while (true) moves on to the next file and replaces reader
} catch (Exception e) {
- retries++;
- if (retries < 20 && verbose) {
- System.out.println("Skipping 'bad' file " + f.getAbsolutePath() + " #retries=" + retries);
+ if (verbose) {
+ System.out.println("Skipping 'bad' file " + f.getAbsolutePath()+" due to "+e.getMessage());
continue;
}
throw new NoMoreDataException();
@@ -237,8 +265,10 @@
@Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
- String dateStr = null, name = null;
+ String name = null;
Reader r = null;
+ TrecDocParser trecDocParser = null;
+
// protect reading from the TREC files by multiple threads. The rest of the
// method, i.e., parsing the content and returning the DocData can run
// unprotected.
@@ -249,58 +279,38 @@
StringBuilder docBuf = getDocBuffer();
- // 1. skip until doc start
+ // 1. skip until doc start - required for all TREC formats
docBuf.setLength(0);
- read(docBuf, DOC, false, false, null);
+ read(docBuf, false, false, null, DOC);
- // 2. name
+ // 2. name - required for all TREC formats
docBuf.setLength(0);
- read(docBuf, DOCNO, true, false, null);
+ read(docBuf, true, false, null, DOCNO);
name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
- DOCNO.length()));
+ DOCNO.length())).trim();
+
if (!excludeDocnameIteration)
name = name + "_" + iteration;
- // 3. skip until doc header
+ trecDocParser = getTrecDocParser();
docBuf.setLength(0);
- read(docBuf, DOCHDR, false, false, null);
-
- boolean findTerminatingDocHdr = false;
-
- // 4. date - look for the date only until /DOCHDR
- docBuf.setLength(0);
- read(docBuf, DATE, true, false, TERMINATING_DOCHDR);
- if (docBuf.length() != 0) {
- // Date found.
- dateStr = docBuf.substring(DATE.length());
- findTerminatingDocHdr = true;
- }
-
- // 5. skip until end of doc header
- if (findTerminatingDocHdr) {
- docBuf.setLength(0);
- read(docBuf, TERMINATING_DOCHDR, false, false, null);
- }
-
- // 6. collect until end of doc
- docBuf.setLength(0);
- read(docBuf, TERMINATING_DOC, false, true, null);
+ trecDocParser.readTrecDoc(this, docBuf);
- // 7. Set up a Reader over the read content
+ // Set up a Reader over the read content
r = getTrecDocReader(docBuf);
+
// Resetting the thread's reader means it will reuse the instance
// allocated as well as re-read from docBuf.
r.reset();
-
+
// count char length of parsed html text (larger than the plain doc body text).
addBytes(docBuf.length());
}
// This code segment relies on HtmlParser being thread safe. When we get
// here, everything else is already private to that thread, so we're safe.
- Date date = dateStr != null ? parseDate(dateStr) : null;
try {
- docData = htmlParser.parse(docData, name, date, r, null);
+ docData = trecDocParser.trecDoc(docData, name, this, r, htmlParser);
addDoc();
} catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie);
Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java (revision 1058759)
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java (working copy)
@@ -29,16 +29,18 @@
/**
* Parse the input Reader and return DocData.
- * A provided name or date is used for the result, otherwise an attempt is
- * made to set them from the parsed data.
+ * The provided name, title and date are used for the result unless they are null,
+ * in which case an attempt is made to set them from the parsed data.
+ * @param docData result reused
+ * @param name name of the result doc data.
+ * @param date date of the result doc data. If null, attempt to set by parsed data.
+ * @param title title of the result doc data. If null, attempt to set by parsed data.
+ * @param reader reader of html text to parse.
* @param dateFormat date formatter to use for extracting the date.
- * @param name name of the result doc data. If null, attempt to set by parsed data.
- * @param date date of the result doc data. If null, attempt to set by parsed data.
- * @param reader of html text to parse.
* @return Parsed doc data.
* @throws IOException
* @throws InterruptedException
*/
- public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
+ public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
}
Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (revision 1058759)
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (working copy)
@@ -29,11 +29,14 @@
*/
public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
- public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
+ public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader);
// title
- String title = p.getTitle();
+ if (title==null) {
+ title = p.getTitle();
+ }
+
// properties
Properties props = p.getMetaTags();
// body