Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java	(revision 0)
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java	(revision 0)
@@ -0,0 +1,60 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Date;
+
+import org.apache.lucene.benchmark.byTask.feeds.TrecContentSource.TrecDocParser;
+
+/**
+ * Parser for the GOV2 collection format.
+ * <p>
+ * The date found by {@link #readTrecDoc} is kept in an instance field until
+ * {@link #trecDoc} consumes it, so an instance must not be shared between
+ * threads; {@code TrecContentSource} keeps one parser per thread.
+ */
+public class TrecGov2Parser implements TrecDocParser {
+
+  private static final String DATE = "Date: ";
+  private static final String DOCHDR = "<DOCHDR>";
+  private static final String TERMINATING_DOCHDR = "</DOCHDR>";
+
+  /** Date text of the document last read by readTrecDoc, or null if it had none. */
+  private String dateStr;
+
+  /** Parses the prepared document text, using the date captured by readTrecDoc. */
+  public DocData trecDoc(DocData docData, String name, TrecContentSource trecSrc, Reader r, HTMLParser htmlParser) throws IOException, InterruptedException {
+    Date date = dateStr != null ? trecSrc.parseDate(dateStr) : null;
+    return htmlParser.parse(docData, name, date, null, r, null);
+  }
+
+  /** Reads the document header (capturing its date, if any) and then the body into docBuf. */
+  public void readTrecDoc(TrecContentSource trecSrc, StringBuilder docBuf) throws IOException, NoMoreDataException {
+    // 1. skip until doc header
+    trecSrc.read(docBuf, false, false, null, DOCHDR);
+
+    boolean findTerminatingDocHdr = false;
+
+    // 2. date - look for the date only until /DOCHDR
+    docBuf.setLength(0);
+    trecSrc.read(docBuf, true, false, TERMINATING_DOCHDR, DATE);
+    if (docBuf.length() == 0) {
+      dateStr = null;
+    } else {
+      // Date found.
+      dateStr = docBuf.substring(DATE.length());
+      findTerminatingDocHdr = true;
+    }
+
+    // 3. skip until end of doc header (already consumed by step 2 when no date was found)
+    if (findTerminatingDocHdr) {
+      docBuf.setLength(0);
+      trecSrc.read(docBuf, false, false, null, TERMINATING_DOCHDR);
+    }
+
+    // 4. collect into buffer until end of doc
+    docBuf.setLength(0);
+    trecSrc.read(docBuf, false, true, null, TrecContentSource.TERMINATING_DOC);
+  }
+
+}

Property changes on: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java
___________________________________________________________________
Added: svn:executable
   + *
Added: svn:eol-style
   + native

Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java	(revision 1058759)
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java	(working copy)
@@ -19,8 +19,8 @@
 
 import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.text.DateFormat;
@@ -29,7 +29,6 @@
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.Locale;
-import java.util.zip.GZIPInputStream;
 
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader;
@@ -54,18 +53,27 @@
  */
 public class TrecContentSource extends ContentSource {
 
+  /**
+   * Parser for trec doc content, invoked after {@code <DOC>} and the {@code <DOCNO>...</DOCNO>} line were read;
+   * responsible for reading until and including {@code </DOC>} and parsing it.
+   */
+  public interface TrecDocParser {
+    /** preparation step - read the text from input, after {@code </DOCNO>}, external synchronization assumed */
+    void readTrecDoc(TrecContentSource trecSrc, StringBuilder docBuf) throws IOException, NoMoreDataException;
+
+    /** finalization part - parse the prepared text and create the result DocData, no synchronization required */
+    DocData trecDoc(DocData docData, String name, TrecContentSource trecSrc, Reader r, HTMLParser htmlParser) throws IOException, InterruptedException;
+  }
+
   private static final class DateFormatInfo {
     DateFormat[] dfs;
     ParsePosition pos;
   }
 
-  private static final String DATE = "Date: ";
-  private static final String DOCHDR = "<DOCHDR>";
-  private static final String TERMINATING_DOCHDR = "</DOCHDR>";
-  private static final String DOCNO = "<DOCNO>";
-  private static final String TERMINATING_DOCNO = "</DOCNO>";
-  private static final String DOC = "<DOC>";
-  private static final String TERMINATING_DOC = "</DOC>";
+  public static final String DOCNO = "<DOCNO>";
+  public static final String TERMINATING_DOCNO = "</DOCNO>";
+  public static final String DOC = "<DOC>";
+  public static final String TERMINATING_DOC = "</DOC>";
 
   private static final String NEW_LINE = System.getProperty("line.separator");
 
@@ -80,10 +88,11 @@
   private ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<DateFormatInfo>();
   private ThreadLocal<StringBuilderReader> trecDocReader = new ThreadLocal<StringBuilderReader>();
   private ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
+  private ThreadLocal<TrecDocParser> trecDocParser = new ThreadLocal<TrecDocParser>();
   private File dataDir = null;
   private ArrayList<File> inputFiles = new ArrayList<File>();
   private int nextFile = 0;
-  private int rawDocSize;
+  private int[] rawDocSize = new int[1];
 
   // Use to synchronize threads on reading from the TREC documents.
   private Object lock = new Object();
@@ -94,6 +103,15 @@
   HTMLParser htmlParser;
   private boolean excludeDocnameIteration;
 
+  private TrecDocParser getTrecDocParser() { //TODO need to set by file extension and/or by config property
+    TrecDocParser tdp = trecDocParser.get();
+    if (tdp == null) {
+      tdp = new TrecGov2Parser();
+      trecDocParser.set(tdp);
+    }
+    return tdp;
+  }
+
   private DateFormatInfo getDateFormatInfo() {
     DateFormatInfo dfi = dateFormats.get();
     if (dfi == null) {
@@ -129,10 +147,21 @@
     return r;
   }
 
-  // read until finding a line that starts with the specified prefix, or a terminating tag has been found.
-  private void read(StringBuilder buf, String prefix, boolean collectMatchLine,
-                    boolean collectAll, String terminatingTag)
-      throws IOException, NoMoreDataException {
+  /**
+   * Read until finding a line that starts with one of specified prefixes,
+   * or the specified terminating tag has been found.
+   * @param buf buffer for collecting the data if so specified by collectAll
+   *        or collectMatchLine; it is emptied when the terminating tag is hit first.
+   * @param collectMatchLine whether to collect the matching line into buffer.
+   * @param collectAll whether to collect all lines into buffer.
+   * @param terminatingTag terminating tag to look for, can be null.
+   * @param prefixes prefixes to look for, can be empty but must not be null (it is iterated unguarded).
+   * @return the prefix that matched or null if the terminating tag was found first.
+   * @throws IOException if reading the underlying input fails
+   * @throws NoMoreDataException if opening the next input file reports that no data is left
+   */
+  String read(StringBuilder buf, boolean collectMatchLine, boolean collectAll,
+      String terminatingTag, String... prefixes) throws IOException, NoMoreDataException {
     String sep = "";
     while (true) {
       String line = reader.readLine();
@@ -142,14 +171,16 @@
         continue;
       }
 
-      rawDocSize += line.length();
+      rawDocSize[0] += line.length();
 
-      if (line.startsWith(prefix)) {
-        if (collectMatchLine) {
-          buf.append(sep).append(line);
-          sep = NEW_LINE;
+      for (String prefix : prefixes) {
+        if (line.startsWith(prefix)) {
+          if (collectMatchLine) {
+            buf.append(sep).append(line);
+            sep = NEW_LINE;
+          }
+          return prefix;
         }
-        break;
       }
 
       if (terminatingTag != null && line.startsWith(terminatingTag)) {
@@ -157,7 +188,7 @@
         // tag was found. set the length to 0 to signal no match was
         // found.
         buf.setLength(0);
-        break;
+        return null;
       }
 
       if (collectAll) {
@@ -169,7 +200,6 @@
 
   void openNextFile() throws NoMoreDataException, IOException {
     close();
-    int retries = 0;
     while (true) {
       if (nextFile >= inputFiles.size()) {
         // exhausted files, start a new round, unless forever set to false.
@@ -184,13 +214,12 @@
         System.out.println("opening: " + f + " length: " + f.length());
       }
       try {
-        GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), BUFFER_SIZE);
-        reader = new BufferedReader(new InputStreamReader(zis, encoding), BUFFER_SIZE);
+        InputStream inputStream = getInputStream(f); // support either gzip, bzip2, or regular text file, by extension
+        reader = new BufferedReader(new InputStreamReader(inputStream, encoding), BUFFER_SIZE);
         return;
       } catch (Exception e) {
-        retries++;
-        if (retries < 20 && verbose) {
-          System.out.println("Skipping 'bad' file " + f.getAbsolutePath() + " #retries=" + retries);
+        if (verbose) {
+          System.out.println("Skipping 'bad' file " + f.getAbsolutePath()+" due to "+e.getMessage());
           continue;
         }
         throw new NoMoreDataException();
@@ -237,8 +266,10 @@
 
   @Override
   public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
-    String dateStr = null, name = null;
+    String name = null;
     Reader r = null;
+    TrecDocParser trecDocParser = null;
+
     // protect reading from the TREC files by multiple threads. The rest of the
     // method, i.e., parsing the content and returning the DocData can run
     // unprotected.
@@ -249,58 +280,38 @@
 
       StringBuilder docBuf = getDocBuffer();
 
-      // 1. skip until doc start
+      // 1. skip until doc start - required for all TREC formats
       docBuf.setLength(0);
-      read(docBuf, DOC, false, false, null);
+      read(docBuf, false, false, null, DOC);
 
-      // 2. name
+      // 2. name - required for all TREC formats
       docBuf.setLength(0);
-      read(docBuf, DOCNO, true, false, null);
+      read(docBuf, true, false, null, DOCNO);
       name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
-          DOCNO.length()));
+          DOCNO.length())).trim();
+
       if (!excludeDocnameIteration)
         name = name + "_" + iteration;
 
-      // 3. skip until doc header
+      trecDocParser = getTrecDocParser();
       docBuf.setLength(0);
-      read(docBuf, DOCHDR, false, false, null);
-
-      boolean findTerminatingDocHdr = false;
-
-      // 4. date - look for the date only until /DOCHDR
-      docBuf.setLength(0);
-      read(docBuf, DATE, true, false, TERMINATING_DOCHDR);
-      if (docBuf.length() != 0) {
-        // Date found.
-        dateStr = docBuf.substring(DATE.length());
-        findTerminatingDocHdr = true;
-      }
-
-      // 5. skip until end of doc header
-      if (findTerminatingDocHdr) {
-        docBuf.setLength(0);
-        read(docBuf, TERMINATING_DOCHDR, false, false, null);
-      }
-
-      // 6. collect until end of doc
-      docBuf.setLength(0);
-      read(docBuf, TERMINATING_DOC, false, true, null);
+      trecDocParser.readTrecDoc(this, docBuf);
 
-      // 7. Set up a Reader over the read content
+      // Set up a Reader over the read content
       r = getTrecDocReader(docBuf);
+
       // Resetting the thread's reader means it will reuse the instance
       // allocated as well as re-read from docBuf.
       r.reset();
-      
+
       // count char length of parsed html text (larger than the plain doc body text).
       addBytes(docBuf.length());
     }
 
     // This code segment relies on HtmlParser being thread safe. When we get
     // here, everything else is already private to that thread, so we're safe.
-    Date date = dateStr != null ? parseDate(dateStr) : null;
     try {
-      docData = htmlParser.parse(docData, name, date, r, null);
+      docData = trecDocParser.trecDoc(docData, name, this, r, htmlParser);
       addDoc();
     } catch (InterruptedException ie) {
       throw new ThreadInterruptedException(ie);
Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java	(revision 1058759)
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java	(working copy)
@@ -29,16 +29,18 @@
 
   /**
    * Parse the input Reader and return DocData.
-   * A provided name or date is used for the result, otherwise an attempt is
-   * made to set them from the parsed data.
+   * The provided name,title,date are used for the result, unless when they're null,
+   * in which case an attempt is made to set them from the parsed data.
+   * @param docData result reused
+   * @param name name of the result doc data.
+   * @param date date of the result doc data. If null, attempt to set by parsed data.
+   * @param title title of the result doc data. If null, attempt to set by parsed data.
+   * @param reader reader of html text to parse.
    * @param dateFormat date formatter to use for extracting the date.
-   * @param name name of the result doc data. If null, attempt to set by parsed data.
-   * @param date date of the result doc data. If null, attempt to set by parsed data.
-   * @param reader of html text to parse.
    * @return Parsed doc data.
    * @throws IOException
    * @throws InterruptedException
    */
-  public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
+  public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
 
 }
Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java	(revision 1058759)
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java	(working copy)
@@ -29,11 +29,14 @@
  */
 public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
 
-  public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
+  public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
     org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader);
 
     // title
-    String title = p.getTitle();
+    if (title==null) {
+      title = p.getTitle();
+    }
+
     // properties
     Properties props = p.getMetaTags();
     // body