Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java (revision 729833) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java (working copy) @@ -31,6 +31,10 @@ import java.util.Locale; import java.util.zip.GZIPInputStream; +import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker; +import org.apache.lucene.benchmark.byTask.feeds.DocData; +import org.apache.lucene.benchmark.byTask.feeds.HTMLParser; +import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; import org.apache.lucene.benchmark.byTask.utils.Config; @@ -44,7 +48,14 @@ */ public class TrecDocMaker extends BasicDocMaker { - private static final String newline = System.getProperty("line.separator"); + private static final String DATE = "Date: "; + private static final String DOCHDR = ""; + private static final String TERM_DOCHDR = ""; + private static final String TERM_DOCNO = ""; + private static final String DOCNO = ""; + private static final String TERM_DOC = ""; + private static final String DOC = ""; + private static final String NEW_LINE = System.getProperty("line.separator"); protected ThreadLocal dateFormat = new ThreadLocal(); protected File dataDir = null; @@ -135,25 +146,37 @@ } // read until finding a line that starts with the specified prefix - protected StringBuffer read (String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws Exception { + protected StringBuffer read(String prefix, StringBuffer sb, + boolean collectMatchLine, boolean collectAll, + String terminatingTag) throws Exception { sb = (sb==null ? new StringBuffer() : sb); String sep = ""; while (true) { String line = reader.readLine(); - if (line==null) { + if (line == null) { openNextFile(); continue; } if (line.startsWith(prefix)) { if (collectMatchLine) { - sb.append(sep+line); - sep = newline; + sb.append(sep).append(line); + sep = NEW_LINE; } break; } + + if (terminatingTag != null && line.startsWith(terminatingTag)) { + // didn't find the prefix that was asked, but the terminating + // tag was found. set the length to 0 to signal no match was + // found. + sb.setLength(0); + break; + } + + if (collectAll) { - sb.append(sep+line); - sep = newline; + sb.append(sep).append(line); + sep = NEW_LINE; } } //System.out.println("read: "+sb); @@ -165,22 +188,31 @@ openNextFile(); } // 1. skip until doc start - read("",null,false,false); + read(DOC,null,false,false,null); // 2. name - StringBuffer sb = read("",null,true,false); - String name = sb.substring("".length()); - name = name.substring(0,name.indexOf(""))+"_"+iteration; + StringBuffer sb = read(DOCNO,null,true,false,null); + String name = sb.substring(DOCNO.length(), sb.indexOf(TERM_DOCNO, DOCNO.length())); + name = name + "_" + iteration; // 3. skip until doc header - read("",null,false,false); + read(DOCHDR,null,false,false,null); + boolean findTerminatingDocHdr = false; // 4. date - sb = read("Date: ",null,true,false); - String dateStr = sb.substring("Date: ".length()); + sb = read(DATE,null,true,false,TERM_DOCHDR); + String dateStr = null; + if (sb.length() != 0) { + // Date found. + dateStr = sb.substring(DATE.length()); + findTerminatingDocHdr = true; + } + // 5. skip until end of doc header - read("",null,false,false); + if (findTerminatingDocHdr) { + read(TERM_DOCHDR,null,false,false,null); + } // 6. collect until end of doc - sb = read("",null,false,true); + sb = read(TERM_DOC,null,false,true,null); // this is the next document, so parse it - Date date = parseDate(dateStr); + Date date = dateStr != null ? parseDate(dateStr) : new Date(); HTMLParser p = getHtmlParser(); DocData docData = p.parse(name, date, sb, getDateFormat(0)); addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text).