Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java (revision 561541) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java (working copy) @@ -20,6 +20,7 @@ import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.XMLReader; +import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; @@ -28,24 +29,23 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileWriter; +import java.io.BufferedWriter; import java.io.IOException; /** - * Extract the downloaded Wikipedia dump into separate files for indexing. + * Extract the downloaded Wikipedia dump into a single file for + * indexing. Each wikipedia document becomes one line in the result + * file. */ public class ExtractWikipedia { private File wikipedia; - private File outputDir; + private BufferedWriter output; - public ExtractWikipedia(File wikipedia, File outputDir) { + public ExtractWikipedia(File wikipedia, File output) + throws IOException { this.wikipedia = wikipedia; - this.outputDir = outputDir; - System.out.println("Deleting all files in " + outputDir); - File [] files = outputDir.listFiles(); - for (int i = 0; i < files.length; i++) { - files[i].delete(); - } + this.output = new BufferedWriter( new FileWriter( output ) ); } static public int count = 0; @@ -55,8 +55,7 @@ public class Parser extends DefaultHandler { - public Parser() { - } + public Parser() {} StringBuffer contents = new StringBuffer(); @@ -91,45 +90,22 @@ } } - public File directory (int count, File directory) { - if (directory == null) { - directory = outputDir; - } - int base = BASE; - while (base <= count) { - base *= BASE; - } - if (count < BASE) { - return directory; - } - directory = new File (directory, (Integer.toString(base / BASE))); - directory = new File (directory, (Integer.toString(count / (base / BASE)))); - return directory(count % (base / BASE), directory); - } + public void create(String id, String title, String time, String body) + throws IOException { - public void create(String id, String title, String time, String body) { - - File d = directory(count++, null); - d.mkdirs(); - File f = new File(d, id + ".txt"); - - StringBuffer contents = new StringBuffer(); + id = id.replace('\t', ' '); + title = title.replace('\t', ' '); + time = time.replace('\t', ' '); + body = body.replaceAll("[\t\n]", " "); - contents.append(time); - contents.append("\n\n"); - contents.append(title); - contents.append("\n\n"); - contents.append(body); - contents.append("\n"); + output.write(title); + output.write('\t'); + output.write(time); + output.write('\t'); + output.write(body); + output.newLine(); + output.flush(); - try { - FileWriter writer = new FileWriter(f); - writer.write(contents.toString()); - writer.close(); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - } String time(String original) { @@ -147,7 +123,8 @@ return buffer.toString(); } - public void endElement(String namespace, String simple, String qualified) { + public void endElement(String namespace, String simple, String qualified) + throws SAXException { if (qualified.equals("title")) { title = contents.toString(); } else if (qualified.equals("text")) { @@ -162,7 +139,11 @@ id = contents.toString(); } else if (qualified.equals("page")) { if (body != null) { - create(id, title, time, body); + try { + create(id, title, time, body); + } catch ( IOException ioe ) { + throw new SAXException( ioe ); + } } } } @@ -187,7 +168,8 @@ } } - public static void main(String[] args) { + public static void main(String[] args) + throws IOException { if (args.length != 2) { printUsage(); } @@ -195,9 +177,8 @@ File wikipedia = new File(args[0]); if (wikipedia.exists()) { - File outputDir = new File(args[1]); - outputDir.mkdirs(); - ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir); + File output = new File(args[1]); + ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, output); extractor.extract(); } else { printUsage(); Index: contrib/benchmark/build.xml =================================================================== --- contrib/benchmark/build.xml (revision 561541) +++ contrib/benchmark/build.xml (working copy) @@ -23,7 +23,7 @@ - + @@ -44,10 +44,9 @@ - - +