Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java (revision 561541)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java (working copy)
@@ -20,6 +20,7 @@
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
+import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
@@ -28,24 +29,23 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
+import java.io.BufferedWriter;
import java.io.IOException;
/**
- * Extract the downloaded Wikipedia dump into separate files for indexing.
+ * Extract the downloaded Wikipedia dump into a single file for
+ * indexing. Each Wikipedia document becomes one tab-separated line
+ * (title, time, body) in the result file.
*/
public class ExtractWikipedia {
private File wikipedia;
- private File outputDir;
+ private BufferedWriter output;
- public ExtractWikipedia(File wikipedia, File outputDir) {
+ public ExtractWikipedia(File wikipedia, File output)
+ throws IOException {
this.wikipedia = wikipedia;
- this.outputDir = outputDir;
- System.out.println("Deleting all files in " + outputDir);
- File [] files = outputDir.listFiles();
- for (int i = 0; i < files.length; i++) {
- files[i].delete();
- }
+ this.output = new BufferedWriter( new FileWriter( output ) );
}
static public int count = 0;
@@ -55,8 +55,7 @@
public class Parser extends DefaultHandler {
- public Parser() {
- }
+ public Parser() {}
StringBuffer contents = new StringBuffer();
@@ -91,45 +90,22 @@
}
}
- public File directory (int count, File directory) {
- if (directory == null) {
- directory = outputDir;
- }
- int base = BASE;
- while (base <= count) {
- base *= BASE;
- }
- if (count < BASE) {
- return directory;
- }
- directory = new File (directory, (Integer.toString(base / BASE)));
- directory = new File (directory, (Integer.toString(count / (base / BASE))));
- return directory(count % (base / BASE), directory);
- }
+ public void create(String id, String title, String time, String body)
+ throws IOException {
- public void create(String id, String title, String time, String body) {
-
- File d = directory(count++, null);
- d.mkdirs();
- File f = new File(d, id + ".txt");
-
- StringBuffer contents = new StringBuffer();
+ id = id.replace('\t', ' ');
+ title = title.replace('\t', ' ');
+ time = time.replace('\t', ' ');
+ body = body.replaceAll("[\t\n]", " ");
- contents.append(time);
- contents.append("\n\n");
- contents.append(title);
- contents.append("\n\n");
- contents.append(body);
- contents.append("\n");
+ output.write(title);
+ output.write('\t');
+ output.write(time);
+ output.write('\t');
+ output.write(body);
+ output.newLine();
+ output.flush();
- try {
- FileWriter writer = new FileWriter(f);
- writer.write(contents.toString());
- writer.close();
- } catch (IOException ioe) {
- throw new RuntimeException(ioe);
- }
-
}
String time(String original) {
@@ -147,7 +123,8 @@
return buffer.toString();
}
- public void endElement(String namespace, String simple, String qualified) {
+ public void endElement(String namespace, String simple, String qualified)
+ throws SAXException {
if (qualified.equals("title")) {
title = contents.toString();
} else if (qualified.equals("text")) {
@@ -162,7 +139,11 @@
id = contents.toString();
} else if (qualified.equals("page")) {
if (body != null) {
- create(id, title, time, body);
+ try {
+ create(id, title, time, body);
+ } catch ( IOException ioe ) {
+ throw new SAXException( ioe );
+ }
}
}
}
@@ -187,7 +168,8 @@
}
}
- public static void main(String[] args) {
+ public static void main(String[] args)
+ throws IOException {
if (args.length != 2) {
printUsage();
}
@@ -195,9 +177,8 @@
File wikipedia = new File(args[0]);
if (wikipedia.exists()) {
- File outputDir = new File(args[1]);
- outputDir.mkdirs();
- ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir);
+ File output = new File(args[1]);
+ ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, output);
extractor.extract();
} else {
printUsage();
Index: contrib/benchmark/build.xml
===================================================================
--- contrib/benchmark/build.xml (revision 561541)
+++ contrib/benchmark/build.xml (working copy)
@@ -23,7 +23,7 @@
-
+
@@ -44,10 +44,9 @@
-
-
+