Index: contrib/benchmark/conf/wikipedia.alg =================================================================== --- contrib/benchmark/conf/wikipedia.alg (revision 561541) +++ contrib/benchmark/conf/wikipedia.alg (working copy) @@ -15,51 +15,30 @@ # * limitations under the License. # */ # ------------------------------------------------------------------------------------- -# multi val params are iterated by NewRound's, added to reports, start with column name. + # -# based on micro-standard +# This alg will process the Wikipedia documents feed to produce a +# single file that contains all documents, one per line. # -# modified to use wikipedia sources and index entire docs -# currently just used to measure ingest rate +# To use this, first cd to contrib/benchmark and then run: +# +# ant run-task -Dtask.alg=conf/wikipedia.alg +# +# Then, to index the documents in the line file, see +# indexLineFile.alg. +# -merge.factor=mrg:10:100:10:100 -max.field.length=2147483647 -max.buffered=buf:10:10:100:100 -compound=true +# Where to get documents from: +doc.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker -analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer -directory=FSDirectory +# Where to write the line file output: +docs.file=temp/enwiki-20070527-pages-articles.xml +line.file.out=work/enwiki.txt -doc.stored=true -doc.tokenized=true -doc.term.vector=false -doc.add.log.step=500 +# Stop after processing the document feed once: +doc.maker.forever=false -docs.dir=enwiki - -doc.maker=org.apache.lucene.benchmark.byTask.feeds.DirDocMaker - -query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker - -# task at this depth or less would print when they start -task.max.depth.log=2 - -log.queries=false # ------------------------------------------------------------------------------------- -{ "Rounds" - - ResetSystemErase - - { "Populate" - CreateIndex - { "MAddDocs" AddDoc > : 200000 - CloseIndex - } - - NewRound - -} : 8 - -RepSumByName 
-RepSumByPrefRound MAddDocs +# Process all documents, appending each one to the line file: +{WriteLineDoc()}: 1 Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java (revision 561541) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java (working copy) @@ -1,211 +0,0 @@ -package org.apache.lucene.benchmark.utils; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.xml.sax.Attributes; -import org.xml.sax.InputSource; -import org.xml.sax.XMLReader; -import org.xml.sax.helpers.DefaultHandler; -import org.xml.sax.helpers.XMLReaderFactory; - -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileWriter; -import java.io.IOException; - -/** - * Extract the downloaded Wikipedia dump into separate files for indexing. 
- */ -public class ExtractWikipedia { - - private File wikipedia; - private File outputDir; - - public ExtractWikipedia(File wikipedia, File outputDir) { - this.wikipedia = wikipedia; - this.outputDir = outputDir; - System.out.println("Deleting all files in " + outputDir); - File [] files = outputDir.listFiles(); - for (int i = 0; i < files.length; i++) { - files[i].delete(); - } - } - - static public int count = 0; - static String[] months = {"JAN", "FEB", "MAR", "APR", - "MAY", "JUN", "JUL", "AUG", - "SEP", "OCT", "NOV", "DEC"}; - - public class Parser extends DefaultHandler { - - public Parser() { - } - - StringBuffer contents = new StringBuffer(); - - public void characters(char[] ch, int start, int length) { - contents.append(ch, start, length); - } - - String title; - String id; - String body; - String time; - - static final int BASE = 10; - - public void startElement(String namespace, - String simple, - String qualified, - Attributes attributes) { - if (qualified.equals("page")) { - title = null; - id = null; - body = null; - time = null; - } else if (qualified.equals("text")) { - contents.setLength(0); - } else if (qualified.equals("timestamp")) { - contents.setLength(0); - } else if (qualified.equals("title")) { - contents.setLength(0); - } else if (qualified.equals("id")) { - contents.setLength(0); - } - } - - public File directory (int count, File directory) { - if (directory == null) { - directory = outputDir; - } - int base = BASE; - while (base <= count) { - base *= BASE; - } - if (count < BASE) { - return directory; - } - directory = new File (directory, (Integer.toString(base / BASE))); - directory = new File (directory, (Integer.toString(count / (base / BASE)))); - return directory(count % (base / BASE), directory); - } - - public void create(String id, String title, String time, String body) { - - File d = directory(count++, null); - d.mkdirs(); - File f = new File(d, id + ".txt"); - - StringBuffer contents = new StringBuffer(); - - 
contents.append(time); - contents.append("\n\n"); - contents.append(title); - contents.append("\n\n"); - contents.append(body); - contents.append("\n"); - - try { - FileWriter writer = new FileWriter(f); - writer.write(contents.toString()); - writer.close(); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - - } - - String time(String original) { - StringBuffer buffer = new StringBuffer(); - - buffer.append(original.substring(8, 10)); - buffer.append('-'); - buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]); - buffer.append('-'); - buffer.append(original.substring(0, 4)); - buffer.append(' '); - buffer.append(original.substring(11, 19)); - buffer.append(".000"); - - return buffer.toString(); - } - - public void endElement(String namespace, String simple, String qualified) { - if (qualified.equals("title")) { - title = contents.toString(); - } else if (qualified.equals("text")) { - body = contents.toString(); - if (body.startsWith("#REDIRECT") || - body.startsWith("#redirect")) { - body = null; - } - } else if (qualified.equals("timestamp")) { - time = time(contents.toString()); - } else if (qualified.equals("id") && id == null) { - id = contents.toString(); - } else if (qualified.equals("page")) { - if (body != null) { - create(id, title, time, body); - } - } - } - } - - public void extract() { - - try { - Parser parser = new Parser(); - if (false) { - SAXParser sp = SAXParserFactory.newInstance().newSAXParser(); - sp.parse(new FileInputStream(wikipedia), parser); - } else { - XMLReader reader = - XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser"); - reader.setContentHandler(parser); - reader.setErrorHandler(parser); - reader.parse(new InputSource(new FileInputStream(wikipedia))); - } - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - public static void main(String[] args) { - if (args.length != 2) { - printUsage(); - } - - File wikipedia = new File(args[0]); - - if 
(wikipedia.exists()) { - File outputDir = new File(args[1]); - outputDir.mkdirs(); - ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir); - extractor.extract(); - } else { - printUsage(); - } - } - - private static void printUsage() { - System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia "); - } - -} Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (revision 561541) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (working copy) @@ -18,7 +18,8 @@ */ import java.io.BufferedWriter; -import java.io.FileWriter; +import java.io.FileOutputStream; +import java.io.OutputStreamWriter; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; @@ -59,7 +60,7 @@ String fileName = config.get("line.file.out", null); if (fileName == null) throw new Exception("line.file.out must be set"); - lineFileOut = new BufferedWriter(new FileWriter(fileName)); + lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8")); } docMaker = getRunData().getDocMaker(); } Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (revision 0) @@ -0,0 +1,213 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.xml.sax.XMLReader; +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; +import org.xml.sax.helpers.XMLReaderFactory; + +import java.io.IOException; + +import org.apache.lucene.document.Document; + +/** + * A LineDocMaker which reads the uncompressed English Wikipedia dump. 
+ */ +public class EnwikiDocMaker extends LineDocMaker { + + static final int TITLE = 0; + static final int DATE = TITLE+1; + static final int BODY = DATE+1; + static final int LENGTH = BODY+1; + + static final String[] months = {"JAN", "FEB", "MAR", "APR", + "MAY", "JUN", "JUL", "AUG", + "SEP", "OCT", "NOV", "DEC"}; + + class Parser extends DefaultHandler implements Runnable { + + Thread t; + + public void run() { + + try { + XMLReader reader = + XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser"); + reader.setContentHandler(this); + reader.setErrorHandler(this); + while(true){ + InputSource is = new InputSource(fileIS); + reader.parse(is); + if (!forever) { + synchronized(this) { + nmde = new NoMoreDataException(); + notify(); + } + return; + } else { + synchronized(this){ + openFile(); + } + } + } + } catch (SAXException sae) { + throw new RuntimeException(sae); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + + } + + Parser() { + t = new Thread(this); + t.setDaemon(true); + t.start(); + } + + String[] tuple; + NoMoreDataException nmde; + + String[] next() throws NoMoreDataException { + String[] result; + synchronized(this){ + while(tuple == null && nmde == null){ + try { + wait(); + } catch (InterruptedException ie) { + } + } + if (nmde != null) { + throw nmde; + } + result = tuple; + tuple = null; + notify(); + } + return result; + } + + StringBuffer contents = new StringBuffer(); + + public void characters(char[] ch, int start, int length) { + contents.append(ch, start, length); + } + + String title; + String body; + String time; + + static final int BASE = 10; + + public void startElement(String namespace, + String simple, + String qualified, + Attributes attributes) { + if (qualified.equals("page")) { + title = null; + body = null; + time = null; + } else if (qualified.equals("text")) { + contents.setLength(0); + } else if (qualified.equals("timestamp")) { + contents.setLength(0); + } else if 
(qualified.equals("title")) { + contents.setLength(0); + } + } + + String time(String original) { + StringBuffer buffer = new StringBuffer(); + + buffer.append(original.substring(8, 10)); + buffer.append('-'); + buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]); + buffer.append('-'); + buffer.append(original.substring(0, 4)); + buffer.append(' '); + buffer.append(original.substring(11, 19)); + buffer.append(".000"); + + return buffer.toString(); + } + + public void create(String title, String time, String body) { + String[] t = new String[LENGTH]; + t[TITLE] = title.replace('\t', ' '); + t[DATE] = time.replace('\t', ' '); + t[BODY] = body.replaceAll("[\t\n]", " "); + synchronized(this) { + while(tuple!=null) { + try { + wait(); + } catch (InterruptedException ie) { + } + } + tuple = t; + notify(); + } + } + + public void endElement(String namespace, String simple, String qualified) + throws SAXException { + if (qualified.equals("title")) { + title = contents.toString(); + } else if (qualified.equals("text")) { + body = contents.toString(); + if (body.startsWith("#REDIRECT") || + body.startsWith("#redirect")) { + body = null; + } + } else if (qualified.equals("timestamp")) { + time = time(contents.toString()); + } else if (qualified.equals("page")) { + if (body != null) { + create(title, time, body); + } + } + } + } + + Parser parser = new Parser(); + + class DocState extends LineDocMaker.DocState { + public Document setFields(String[] tuple) { + titleField.setValue(tuple[TITLE]); + dateField.setValue(tuple[DATE]); + bodyField.setValue(tuple[BODY]); + return doc; + } + } + + private DocState getDocState() { + DocState ds = (DocState) docState.get(); + if (ds == null) { + ds = new DocState(); + docState.set(ds); + } + return ds; + } + + public Document makeDocument() throws Exception { + String[] tuple = parser.next(); + return getDocState().setFields(tuple); + } + +} \ No newline at end of file Property changes on: 
contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java ___________________________________________________________________ Name: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (revision 561541) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (working copy) @@ -25,7 +25,8 @@ import java.io.BufferedReader; import java.io.IOException; -import java.io.FileReader; +import java.io.FileInputStream; +import java.io.InputStreamReader; /** * A DocMaker reading one line at a time as a Document from @@ -39,13 +40,14 @@ */ public class LineDocMaker extends BasicDocMaker { - private BufferedReader fileIn; - private ThreadLocal docState = new ThreadLocal(); + FileInputStream fileIS; + BufferedReader fileIn; + ThreadLocal docState = new ThreadLocal(); private String fileName; private static int READER_BUFFER_BYTES = 64*1024; - - private class DocState { + + class DocState { Document doc; Field bodyField; Field titleField; @@ -63,7 +65,7 @@ storeVal, Field.Index.TOKENIZED, termVecVal); - dateField = new Field(BasicDocMaker.TITLE_FIELD, + dateField = new Field(BasicDocMaker.DATE_FIELD, "", storeVal, Field.Index.TOKENIZED, @@ -143,11 +145,12 @@ openFile(); } - private void openFile() { + void openFile() { try { if (fileIn != null) fileIn.close(); - fileIn = new BufferedReader(new FileReader(fileName), READER_BUFFER_BYTES); + fileIS = new FileInputStream(fileName); + fileIn = new BufferedReader(new InputStreamReader(fileIS,"UTF-8"), READER_BUFFER_BYTES); } catch (IOException e) { throw new RuntimeException(e); } Index: contrib/benchmark/build.xml =================================================================== --- contrib/benchmark/build.xml (revision 561541) +++ 
contrib/benchmark/build.xml (working copy) @@ -23,7 +23,7 @@ - + @@ -44,10 +44,9 @@ - - +