Index: contrib/benchmark/.rsync-filter =================================================================== --- contrib/benchmark/.rsync-filter (revision 0) +++ contrib/benchmark/.rsync-filter (revision 0) @@ -0,0 +1,2 @@ +- /work +- /temp Index: contrib/benchmark/conf/wikipedia.alg =================================================================== --- contrib/benchmark/conf/wikipedia.alg (revision 0) +++ contrib/benchmark/conf/wikipedia.alg (revision 0) @@ -0,0 +1,65 @@ +#/** +# * Licensed to the Apache Software Foundation (ASF) under one or more +# * contributor license agreements. See the NOTICE file distributed with +# * this work for additional information regarding copyright ownership. +# * The ASF licenses this file to You under the Apache License, Version 2.0 +# * (the "License"); you may not use this file except in compliance with +# * the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ +# ------------------------------------------------------------------------------------- +# multi val params are iterated by NewRound's, added to reports, start with column name. 
+# +# based on micro-standard +# +# modified to use wikipedia sources and index entire docs +# currently just used to measure ingest rate + +merge.factor=mrg:10:100:10:100 +max.field.length=2147483647 +max.buffered=buf:10:10:100:100 +compound=true + +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer +directory=FSDirectory + +doc.stored=true +doc.tokenized=true +doc.term.vector=false +doc.add.log.step=500 + +docs.dir=enwiki + +doc.maker=org.apache.lucene.benchmark.byTask.feeds.DirDocMaker + +query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker + +# task at this depth or less would print when they start +task.max.depth.log=2 + +log.queries=false +# ------------------------------------------------------------------------------------- + +{ "Rounds" + + ResetSystemErase + + { "Populate" + CreateIndex + { "MAddDocs" AddDoc > : 200000 + CloseIndex + } + + NewRound + +} : 8 + +RepSumByName +RepSumByPrefRound MAddDocs Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java (revision 0) @@ -0,0 +1,216 @@ +package org.apache.lucene.benchmark.utils; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; + +import org.xml.sax.helpers.DefaultHandler; +import org.xml.sax.Attributes; +import javax.xml.parsers.SAXParserFactory; +import javax.xml.parsers.SAXParser; +import java.io.FileInputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.lang.RuntimeException; + +public class ExtractWikipedia { + + private File wikipedia; + private File outputDir; + + public ExtractWikipedia(File wikipedia, File outputDir) { + this.wikipedia = wikipedia; + this.outputDir = outputDir; + System.out.println("Deleting all files in " + outputDir); + File [] files = outputDir.listFiles(); + for (int i = 0; i < files.length; i++) { + files[i].delete(); + } + } + + static public int count = 0; + static String[] months = { "JAN", "FEB", "MAR", "APR", + "MAY", "JUN", "JUL", "AUG", + "SEP", "OCT", "NOV", "DEC" }; + + public class Parser extends DefaultHandler { + + public Parser() { + } + + StringBuffer contents = new StringBuffer(); + + public void characters( char[] ch, int start, int length ) { + contents.append( ch, start, length ); + } + + String title; + String id; + String body; + String time; + + static final int BASE = 10; + + public void startElement( String namespace, + String simple, + String qualified, + Attributes attributes ) { + + if ( qualified.equals( "page" ) ) { + title = null; + id = null; + body = null; + time = null; + } else if ( qualified.equals( "text" ) ) { + contents.setLength( 0 ); + } else if ( qualified.equals( "timestamp" ) ) { + contents.setLength( 0 ); + } else if ( qualified.equals( "title" ) 
) { + contents.setLength( 0 ); + } else if ( qualified.equals( "id" ) ) { + contents.setLength( 0 ); + } + + } + + public File directory ( int count, File directory ) { + + if ( directory == null ) { + directory = outputDir; + } + + int base = BASE; + while ( base <= count ) { + base *= BASE; + } + + if ( count < BASE ) { + return directory; + } + + directory = new File ( directory, (Integer.toString( base / BASE ) ) ); + directory = new File ( directory, (Integer.toString( count / ( base / BASE ) ) ) ); + + return directory( count % (base/ BASE), directory ); + + } + + public void create( String id, String title, String time, String body ) { + + File d = directory( count++, null ); + d.mkdirs(); + File f = new File( d, id + ".txt" ); + + StringBuffer contents = new StringBuffer(); + + contents.append( time ); + contents.append( "\n\n" ); + contents.append( title ); + contents.append( "\n\n" ); + contents.append( body ); + contents.append( "\n" ); + + try { + FileWriter writer = new FileWriter( f ); + writer.write( contents.toString() ); + writer.close(); + } catch ( IOException ioe ) { + throw new RuntimeException( ioe ); + } + + } + + String time( String original ) { + StringBuffer buffer = new StringBuffer(); + + buffer.append( original.substring( 8, 10 ) ); + buffer.append( '-' ); + buffer.append( months[ Integer.valueOf( original.substring( 5, 7 ) ).intValue() - 1 ] ); + buffer.append( '-' ); + buffer.append( original.substring( 0, 4 ) ); + buffer.append( ' ' ); + buffer.append( original.substring( 11, 19 ) ); + buffer.append( ".000" ); + + return buffer.toString(); + } + + public void endElement( String namespace, String simple, String qualified ) { + + if ( qualified.equals( "title" ) ) { + title = contents.toString(); + } else if ( qualified.equals( "text" )) { + body = contents.toString(); + if ( body.startsWith( "#REDIRECT" ) || + body.startsWith( "#redirect" ) ) { + body = null; + } + } else if ( qualified.equals( "timestamp" ) ) { + time = time( 
contents.toString() ); + } else if ( qualified.equals( "id" ) && id == null ) { + id = contents.toString(); + } else if ( qualified.equals( "page" ) ) { + if ( body != null ) { + create( id, title, time, body ); + } + } + + } + + } + + public void extract() { + + try { + + Parser parser = new Parser(); + SAXParser sp = SAXParserFactory.newInstance().newSAXParser(); + + // use the parser built above instead of allocating a second, identical one + sp.parse( new FileInputStream( wikipedia ), parser ); + + } catch ( Exception e ) { + throw new RuntimeException( e ); + } + + } + + public static void main(String[] args) { + + if (args.length != 2) { + printUsage(); return; // exit: without this, args[0] below would throw ArrayIndexOutOfBoundsException + } + + File wikipedia = new File(args[0]); + + if (wikipedia.exists()) { + File outputDir = new File(args[1]); + outputDir.mkdirs(); + ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, + outputDir); + extractor.extract(); + } else { + printUsage(); + } + + } + + private static void printUsage() { + System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia <wikipedia dump file> <output dir>"); + } + +} \ No newline at end of file Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java ___________________________________________________________________ Name: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java (revision 529397) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java (working copy) @@ -29,11 +29,14 @@ /** * Open an index writer. *
Other side effects: index writer object in perfRunData is set. - *
Relevant properties: merge.factor , max.buffered. + *
Relevant properties: merge.factor, max.buffered, + * max.field.length. + * */ public class OpenIndexTask extends PerfTask { public static final int DEFAULT_MAX_BUFFERED = 10; + public static final int DEFAULT_MAX_FIELD_LENGTH = 10000; public static final int DEFAULT_MERGE_PFACTOR = 10; public OpenIndexTask(PerfRunData runData) { @@ -50,9 +53,11 @@ boolean cmpnd = config.get("compound",true); int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR); int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED); + int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH); // must update params for newly opened writer writer.setMaxBufferedDocs(mxbf); + writer.setMaxFieldLength(mxfl); writer.setMergeFactor(mrgf); writer.setUseCompoundFile(cmpnd); // this one redundant? Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java (revision 529397) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java (working copy) @@ -29,7 +29,8 @@ /** * Create an index. *
Other side effects: index writer object in perfRunData is set. - *
Relevant properties: merge.factor , max.buffered. + *
Relevant properties: merge.factor, max.buffered, + * max.field.length. */ public class CreateIndexTask extends PerfTask { @@ -48,10 +49,12 @@ boolean cmpnd = config.get("compound",true); int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR); int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED); + int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH); iw.setUseCompoundFile(cmpnd); iw.setMergeFactor(mrgf); iw.setMaxBufferedDocs(mxbf); + iw.setMaxFieldLength(mxfl); getRunData().setIndexWriter(iw); return 1; Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java (revision 0) @@ -0,0 +1,225 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.benchmark.byTask.utils.Config; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.FileFilter; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.Locale; +import java.util.Iterator; +import java.util.Stack; +import java.util.Comparator; +import java.util.Arrays; + +/** + * A DocMaker using the Dir collection for its input. + * + * Config properties: + * docs.dir=<path to the docs dir| Default: dir-out> + + * + */ +public class DirDocMaker extends BasicDocMaker { + + private DateFormat dateFormat; + private File dataDir = null; + private int iteration=0; + + static public class Iterator implements java.util.Iterator { + + int count = 0; + + public int getCount(){ + return count; + } + + Stack stack = new Stack(); + + /* this seems silly ... there must be a better way ... not that this is good, but can it matter? */ + + static class Comparator implements java.util.Comparator { + + public int compare( Object _a, Object _b ) { + String a = _a.toString(); + String b = _b.toString(); + + int diff = a.length() - b.length(); + + if ( diff > 0 ) { + while ( diff-- > 0 ) { + b = "0" + b; + } + } else if ( diff < 0 ) { + diff = -diff; + while ( diff-- > 0 ) { + a = "0" + a; + } + } + + /* note it's reversed because we're going to push, which reverses again */ + return b.compareTo( a ); + } + + } + + Comparator c = new Comparator(); + + void push( File[] files ) { + Arrays.sort( files, c ); + for( int i = 0; i < files.length; i++ ) { + // System.err.println( "push " + files[i] ); + stack.push( files[i] ); + } + } + + void push( File f ) { + + push( f.listFiles( new FileFilter() { + public boolean accept( File f ) { return f.isDirectory(); } } ) ); + + + push( f.listFiles( new FileFilter() { + public boolean accept( File f ) { return f.getName().endsWith(".txt"); } } ) ); + + find(); + + } + + void find() { + + if 
( stack.empty() ) { + return; + } + + if ( !((File)stack.peek()).isDirectory() ) { + return; + } + + File f = (File)stack.pop(); + + push( f ); + + } + + public Iterator( File f ) { + push( f ); + } + + public void remove() { + throw new RuntimeException( "cannot" ); + } + + public boolean hasNext() { + return stack.size() > 0; + } + + public Object next() { + assert hasNext(); + count++; + Object object = stack.pop(); + // System.err.println( "pop " + object ); + find(); + return object; + } + + } + + private Iterator inputFiles = null; + + /* (non-Javadoc) + * @see SimpleDocMaker#setConfig(java.util.Properties) + */ + public void setConfig(Config config) { + super.setConfig(config); + String d = config.get("docs.dir","dir-out"); + dataDir = new File(new File("work"),d); + + inputFiles = new Iterator( dataDir ); + + if (!inputFiles.hasNext()) { // was "inputFiles==null": dead check right after "new", could never fire; the message shows the intent is "no .txt files found" + throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath()); + } + // date format: 30-MAR-1987 14:22:36.87 + dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss",Locale.US); + dateFormat.setLenient(true); + } + + protected DocData getNextDocData() throws Exception { + File f = null; + String name = null; + synchronized (this) { + if (!inputFiles.hasNext()) { + // exhausted files, start a new round, unless forever set to false. 
+ if (!forever) { + throw new NoMoreDataException(); + } + inputFiles = new Iterator( dataDir ); + iteration++; + } + f = (File) inputFiles.next(); + // System.err.println( f ); + name = f.getCanonicalPath()+"_"+iteration; + } + + BufferedReader reader = new BufferedReader(new FileReader(f)); + String line = null; + //First line is the date, 3rd is the title, rest is body + String dateStr = reader.readLine(); + reader.readLine();//skip an empty line + String title = reader.readLine(); + reader.readLine();//skip an empty line + StringBuffer bodyBuf = new StringBuffer(1024); + while ((line = reader.readLine()) != null) { + bodyBuf.append(line).append(' '); + } + reader.close(); + + addBytes(f.length()); + + + Date date = dateFormat.parse(dateStr.trim()); + return new DocData(name, bodyBuf.toString(), title, null, date); + } + + + /* + * (non-Javadoc) + * @see DocMaker#resetIinputs() + */ + public synchronized void resetInputs() { + super.resetInputs(); + inputFiles = new Iterator( dataDir ); + iteration = 0; + } + + /* + * (non-Javadoc) + * @see DocMaker#numUniqueTexts() + */ + public int numUniqueTexts() { + return inputFiles.getCount(); + } + +} Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java ___________________________________________________________________ Name: svn:eol-style + native Index: contrib/benchmark/build.xml =================================================================== --- contrib/benchmark/build.xml (revision 529397) +++ contrib/benchmark/build.xml (working copy) @@ -22,8 +22,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -126,6 +154,17 @@ + + Working Directory: ${working.dir} + + + + + + + + +