Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoNamesDocMaker.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoNamesDocMaker.java (revision )
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoNamesDocMaker.java (revision )
@@ -0,0 +1,192 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.benchmark.byTask.utils.Format;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.util.IOUtils;
+import org.apache.solr.core.SolrConfig;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.schema.SchemaField;
+
+import java.io.ByteArrayInputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.BitSet;
+import java.util.Random;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Benchmark {@link DocMaker} that builds documents holding one or more geonames.org places,
+ * each indexed through the Solr schema field named by <code>doc.geo.schemaField</code>.
+ * Places come from the shared {@link GeoPerfData}; the random generator is seeded with 0,
+ * so every run produces the same sequence.
+ *
+ * @author David Smiley - dsmiley@mitre.org
+ */
+public class GeoNamesDocMaker extends DocMaker {
+
+  private GeoPerfData geoPerfData;
+
+  private Random random;
+
+  private SchemaField geoSchemaField;
+  private int avgPlacesPerDoc;
+
+  private BitSet usedPlaces;
+
+  private int documentCounter = 0;
+  private int printNum = 0;
+  private int docsToGenerate;
+  private boolean oneDocPerPlace;
+  private boolean useLinearScanAlgorithm;
+
+  /**
+   * Reads the <code>doc.geo.*</code> settings and resets the random generator, the
+   * used-places bitmap and the document counter.
+   */
+  @Override
+  public synchronized void setConfig(Config config) {
+    this.config = config;
+    try {
+      if (geoPerfData == null)
+        geoPerfData = GeoPerfData.initialize(config);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+
+    geoSchemaField = loadGeoSchemaField(config);
+
+    docsToGenerate = config.get("doc.geo.docsToGenerate",geoPerfData.getTotalPlaces());
+    avgPlacesPerDoc = config.get("doc.geo.avgPlacesPerDoc", 10);
+    if (avgPlacesPerDoc < 1)
+      throw new IllegalArgumentException("doc.geo.avgPlacesPerDoc must be at least 1.");
+    oneDocPerPlace = config.get("doc.geo.oneDocPerPlace",false);
+    useLinearScanAlgorithm = false;
+    if (oneDocPerPlace) {
+      if (avgPlacesPerDoc != 1)
+        throw new IllegalArgumentException("If oneDocPerPlace, then avgPlacesPerDoc must be one.");
+      if (docsToGenerate > geoPerfData.getTotalPlaces())
+        throw new IllegalStateException("Asked for more documents than there are places.");
+      if (docsToGenerate == geoPerfData.getTotalPlaces())
+        useLinearScanAlgorithm = true;
+    }
+    random = new Random(0);
+    usedPlaces = new BitSet(geoPerfData.getTotalPlaces());
+    documentCounter = 0;
+  }
+
+  @Override
+  public void resetInputs() throws IOException {
+//    super.resetInputs();
+    printDocStatistics();
+    // re-initiate since properties by round may have changed.
+    setConfig(config);
+//    source.resetInputs();
+//    numDocsCreated.set(0);
+//    resetLeftovers();
+  }
+
+  @Override
+  public void close() throws IOException {
+//    super.close();
+  }
+
+  private static final Logger solrLogger = Logger.getLogger("org.apache.solr");//hard reference
+  static {
+    solrLogger.setLevel(Level.SEVERE);
+  }
+
+  /**
+   * Loads <code>conf/solrconf/schema.xml</code> and returns the field named by the
+   * <code>doc.geo.schemaField</code> property.
+   *
+   * @throws RuntimeException if the property is missing or the schema cannot be loaded
+   */
+  static SchemaField loadGeoSchemaField(Config config) {
+    String geoFieldName = config.get("doc.geo.schemaField",null);
+    if (geoFieldName == null)
+      throw new RuntimeException("doc.geo.schemaField is required");
+    //(this code is the poster child for why Exception handling in Java leaves a lot to be desired)
+    FileInputStream schemaStream = null;
+    try {
+      try {
+        //System.setProperty("solr.solr.home","conf");
+        //NOTE(review): the XML tags were missing from this literal; markup reconstructed - confirm
+        String solrConfigText = "<config>" +
+            "<luceneMatchVersion>LUCENE_CURRENT</luceneMatchVersion>" +
+            "</config>";
+        InputStream solrConfigStream = new ByteArrayInputStream(solrConfigText.getBytes("UTF8"));
+        final String solrHome = "conf/solrconf/";
+        SolrConfig solrConfig = new SolrConfig(solrHome, null, solrConfigStream);
+        schemaStream = new FileInputStream(solrHome+"schema.xml");
+        IndexSchema schema = new IndexSchema(solrConfig,null,schemaStream);
+        return schema.getField(geoFieldName);//does not return null
+      } finally {
+        IOUtils.closeSafely(null, schemaStream);
+      }
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void printDocStatistics() {
+    /** code modified from superclass */
+    boolean print = false;
+    String col = " ";
+    StringBuilder sb = new StringBuilder();
+    String newline = System.getProperty("line.separator");
+    sb.append("------------> ").append(getClass().getSimpleName()).append(" statistics (").append(printNum).append("): ").append(newline);
+    if (documentCounter > 0) {
+      print = true;
+      sb.append("Unique places used: ").append(usedPlaces.cardinality()).append(newline);
+      sb.append("num docs added since last inputs reset: ").append(Format.format(0, documentCounter, col)).append(newline);
+    }
+    if (print) {
+      System.out.println(sb.append(newline).toString());
+      printNum++;
+    }
+  }
+
+  /**
+   * Builds the next document. Synchronized because the counter, the random generator and
+   * the used-places bitmap are shared mutable state.
+   *
+   * @throws NoMoreDataException once <code>doc.geo.docsToGenerate</code> documents were made
+   */
+  @Override
+  public synchronized Document makeDocument() throws Exception {
+    if (documentCounter >= docsToGenerate) {
+      throw new NoMoreDataException();
+    }
+
+    Document doc = new Document();
+    //(don't bother adding an ID; we don't use it.)
+    //1 .. 2*avg-1 places: the mean really is avgPlacesPerDoc and no document is empty
+    int places = avgPlacesPerDoc == 1 ? 1 : 1 + random.nextInt(avgPlacesPerDoc * 2 - 1);
+    for(int i = 0; i < places; i++) {
+      GeoPerfData.Place p;
+      if (useLinearScanAlgorithm) {
+        p = geoPerfData.nextPlace(documentCounter);
+        usedPlaces.set(documentCounter);//keeps the "Unique places used" statistic accurate
+      } else {
+        p = geoPerfData.nextRandomPlace(random, usedPlaces, oneDocPerPlace);
+      }
+      Fieldable[] fieldValues = geoSchemaField.createFields(p.lat+","+p.lon,1f);
+      for (Fieldable fieldValue : fieldValues) {
+        doc.add(fieldValue);
+      }
+    }
+    documentCounter++;
+    return doc;
+  }
+
+  @Override
+  public Document makeDocument(int size) throws Exception {
+    throw new UnsupportedOperationException();
+  }
+
+}
Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoQueryMaker.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoQueryMaker.java (revision )
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoQueryMaker.java (revision )
@@ -0,0 +1,69 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.spatial.DistanceUtils;
+import org.apache.lucene.spatial.geometry.DistanceUnits;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.SpatialQueryable;
+import org.apache.solr.search.SpatialOptions;
+
+import java.util.Random;
+
+/**
+ * {@link QueryMaker} producing bounding-box spatial queries of radius
+ * <code>query.geo.radiuskm</code> kilometers (default 100) around a random place taken
+ * from the {@link GeoPerfData} singleton, which must already be initialized.
+ *
+ * @author David Smiley dsmiley@mitre.org
+ */
+public class GeoQueryMaker implements QueryMaker {
+
+  private GeoPerfData geoPerfData;
+  private SchemaField geoSchemaField;
+  private Config config;
+  private Random random;
+  private int radiusKm;
+
+  @Override
+  public void setConfig(Config config) {
+    this.config = config;
+    random = new Random(0);
+    geoPerfData = GeoPerfData.getInstance();
+    geoSchemaField = GeoNamesDocMaker.loadGeoSchemaField(config);
+    radiusKm = config.get("query.geo.radiuskm",100);
+  }
+
+  @Override
+  public void resetInputs() {
+    setConfig(config);
+  }
+
+  @Override
+  public String printQueries() {
+    return "(randomly generated queries)";
+  }
+
+  @Override
+  public Query makeQuery(int size) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public Query makeQuery() {
+    GeoPerfData.Place place = geoPerfData.nextRandomPlace(random, null, false);
+
+    //int km = (int) StrictMath.pow(10, qc % 5 );//radius up to 4 zeros (10,000)
+
+    SpatialOptions options = new SpatialOptions();
+    options.pointStr = place.lat+","+place.lon;
+    options.distance = radiusKm;
+    options.units = DistanceUnits.KILOMETERS;
+    options.radius = DistanceUtils.EARTH_MEAN_RADIUS_KM;
+    options.bbox = true;
+    options.field = geoSchemaField;
+
+    return ((SpatialQueryable) geoSchemaField.getType()).createSpatialQuery(null,options);
+  }
+
+}
Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoNamesContentSource.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoNamesContentSource.java (revision )
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoNamesContentSource.java (revision )
@@ -0,0 +1,136 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
+import java.io.*;
+import java.util.Properties;
+
+/**
+ * {@link ContentSource} over the tab-separated geonames.org dump named by the
+ * <code>docs.file</code> property; every line becomes one {@link GeoNameDocData}.
+ *
+ * @author David Smiley - dsmiley@mitre.org
+ */
+public class GeoNamesContentSource extends ContentSource {
+  private File file;
+
+  private BufferedReader reader; //must synchronize
+
+  @Override
+  public synchronized void close() throws IOException {
+    if (reader != null) {
+      reader.close();
+      reader = null;
+    }
+  }
+
+  /** Extend DocData to put GeoName data here. */
+  public static class GeoNameDocData extends DocData {
+    String
+    //geonameid       , // integer id of record in geonames database
+    //name            , // name of geographical point (utf8) varchar(200)
+    asciiname         , // name of geographical point in plain ascii characters, varchar(200)
+    alternatenames    , // alternatenames, comma separated varchar(5000)
+    latitude          , // latitude in decimal degrees (wgs84)
+    longitude         , // longitude in decimal degrees (wgs84)
+    feature_class     , // see http://www.geonames.org/export/codes.html, char(1)
+    feature_code      , // see http://www.geonames.org/export/codes.html, varchar(10)
+    country_code      , // ISO-3166 2-letter country code, 2 characters
+    cc2               , // alternate country codes, comma separated, ISO-3166 2-letter country code, 60 characters
+    admin1_code       , // fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)
+    admin2_code       , // code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80)
+    admin3_code       , // code for third level administrative division, varchar(20)
+    admin4_code       , // code for fourth level administrative division, varchar(20)
+    population        , // bigint (8 byte int)
+    elevation         , // in meters, integer
+    gtopo30           , // average elevation of 30'x30' (ca 900mx900m) area in meters, integer
+    timezone          , // the timezone id (see file timeZone.txt)
+    modification_date ; // date of last modification in yyyy-MM-dd format
+  }
+
+  /**
+   * Reads the next line and splits it into the geonames columns. Empty or missing columns
+   * come back as <code>null</code> (a missing id therefore fails in Integer.parseInt).
+   */
+  @Override
+  public GeoNameDocData getNextDocData(DocData _docData/*unused*/) throws NoMoreDataException, IOException {
+    String line;
+    synchronized (this) {
+      line = reader.readLine();
+      if (line == null)
+        throw new NoMoreDataException();
+    }
+    addDoc();
+
+    String[] fields = parseFields(line,20);
+    GeoNameDocData docData = new GeoNameDocData();
+
+    Properties props = new Properties();
+    docData.setProps(props);
+    int idx = 0;
+    docData.setID(Integer.parseInt(fields[idx++]));
+    docData.setName(fields[idx++]);
+    docData.asciiname = fields[idx++];
+    docData.alternatenames = fields[idx++];
+    docData.latitude = fields[idx++];
+    docData.longitude = fields[idx++];
+    docData.feature_class = fields[idx++];
+    docData.feature_code = fields[idx++];
+    docData.country_code = fields[idx++];
+    docData.cc2 = fields[idx++];
+    docData.admin1_code = fields[idx++];
+    docData.admin2_code = fields[idx++];
+    docData.admin3_code = fields[idx++];
+    docData.admin4_code = fields[idx++];
+    docData.population = fields[idx++];
+    docData.elevation = fields[idx++];
+    docData.gtopo30 = fields[idx++];
+    docData.timezone = fields[idx++];
+    docData.modification_date = fields[idx++];
+    return docData;
+  }
+
+  /** Splits <code>line</code> on tabs into at most <code>numFields</code> values; empty values stay <code>null</code>. */
+  private String[] parseFields(String line, int numFields) {
+    String[] fields = new String[numFields];
+    int startIdx = 0;
+    for(int f = 0; f < numFields; f++) {
+      if (startIdx == line.length())//handles too few tabs, including the "" case
+        break;
+      int stopIdx = line.indexOf('\t',startIdx);
+      if (stopIdx == -1)
+        stopIdx = line.length();
+      if (stopIdx - startIdx > 0) {
+        fields[f] = line.substring(startIdx,stopIdx);
+      }
+      startIdx = stopIdx+1;
+    }
+    return fields;
+  }
+
+
+  @Override
+  public synchronized void resetInputs() throws IOException {
+    super.resetInputs();
+    close();//release the reader of a previous round instead of leaking it
+    InputStream is = getInputStream(file);//already buffered, but not a reader.
+    reader = new BufferedReader(new InputStreamReader(is, encoding));
+  }
+
+  @Override
+  public void setConfig(Config config) {
+    super.setConfig(config);
+    if (encoding == null)
+      encoding = "UTF8";
+    else if (!encoding.equals("UTF8"))
+      throw new IllegalArgumentException("content.source.encoding must be UTF8");
+    if (forever)
+      throw new IllegalArgumentException("content.source.forever is not supported");
+    String fileName = config.get("docs.file", null);
+    if (fileName == null) {
+      throw new IllegalArgumentException("docs.file must be set");
+    }
+    file = new File(fileName).getAbsoluteFile();
+
+  }
+}
Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (revision 987961)
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (revision )
@@ -17,33 +17,23 @@
  * limitations under the License.
  */
 
-import java.io.IOException;
-import java.util.Collection;
-import java.util.HashSet;
-
-import java.util.List;
-import java.util.Set;
-
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.search.Collector;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.search.MultiTermQuery;
-import org.apache.lucene.search.TopFieldCollector;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TopScoreDocCollector;
-import org.apache.lucene.search.Weight;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.*;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.Bits;
 
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
 
 /**
  * Read index (abstract) task.
  * Sub classes implement withSearch(), withWarm(), withTraverse() and withRetrieve()
@@ -62,6 +52,7 @@
 public abstract class ReadTask extends PerfTask {
 
   private final QueryMaker queryMaker;
+  private boolean useHitTotal;
 
   public ReadTask(PerfRunData runData) {
     super(runData);
@@ -175,6 +166,8 @@
             }
           }
         }
+        if (useHitTotal)
+          res = hits==null?0:hits.totalHits;
       }
 
       if (closeSearcher) {
@@ -252,6 +245,7 @@
   public void setup() throws Exception {
     super.setup();
     numHits = getRunData().getConfig().get("search.num.hits", DEFAULT_SEARCH_NUM_HITS);
+    useHitTotal = getRunData().getConfig().get("search.useHitTotal",false);
   }
 
   /**
Index: lucene/contrib/benchmark/build.xml
===================================================================
--- lucene/contrib/benchmark/build.xml (revision 993408)
+++ lucene/contrib/benchmark/build.xml (revision )
@@ -19,6 +19,7 @@
          property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>
+
@@ -141,10 +142,15 @@
+
+
+
+
+
@@ -254,7 +260,17 @@
+
+
+
+
+
+
+
+
+
+
-
+
Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoPerfData.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoPerfData.java (revision )
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoPerfData.java (revision )
@@ -0,0 +1,161 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * Singleton, threadsafe, immutable.
+ * Holds the places read from the geonames file together with their cumulative populations,
+ * so that a place can be picked at random with probability proportional to its population.
+ * @author David Smiley dsmiley@mitre.org
+ */
+class GeoPerfData {
+
+  private final int zeroPopSubst;
+
+  private final ArrayList<Place> allPlaces;//pop ascending
+  /** Parallel array to allPlaces containing the cumulative population of all places prior to this one plus this one. */
+  private long[] cumPop;
+
+  private long totalPop;
+
+  /**
+   * Loads the data; must be called exactly once, before {@link #getInstance()}.
+   * @throws IllegalStateException if already initialized
+   */
+  static synchronized GeoPerfData initialize(Config config) throws IOException {
+    if (instance != null)
+      throw new IllegalStateException("already initialized");
+    instance = new GeoPerfData(config);
+    return instance;
+  }
+
+  static class Place {
+    //int id;
+    float lat, lon;
+    int pop;
+  }
+
+  private static volatile GeoPerfData instance;
+
+  static synchronized GeoPerfData getInstance() {
+    if (instance == null)
+      throw new IllegalStateException("GeoPerfData not yet initialized");
+    return instance;
+  }
+
+  private GeoPerfData(Config config) throws IOException {
+    zeroPopSubst = config.get("content.geo.zeroPopSubst", 100);
+    if (zeroPopSubst < 0)//a negative population would make cumPop non-monotonic
+      throw new IllegalArgumentException("content.geo.zeroPopSubst must not be negative");
+    int maxPlaces = config.get("content.geo.maxPlaces",Integer.MAX_VALUE);
+    allPlaces = new ArrayList<Place>(Math.min(1000000,maxPlaces));
+
+    //initialize data
+    GeoNamesContentSource source = new GeoNamesContentSource();
+    try {
+      source.setConfig(config);
+      source.resetInputs();
+
+      while(allPlaces.size() < maxPlaces) {
+        GeoNamesContentSource.GeoNameDocData docData = source.getNextDocData(null);
+        if (docData == null)
+          break;//won't happen?
+        processInputLine(docData);
+      }
+    } catch (NoMoreDataException e) {
+      //ignore; expected
+    } finally {
+      source.close();
+    }
+    postProcessLines();
+
+    System.out.println("Geo names places: "+allPlaces.size());
+    System.out.println("Geo names total population: "+totalPop);
+  }
+
+  /** Converts one geonames record into a {@link Place}; a missing or non-positive population becomes zeroPopSubst. */
+  private void processInputLine(GeoNamesContentSource.GeoNameDocData docData) {
+    Place p = new Place();
+    p.lat = Float.parseFloat(docData.latitude);
+    p.lon = Float.parseFloat(docData.longitude);
+    //population is a bigint column and may be empty (null): parse as long and clamp to int
+    long pop = docData.population == null ? 0 : Long.parseLong(docData.population);
+    p.pop = (int) Math.max(0L, Math.min(pop, (long) Integer.MAX_VALUE));
+    if (p.pop <= 0)
+      p.pop = zeroPopSubst;
+    allPlaces.add(p);
+  }
+
+  private void postProcessLines() {
+    //sort ascending
+    Collections.sort(allPlaces,new Comparator<Place>() {
+      @Override
+      public int compare(Place o1, Place o2) {
+        //compare without subtracting, which could overflow
+        return o1.pop < o2.pop ? -1 : (o1.pop == o2.pop ? 0 : 1);
+      }
+    });
+
+    //gather cumulative population parallel array
+    cumPop = new long[allPlaces.size()];
+    totalPop = 0;
+    for (int i = 0; i < cumPop.length; i++) {
+      totalPop += allPlaces.get(i).pop;
+      cumPop[i] = totalPop;
+    }
+  }
+
+  long getTotalPop() {
+    return totalPop;
+  }
+
+  int getTotalPlaces() {
+    return allPlaces.size();
+  }
+
+  /**
+   * Picks a place with probability proportional to its population.
+   *
+   * @param usedPlaces if not null, the bit of the returned place is set in it
+   * @param forceUnused if true (and usedPlaces is not null), only a place whose bit is clear is returned
+   * @throws IllegalStateException if there are no places, or forceUnused is set and all are used
+   */
+  Place nextRandomPlace(Random random, BitSet usedPlaces, boolean forceUnused) {
+    if (allPlaces.isEmpty())
+      throw new IllegalStateException("No places loaded!");
+    long personIdx = (long) (random.nextDouble() * totalPop); // [0 - totalPop)
+    int cumPopIdx = Arrays.binarySearch(cumPop, personIdx);
+    if (cumPopIdx < 0) {
+      //got (-(insertion point) - 1)
+      cumPopIdx = -1*(cumPopIdx + 1);
+    } else {
+      //exact hit: place i owns persons [cumPop[i-1], cumPop[i]), so cumPop[i] is the next place's
+      cumPopIdx++;
+    }
+    //step over zero-population places, then guard against rounding up to totalPop
+    while (cumPopIdx < cumPop.length - 1 && cumPop[cumPopIdx] <= personIdx)
+      cumPopIdx++;
+    cumPopIdx = Math.min(cumPopIdx, cumPop.length - 1);
+    if (usedPlaces != null) {
+      if (forceUnused) {
+        cumPopIdx = usedPlaces.nextClearBit(cumPopIdx);
+        if (cumPopIdx == allPlaces.size()) {
+          cumPopIdx = usedPlaces.nextClearBit(0);
+          if (cumPopIdx == allPlaces.size()) {
+            throw new IllegalStateException("No more unused places!");
+          }
+        }
+      }
+      usedPlaces.set(cumPopIdx);
+    }
+    return allPlaces.get(cumPopIdx);
+  }
+
+  Place nextPlace(int idx) {
+    return allPlaces.get(idx);
+  }
+
+}
Index: lucene/contrib/benchmark/conf/geoname-spatial.alg
===================================================================
--- lucene/contrib/benchmark/conf/geoname-spatial.alg (revision )
+++ lucene/contrib/benchmark/conf/geoname-spatial.alg (revision )
@@ -0,0 +1,71 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+# Using Geonames.org data for geospatial filter queries.
+
+ram.flush.mb=128
+
+directory=FSDirectory
+compound=false
+
+docs.file=temp/US.txt
+
+#content.geo.maxPlaces=1000000
+#content.geo.maxPlaces=10000000
+#content.geo.zeroPopSubst=100
+content.source.forever = false
+
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.GeoNamesDocMaker
+#doc.geo.docsToGenerate=1000000
+doc.geo.avgPlacesPerDoc=1
+doc.geo.oneDocPerPlace=true
+doc.geo.schemaField=geohash
+
+query.maker=org.apache.lucene.benchmark.byTask.feeds.GeoQueryMaker
+#query.geo.radiuskm=____km:11:44:230:1800
+
+query.geo.radiuskm=____km:350
+
+search.useHitTotal=true
+
+log.step.AddDoc=100000
+
+{ "Populate"
+    ResetSystemErase
+    CreateIndex
+    { "MAddDocs" AddDoc > : *
+    Optimize
+    CloseIndex
+}:0
+{ "Rounds"
+
+    ResetSystemSoft
+
+    OpenReader
+
+    -{ "Warm" Search > : 5
+
+    { "Search" Search > : 40
+
+    CloseReader
+
+    NewRound
+
+} : 1
+
+RepSelectByPref Search
+#RepSumByPrefRound Search
Index: lucene/contrib/benchmark/conf/solrconf/schema.xml
===================================================================
--- lucene/contrib/benchmark/conf/solrconf/schema.xml (revision )
+++ lucene/contrib/benchmark/conf/solrconf/schema.xml (revision )
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file