Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoNamesDocMaker.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoNamesDocMaker.java (revision )
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoNamesDocMaker.java (revision )
@@ -0,0 +1,163 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.benchmark.byTask.utils.Format;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.util.IOUtils;
+import org.apache.solr.core.SolrConfig;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.schema.SchemaField;
+
+import java.io.ByteArrayInputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.BitSet;
+import java.util.Random;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * @author David Smiley - dsmiley@mitre.org
+ */
+public class GeoNamesDocMaker extends DocMaker {
+
+ private GeoPerfData geoPerfData;// shared place data; obtained on the first setConfig call
+
+ private Random random;// re-created with seed 0 by every setConfig call
+
+ private SchemaField geoSchemaField;// schema field used to turn a "lat,lon" string into index fields
+ private int avgPlacesPerDoc;// "doc.geo.avgPlacesPerDoc", default 10
+
+ private BitSet usedPlaces;// sized to the number of places; passed to GeoPerfData.nextRandomPlace, which sets the bit of each place it returns
+
+ private int documentCounter = 0;// documents made since the last setConfig call
+ private int printNum = 0;// number of statistics reports printed so far
+ private int docsToGenerate;// "doc.geo.docsToGenerate", defaults to the number of places
+ private boolean oneDocPerPlace;// "doc.geo.oneDocPerPlace", default false
+ private boolean useLinearScanAlgorithm;// true only when oneDocPerPlace is set and docsToGenerate equals the number of places
+
+ @Override
+ public synchronized void setConfig(Config config) {
+   this.config = config;
+   // The place data is only built when this maker does not hold it yet; later calls keep it.
+   if (geoPerfData == null) {
+     try {
+       geoPerfData = GeoPerfData.initialize(config);
+     } catch (IOException e) {
+       throw new RuntimeException(e);
+     }
+   }
+   geoSchemaField = loadGeoSchemaField(config);
+
+   final int totalPlaces = geoPerfData.getTotalPlaces();
+   docsToGenerate = config.get("doc.geo.docsToGenerate", totalPlaces);
+   avgPlacesPerDoc = config.get("doc.geo.avgPlacesPerDoc", 10);
+   oneDocPerPlace = config.get("doc.geo.oneDocPerPlace", false);
+   useLinearScanAlgorithm = false;
+   if (oneDocPerPlace && avgPlacesPerDoc != 1)
+     throw new IllegalArgumentException("If oneDocPerPlace, then avgPlacesPerDoc must be one.");
+   if (oneDocPerPlace && docsToGenerate > totalPlaces)
+     throw new IllegalStateException("Asked for more documents than there are places.");
+   // When every place is wanted exactly once, documents simply walk the place list in order.
+   useLinearScanAlgorithm = oneDocPerPlace && docsToGenerate == totalPlaces;
+   random = new Random(0); // constant seed, re-applied on every (re)configuration
+   usedPlaces = new BitSet(totalPlaces);
+   documentCounter = 0;
+ }
+
+ @Override
+ public void resetInputs() throws IOException {
+// super.resetInputs() is not invoked; the disabled calls further down are kept only for reference.
+ printDocStatistics();
+ // re-initialize, since properties may have changed for the new round; this also resets documentCounter to 0.
+ setConfig(config);
+// source.resetInputs();
+// numDocsCreated.set(0);
+// resetLeftovers();
+ }
+
+ @Override
+ public void close() throws IOException {
+// Intentionally empty: super.close() is not invoked, and the streams opened by this class's visible code are closed where they are opened.
+ }
+
+ private static final Logger solrLogger = Logger.getLogger("org.apache.solr");//hard reference: java.util.logging may garbage-collect a Logger that nobody holds, which would lose the level set below
+ static {
+ solrLogger.setLevel(Level.SEVERE);// messages below SEVERE from the "org.apache.solr" logger are discarded
+ }
+
+ static SchemaField loadGeoSchemaField(Config config) {// looks up the field named by "doc.geo.schemaField" in conf/solrconf/schema.xml
+ String geoFieldName = config.get("doc.geo.schemaField",null);
+ if (geoFieldName == null)
+ throw new RuntimeException("doc.geo.schemaField is required");
+ // Inner try/finally closes the schema stream; the outer try/catch rethrows any failure as a RuntimeException.
+ FileInputStream schemaStream = null;
+ try {
+ try {
+ // NOTE(review): the config text below contains no XML markup; tags may have been lost from this copy of the patch - confirm against the original.
+ String solrConfigText = "" +
+ "LUCENE_CURRENT" +
+ "";
+ InputStream solrConfigStream = new ByteArrayInputStream(solrConfigText.getBytes("UTF8"));
+ final String solrHome = "conf/solrconf/";// relative path; also used as the prefix of the schema.xml file opened below
+ SolrConfig solrConfig = new SolrConfig(solrHome, null, solrConfigStream);
+ schemaStream = new FileInputStream(solrHome+"schema.xml");
+ IndexSchema schema = new IndexSchema(solrConfig,null,schemaStream);
+ return schema.getField(geoFieldName);//does not return null
+ } finally {
+ IOUtils.closeSafely(null, schemaStream);
+ }
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public void printDocStatistics() {
+   // (adapted from the superclass implementation)
+   if (documentCounter <= 0) {
+     return; // no documents since the last reset: print nothing and leave printNum alone
+   }
+   final String eol = System.getProperty("line.separator");
+   final String col = " ";
+   StringBuilder report = new StringBuilder();
+   report.append("------------> ").append(getClass().getSimpleName());
+   report.append(" statistics (").append(printNum).append("): ").append(eol);
+   report.append("Unique places used: ").append(usedPlaces.cardinality()).append(eol);
+   report.append("num docs added since last inputs reset: ");
+   report.append(Format.format(0, documentCounter, col)).append(eol);
+   report.append(eol);
+   System.out.println(report.toString());
+   printNum++;
+ }
+
+ @Override
+ public Document makeDocument() throws Exception {
+   // Stop once the configured number of documents has been produced.
+   if (documentCounter >= docsToGenerate)
+     throw new NoMoreDataException();
+   Document doc = new Document();
+   //(don't bother adding an ID; we don't use it.)
+   // Uniform over [0, 2*avg] so the mean really is avg; nextInt(2*avg) averaged avg-0.5 and threw for avg == 0.
+   int places = avgPlacesPerDoc == 1 ? 1 : random.nextInt(avgPlacesPerDoc * 2 + 1);
+   for (int i = 0; i < places; i++) {
+     // linear scan: document N gets place N; otherwise a population-weighted random pick
+     GeoPerfData.Place p = useLinearScanAlgorithm ?
+         geoPerfData.nextPlace(documentCounter) :
+         geoPerfData.nextRandomPlace(random, usedPlaces, oneDocPerPlace);
+     Fieldable[] fieldValues = geoSchemaField.createFields(p.lat+","+p.lon,1f);
+     for (Fieldable fieldValue : fieldValues)
+       doc.add(fieldValue);
+   }
+   documentCounter++;
+   return doc;
+ }
+
+ @Override
+ public Document makeDocument(int size) throws Exception {
+ throw new UnsupportedOperationException();// size-bounded documents are not implemented by this maker
+ }
+
+}
Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoQueryMaker.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoQueryMaker.java (revision )
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoQueryMaker.java (revision )
@@ -0,0 +1,65 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.spatial.DistanceUtils;
+import org.apache.lucene.spatial.geometry.DistanceUnits;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.SpatialQueryable;
+import org.apache.solr.search.SpatialOptions;
+
+import java.util.Random;
+
+/**
+ * @author David Smiley dsmiley@mitre.org
+ */
+public class GeoQueryMaker implements QueryMaker {
+
+ private GeoPerfData geoPerfData;
+ private SchemaField geoSchemaField;
+ private Config config;
+ private Random random;
+ private int radiusKm;
+
+ @Override
+ public void setConfig(Config config) {
+ this.config = config;// kept so that resetInputs() can re-apply it
+ random = new Random(0);// constant seed, so the random sequence restarts identically after each reset
+ geoPerfData = GeoPerfData.getInstance();// throws IllegalStateException unless GeoPerfData.initialize ran first
+ geoSchemaField = GeoNamesDocMaker.loadGeoSchemaField(config);
+ radiusKm = config.get("query.geo.radiuskm",100);// query distance in kilometres, default 100
+ }
+
+ @Override
+ public void resetInputs() {
+ setConfig(config);// re-reads the settings and re-seeds the random generator
+ }
+
+ @Override
+ public String printQueries() {
+ return "(randomly generated queries)";// queries are generated on demand, so there is no fixed list to print
+ }
+
+ @Override
+ public Query makeQuery(int size) {
+ throw new UnsupportedOperationException();// sized queries are not implemented by this maker
+ }
+
+ @Override
+ public Query makeQuery() {
+ GeoPerfData.Place place = geoPerfData.nextRandomPlace(random, null, false);// population-weighted random centre; null means no used-place tracking
+
+ //(disabled alternative: int km = (int) StrictMath.pow(10, qc % 5 ), i.e. radius up to 4 zeros (10,000))
+
+ SpatialOptions options = new SpatialOptions();
+ options.pointStr = place.lat+","+place.lon;// centre as "lat,lon"
+ options.distance = radiusKm;
+ options.units = DistanceUnits.KILOMETERS;
+ options.radius = DistanceUtils.EARTH_MEAN_RADIUS_KM;
+ options.bbox = true;// NOTE(review): presumably asks for a bounding-box query rather than an exact-distance one - confirm against SpatialOptions
+ options.field = geoSchemaField;
+
+ return ((SpatialQueryable) geoSchemaField.getType()).createSpatialQuery(null,options);// the cast fails with ClassCastException if the field type is not SpatialQueryable
+ }
+
+}
Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoNamesContentSource.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoNamesContentSource.java (revision )
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoNamesContentSource.java (revision )
@@ -0,0 +1,127 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
+import java.io.*;
+import java.util.Properties;
+
+/**
+ * @author David Smiley - dsmiley@mitre.org
+ */
+public class GeoNamesContentSource extends ContentSource {
+ private File file;
+
+ private BufferedReader reader; //must synchronize
+
+ @Override
+ public synchronized void close() throws IOException {
+   if (reader == null)
+     return; // nothing is open
+   reader.close();
+   reader = null; // a later close() then finds nothing to do
+ }
+
+ /** Extend DocData to put GeoName data here. Each field holds one raw column of a GeoNames line as text; a column that is empty or missing stays null. */
+ public static class GeoNameDocData extends DocData {
+ String
+ //geonameid , // integer id of record in geonames database (passed to DocData.setID instead)
+ //name , // name of geographical point (utf8) varchar(200) (passed to DocData.setName instead)
+ asciiname , // name of geographical point in plain ascii characters, varchar(200)
+ alternatenames , // alternatenames, comma separated varchar(5000)
+ latitude , // latitude in decimal degrees (wgs84)
+ longitude , // longitude in decimal degrees (wgs84)
+ feature_class , // see http://www.geonames.org/export/codes.html, char(1)
+ feature_code , // see http://www.geonames.org/export/codes.html, varchar(10)
+ country_code , // ISO-3166 2-letter country code, 2 characters
+ cc2 , // alternate country codes, comma separated, ISO-3166 2-letter country code, 60 characters
+ admin1_code , // fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)
+ admin2_code , // code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80)
+ admin3_code , // code for third level administrative division, varchar(20)
+ admin4_code , // code for fourth level administrative division, varchar(20)
+ population , // bigint (8 byte int)
+ elevation , // in meters, integer
+ gtopo30 , // average elevation of 30'x30' (ca 900mx900m) area in meters, integer
+ timezone , // the timezone id (see file timeZone.txt)
+ modification_date ; // date of last modification in yyyy-MM-dd format
+ }
+
+ @Override
+ public GeoNameDocData getNextDocData(DocData _docData/*unused*/) throws NoMoreDataException, IOException {// reads one tab-separated line and maps its columns, in file order, onto a fresh GeoNameDocData
+ String line;
+ synchronized (this) {// only the read is guarded; parsing below runs outside the lock
+ line = reader.readLine();// NOTE(review): reader is null until resetInputs() has run - confirm callers always reset first
+ if (line == null)
+ throw new NoMoreDataException();// end of file
+ }
+ addDoc();
+
+ String[] fields = parseFields(line,20);// room for 20 columns; the 19 assignments below consume indexes 0-18
+ GeoNameDocData docData = new GeoNameDocData();
+
+ Properties props = new Properties();// left empty
+ docData.setProps(props);
+ int idx = 0;
+ docData.setID(Integer.parseInt(fields[idx++]));// first column; parseFields leaves a missing or empty column null, which makes parseInt throw NumberFormatException
+ docData.setName(fields[idx++]);// second column
+ docData.asciiname = fields[idx++];
+ docData.alternatenames = fields[idx++];
+ docData.latitude = fields[idx++];
+ docData.longitude = fields[idx++];
+ docData.feature_class = fields[idx++];
+ docData.feature_code = fields[idx++];
+ docData.country_code = fields[idx++];
+ docData.cc2 = fields[idx++];
+ docData.admin1_code = fields[idx++];
+ docData.admin2_code = fields[idx++];
+ docData.admin3_code = fields[idx++];
+ docData.admin4_code = fields[idx++];
+ docData.population = fields[idx++];
+ docData.elevation = fields[idx++];
+ docData.gtopo30 = fields[idx++];
+ docData.timezone = fields[idx++];
+ docData.modification_date = fields[idx++];
+ return docData;
+ }
+
+ private String[] parseFields(String line, int numFields) {
+ String[] fields = new String[numFields];
+ int startIdx = 0;
+ for(int f = 0; f < numFields; f++) {
+ if (startIdx == line.length())//handles too few tabs, including the "" case
+ break;
+ int stopIdx = line.indexOf('\t',startIdx);
+ if (stopIdx == -1)
+ stopIdx = line.length();
+ if (stopIdx - startIdx > 0) {
+ fields[f] = line.substring(startIdx,stopIdx);
+ }
+ startIdx = stopIdx+1;
+ }
+ return fields;
+ }
+
+
+ @Override
+ public synchronized void resetInputs() throws IOException {
+   super.resetInputs();
+   close(); // release the reader left over from a previous pass, if any, before reopening the file
+   reader = new BufferedReader(new InputStreamReader(getInputStream(file), encoding)); // the stream is already buffered, but not a reader
+ }
+
+ @Override
+ public void setConfig(Config config) {
+ super.setConfig(config);
+ if (encoding == null)
+ encoding = "UTF8";
+ else if (!encoding.equals("UTF8"))
+ throw new IllegalArgumentException("content.source.encoding must be UTF8");
+ if (forever)
+ throw new IllegalArgumentException("content.source.forever is not supported");
+ String fileName = config.get("docs.file", null);
+ if (fileName == null) {
+ throw new IllegalArgumentException("docs.file must be set");
+ }
+ file = new File(fileName).getAbsoluteFile();
+
+ }
+}
Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (revision 987961)
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (revision )
@@ -17,33 +17,23 @@
* limitations under the License.
*/
-import java.io.IOException;
-import java.util.Collection;
-import java.util.HashSet;
-
-import java.util.List;
-import java.util.Set;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.search.Collector;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.search.MultiTermQuery;
-import org.apache.lucene.search.TopFieldCollector;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TopScoreDocCollector;
-import org.apache.lucene.search.Weight;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
/**
* Read index (abstract) task.
* Sub classes implement withSearch(), withWarm(), withTraverse() and withRetrieve()
@@ -62,6 +52,7 @@
public abstract class ReadTask extends PerfTask {
private final QueryMaker queryMaker;
+ private boolean useHitTotal;
public ReadTask(PerfRunData runData) {
super(runData);
@@ -175,6 +166,8 @@
}
}
}
+ if (useHitTotal)
+ res = hits==null?0:hits.totalHits;
}
if (closeSearcher) {
@@ -252,6 +245,7 @@
public void setup() throws Exception {
super.setup();
numHits = getRunData().getConfig().get("search.num.hits", DEFAULT_SEARCH_NUM_HITS);
+ useHitTotal = getRunData().getConfig().get("search.useHitTotal",false);
}
/**
Index: lucene/contrib/benchmark/build.xml
===================================================================
--- lucene/contrib/benchmark/build.xml (revision 993408)
+++ lucene/contrib/benchmark/build.xml (revision )
@@ -19,6 +19,7 @@
property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>
+
@@ -141,10 +142,15 @@
+
+
+
+
+
@@ -254,7 +260,17 @@
+
+
+
+
+
+
+
+
+
+
-
+
Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoPerfData.java
===================================================================
--- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoPerfData.java (revision )
+++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/GeoPerfData.java (revision )
@@ -0,0 +1,133 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * Singleton, threadsafe, immutable.
+ * @author David Smiley dsmiley@mitre.org
+ */
+class GeoPerfData {
+
+ private final int zeroPopSubst;// population assumed for places whose recorded population is <= 0
+
+ private final ArrayList<Place> allPlaces;//sorted by population, ascending (see postProcessLines)
+ /** Parallel array to allPlaces containing the cumulative population of all places prior to this one plus this one. */
+ private long[] cumPop;
+
+ private long totalPop;// sum of the population of all places; equals the last cumPop entry
+
+ static synchronized GeoPerfData initialize(Config config) throws IOException {
+   // Loads the place data on the first call only; calling again is rejected, not treated as a reload.
+   if (instance == null)
+     return instance = new GeoPerfData(config);
+   throw new IllegalStateException("already initialized");
+ }
+
+ static class Place {// one input record reduced to position and population
+ //int id; (the record id is not kept)
+ float lat, lon;// parsed from the latitude and longitude columns
+ int pop;// population; values <= 0 are replaced by zeroPopSubst while loading
+ }
+
+ private static volatile GeoPerfData instance;
+
+ static synchronized GeoPerfData getInstance() {// returns the instance created by initialize(Config)
+ if (instance == null)
+ throw new IllegalStateException("GeoPerfData not yet initialized");// initialize(Config) has not completed successfully yet
+ return instance;
+ }
+
+ private GeoPerfData(Config config) throws IOException {// loads up to "content.geo.maxPlaces" places, then sorts them and builds the cumulative population table
+ zeroPopSubst = config.get("content.geo.zeroPopSubst", 100);
+ int maxPlaces = config.get("content.geo.maxPlaces",Integer.MAX_VALUE);// default: no limit
+ allPlaces = new ArrayList(Math.min(1000000,maxPlaces));// initial capacity only; the list grows as needed
+
+ //read the places through a GeoNamesContentSource configured from the same Config
+ GeoNamesContentSource source = new GeoNamesContentSource();
+ try {
+ source.setConfig(config);
+ source.resetInputs();
+
+ while(allPlaces.size() < maxPlaces) {
+ GeoNamesContentSource.GeoNameDocData docData = source.getNextDocData(null);
+ if (docData == null)
+ break;//defensive only: getNextDocData signals the end of input by throwing NoMoreDataException
+ processInputLine(docData);
+ }
+ } catch (NoMoreDataException e) {
+ //ignore; expected: this is how the end of the file is reported
+ } finally {
+ source.close();
+ }
+ postProcessLines();
+
+ System.out.println("Geo names places: "+allPlaces.size());
+ System.out.println("Geo names total population: "+totalPop);
+ }
+
+ private void processInputLine(GeoNamesContentSource.GeoNameDocData docData) {
+   // The population column is declared as a 64-bit value and empty columns arrive as null,
+   // so parse as long, treat "missing" like zero, and clamp to the int range of Place.pop.
+   long pop = docData.population == null ? 0L : Long.parseLong(docData.population);
+   Place p = new Place();
+   p.lat = Float.parseFloat(docData.latitude);
+   p.lon = Float.parseFloat(docData.longitude);
+   p.pop = pop <= 0 ? zeroPopSubst : (int) Math.min(pop, Integer.MAX_VALUE);
+   allPlaces.add(p);
+ }
+
+ private void postProcessLines() {
+   //sort ascending by population
+   Collections.sort(allPlaces, new Comparator<Place>() {
+     @Override
+     public int compare(Place o1, Place o2) {
+       // explicit comparison: "o1.pop - o2.pop" can overflow int (e.g. with a negative zeroPopSubst) and invert the order
+       return o1.pop < o2.pop ? -1 : (o1.pop == o2.pop ? 0 : 1);
+     }
+   });
+   //gather cumulative population parallel array: cumPop[i] = population of places 0..i inclusive
+   cumPop = new long[allPlaces.size()];
+   totalPop = 0;
+   for (int i = 0; i < cumPop.length; i++) {
+     totalPop += allPlaces.get(i).pop;
+     cumPop[i] = totalPop;
+   }
+ }
+
+ long getTotalPop() {// sum of the populations of all loaded places, as accumulated by postProcessLines
+ return totalPop;
+ }
+
+ int getTotalPlaces() {// number of places that were loaded
+ return allPlaces.size();
+ }
+
+ Place nextRandomPlace(Random random, BitSet usedPlaces, boolean forceUnused) {
+   // Population-weighted pick: draw a "person" in [0, totalPop); place i owns [cumPop[i-1], cumPop[i]).
+   if (allPlaces.isEmpty())
+     throw new IllegalStateException("No places loaded; cannot pick a random place");
+   long personIdx = (long) (random.nextDouble() * totalPop);
+   int cumPopIdx = Arrays.binarySearch(cumPop, personIdx);
+   // Miss: (-(insertion point) - 1). Exact hit on cumPop[i]: that person is the first of place i+1, not the last of i.
+   cumPopIdx = Math.min(cumPopIdx < 0 ? -(cumPopIdx + 1) : cumPopIdx + 1, allPlaces.size() - 1); // min(): all-zero-population case
+   if (usedPlaces != null) {
+     if (forceUnused) {
+       // move on to the next unused place, wrapping around to the start once
+       cumPopIdx = usedPlaces.nextClearBit(cumPopIdx);
+       if (cumPopIdx == allPlaces.size())
+         cumPopIdx = usedPlaces.nextClearBit(0);
+       if (cumPopIdx == allPlaces.size())
+         throw new IllegalStateException("No more unused places!");
+     }
+     usedPlaces.set(cumPopIdx); // recorded as used even when reuse is allowed
+   }
+   return allPlaces.get(cumPopIdx);
+ }
+
+ Place nextPlace(int idx) {// place at position idx of the population-sorted list; no bounds check beyond the list's own
+ return allPlaces.get(idx);
+ }
+
+}
Index: lucene/contrib/benchmark/conf/geoname-spatial.alg
===================================================================
--- lucene/contrib/benchmark/conf/geoname-spatial.alg (revision )
+++ lucene/contrib/benchmark/conf/geoname-spatial.alg (revision )
@@ -0,0 +1,71 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+# Using Geonames.org data for geospatial filter queries.
+
+ram.flush.mb=128
+
+directory=FSDirectory
+compound=false
+
+docs.file=temp/US.txt
+
+#content.geo.maxPlaces=1000000
+#content.geo.maxPlaces=10000000
+#content.geo.zeroPopSubst=100
+content.source.forever = false
+
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.GeoNamesDocMaker
+#doc.geo.docsToGenerate=1000000
+doc.geo.avgPlacesPerDoc=1
+doc.geo.oneDocPerPlace=true
+doc.geo.schemaField=geohash
+
+query.maker=org.apache.lucene.benchmark.byTask.feeds.GeoQueryMaker
+#query.geo.radiuskm=____km:11:44:230:1800
+
+query.geo.radiuskm=____km:350
+
+search.useHitTotal=true
+
+log.step.AddDoc=100000
+
+{ "Populate"
+ ResetSystemErase
+ CreateIndex
+ { "MAddDocs" AddDoc > : *
+ Optimize
+ CloseIndex
+}:0
+{ "Rounds"
+
+ ResetSystemSoft
+
+ OpenReader
+
+ -{ "Warm" Search > : 5
+
+ { "Search" Search > : 40
+
+ CloseReader
+
+ NewRound
+
+} : 1
+
+RepSelectByPref Search
+#RepSumByPrefRound Search
Index: lucene/contrib/benchmark/conf/solrconf/schema.xml
===================================================================
--- lucene/contrib/benchmark/conf/solrconf/schema.xml (revision )
+++ lucene/contrib/benchmark/conf/solrconf/schema.xml (revision )
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file