Index: TPBDocMaker.java
===================================================================
--- TPBDocMaker.java (revision 0)
+++ TPBDocMaker.java (revision 0)
@@ -0,0 +1,157 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
+import java.io.*;
+import java.util.Date;
+import java.util.zip.GZIPInputStream;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * A DocMaker using the TPB collection for its input.
+ * http://thepiratebay.org/tor/3783572/db_dump_and_query_log_from_piratebay.org__summer_of_2006
+ * <p>
+ * The input is a gzipped, tab separated text file whose first line starts
+ * with {@link #header}. Every following line becomes one document. When the
+ * end of the file is reached it is re-opened and read again from the first
+ * data line.
+ * <p>
+ * Config properties:
+ * <ul>
+ * <li>path=&lt;path to torrents.txt.gz file | Default: tpb/torrents.txt.gz&gt;</li>
+ * </ul>
+ */
+public class TPBDocMaker extends BasicDocMaker {
+
+  /** Expected start of the first line of the input file; lists the columns in order. */
+  public static final String header = "id\tcategory\tadded\tuploader\tdownloads\tname\tdescription";
+
+  /** Number of tab separated columns named in {@link #header}. */
+  private static final int NUM_FIELDS = 7;
+
+  /** Reader over the current pass of the input file; null until the first document is requested. */
+  private BufferedReader br;
+
+  /** The gzipped input file, set in {@link #setConfig(Config)}. */
+  private File file;
+
+  /** Number of data lines handed out during the first pass over the file. */
+  int numUniqueTexts = 0;
+
+  /** Number of times the file has been re-opened after reaching its end. */
+  private int iteration = 0;
+
+  /**
+   * Reads the location of the input file from the <code>path</code> property.
+   *
+   * @param config benchmark configuration; also passed on to the superclass
+   */
+  public void setConfig(Config config) {
+    super.setConfig(config);
+    file = new File(config.get("path", "tpb/torrents.txt.gz"));
+  }
+
+  /**
+   * Builds the next document from the next data line, starting over at the
+   * top of the file once its end is reached.
+   * <p>
+   * NOTE(review): this method is not synchronized; confirm whether the
+   * benchmark framework may call it from several threads.
+   *
+   * @return document data built from the id, description, name and added
+   *         columns of the line
+   * @throws Exception if the file cannot be read, contains no data lines, or
+   *         a line is malformed
+   */
+  protected DocData getNextDocData() throws Exception {
+    if (br == null) {
+      openReader();
+    }
+    String line = br.readLine();
+    if (line == null) {
+      // End of file: begin a new pass and fetch its first data line, so that
+      // there is always a line to parse below.
+      br.close();
+      br = null;
+      openReader();
+      iteration++;
+      line = br.readLine();
+      if (line == null) {
+        throw new IOException("No documents found in " + file);
+      }
+    } else if (iteration == 0) {
+      numUniqueTexts++;
+    }
+
+    // The limit keeps an empty trailing description (a plain split would drop
+    // it) and leaves any further tabs inside the description untouched.
+    String[] fields = line.split("\t", NUM_FIELDS);
+    if (fields.length < NUM_FIELDS) {
+      throw new IOException("Expected " + NUM_FIELDS + " tab separated fields but found "
+          + fields.length + ": " + line);
+    }
+
+    // java.util.Date expects milliseconds.
+    // NOTE(review): presumably the "added" column holds seconds since the epoch - confirm.
+    Date added = new Date(Long.parseLong(fields[2]) * 1000);
+
+    // The id column (not the category column) is combined with the pass number.
+    // NOTE(review): assumes the first DocData argument is the document name - confirm.
+    return new DocData(fields[0] + "_" + iteration, fields[6], fields[5], null, added);
+  }
+
+  /**
+   * Opens the input file, validates its header line and leaves {@link #br}
+   * positioned on the first data line. {@link #br} is only assigned once the
+   * header has been accepted; on failure the file is closed again.
+   *
+   * @throws IOException if the file cannot be opened or read
+   * @throws RuntimeException if the file is empty or its first line does not
+   *         start with {@link #header}
+   */
+  private void openReader() throws IOException {
+    InputStream in = new FileInputStream(file);
+    boolean success = false;
+    try {
+      BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(in), "UTF8"));
+      String line = reader.readLine();
+      // readLine() returns null when the file has no lines at all.
+      if (line == null || !line.startsWith(header)) {
+        throw new RuntimeException("Invalid header: " + line);
+      }
+      br = reader;
+      success = true;
+    } finally {
+      if (!success) {
+        try {
+          in.close();
+        } catch (IOException ignored) {
+          // keep the exception that caused the failure
+        }
+      }
+    }
+  }
+
+  /**
+   * @return number of data lines read during the first pass over the file;
+   *         it no longer changes once the file has wrapped around
+   */
+  public int numUniqueTexts() {
+    return numUniqueTexts;
+  }
+
+}
Index: TPBQueryMaker.java
===================================================================
--- TPBQueryMaker.java (revision 0)
+++ TPBQueryMaker.java (revision 0)
@@ -0,0 +1,123 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.queryParser.MultiFieldQueryParser;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+
+import java.io.*;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.zip.GZIPInputStream;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * A QueryMaker using the TPB collection for its input.
+ * http://thepiratebay.org/tor/3783572/db_dump_and_query_log_from_piratebay.org__summer_of_2006
+ * <p>
+ * The input is a gzipped, tab separated text file whose first line starts
+ * with {@link #header}. The last column of every following line is parsed
+ * into a query against the title and body fields.
+ * <p>
+ * Config properties:
+ * <ul>
+ * <li>file=&lt;path to queries.txt.gz file | Default: tpb/queries.txt.gz&gt;</li>
+ * <li>path=&lt;used instead when <code>file</code> is not set&gt;</li>
+ * </ul>
+ */
+public class TPBQueryMaker extends AbstractQueryMaker {
+
+  /** Expected start of the first line of the input file; lists the columns in order. */
+  public static final String header = "session\ttime\tcategory\tquery";
+
+  /** Zero based index of the query column named in {@link #header}. */
+  private static final int QUERY_COLUMN = 3;
+
+  /**
+   * Initial capacity of the query list.
+   * NOTE(review): presumably chosen to match the number of queries in the dump - confirm.
+   */
+  private static final int INITIAL_CAPACITY = 3800000;
+
+  public TPBQueryMaker() {
+  }
+
+  /**
+   * Reads the query log and parses every query it contains.
+   * <p>
+   * The file is located through the <code>file</code> property; when that is
+   * not set the <code>path</code> property is consulted as before. Lines
+   * without a query column and queries the parser rejects are skipped.
+   *
+   * @return the parsed queries in file order
+   * @throws Exception if the file cannot be read, is empty, or its first line
+   *         does not start with {@link #header}
+   */
+  protected Query[] prepareQueries() throws Exception {
+    // "file" is the documented key; "path" is kept as a fallback because it is
+    // the key this class read originally.
+    File file = new File(config.get("file", config.get("path", "tpb/queries.txt.gz")));
+
+    // Per-field boosts handed to the parser: title 1.0, body 0.3.
+    Map/*<String, Float>*/ boosts = new HashMap/*<String, Float>*/(2);
+    boosts.put(BasicDocMaker.TITLE_FIELD, Float.valueOf(1f));
+    boosts.put(BasicDocMaker.BODY_FIELD, Float.valueOf(0.3f));
+
+    MultiFieldQueryParser qp = new MultiFieldQueryParser(
+        new String[]{BasicDocMaker.TITLE_FIELD, BasicDocMaker.BODY_FIELD}, new StandardAnalyzer(), boosts);
+
+    List/*<Query>*/ queries = new ArrayList/*<Query>*/(INITIAL_CAPACITY);
+
+    InputStream in = new FileInputStream(file);
+    BufferedReader br = null;
+    try {
+      br = new BufferedReader(new InputStreamReader(new GZIPInputStream(in), "UTF8"));
+      String line = br.readLine();
+      // readLine() returns null when the file has no lines at all.
+      if (line == null || !line.startsWith(header)) {
+        throw new Exception("Incompatible header: " + line + "\n Expected " + header);
+      }
+
+      while ((line = br.readLine()) != null) {
+        String[] args = line.split("\t");
+        if (args.length <= QUERY_COLUMN) {
+          continue; // no query text on this line
+        }
+        try {
+          queries.add(qp.parse(args[QUERY_COLUMN]));
+        } catch (Exception e) {
+          // Deliberately best effort: a badly formatted query is skipped
+          // rather than aborting the whole run.
+        }
+      }
+    } finally {
+      // Runs on every path, so the file is released even when the header check fails.
+      if (br != null) {
+        br.close();
+      } else {
+        in.close();
+      }
+    }
+
+    return (Query[]) queries.toArray(new Query[queries.size()]);
+  }
+
+}