Index: TPBDocMaker.java =================================================================== --- TPBDocMaker.java (revision 0) +++ TPBDocMaker.java (revision 0) @@ -0,0 +1,79 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +import org.apache.lucene.benchmark.byTask.utils.Config; + +import java.io.*; +import java.util.Date; +import java.util.zip.GZIPInputStream; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * A DocMaker using the TPB collection for its input. + * http://thepiratebay.org/tor/3783572/db_dump_and_query_log_from_piratebay.org__summer_of_2006 + *

+ * Config properties:
+ * path - gzipped, tab-separated torrent dump to read (default: tpb/torrents.txt.gz)

+ */ +public class TPBDocMaker extends BasicDocMaker { + + private BufferedReader br; + private File file; + + public static final String header = "id\tcategory\tadded\tuploader\tdownloads\tname\tdescription"; + + int numUniqueTexts = 0; + + public void setConfig(Config config) { + super.setConfig(config); + file = new File(config.get("path", "tpb/torrents.txt.gz")); + } + + private int iteration = 0; + + protected DocData getNextDocData() throws Exception { + + if (br == null) { + openReader(); + } + String line = br.readLine(); + if (line == null) { + br.close(); + openReader(); + iteration++; + } else if (iteration == 0) { + numUniqueTexts++; + } + + String[] fields = line.split("\t"); + + return new DocData(fields[1] + "_" + iteration, fields[6], fields[5], null, new Date(Long.valueOf(fields[2]) * 1000)); + } + + private void openReader() throws IOException { + br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)), "UTF8")); + String line = br.readLine(); + if (!line.startsWith(header)) { + throw new RuntimeException("Invalid header: "+ line); + } + } + + public int numUniqueTexts() { + return numUniqueTexts; + } + +} Index: TPBQueryMaker.java =================================================================== --- TPBQueryMaker.java (revision 0) +++ TPBQueryMaker.java (revision 0) @@ -0,0 +1,76 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.queryParser.MultiFieldQueryParser; +import org.apache.lucene.analysis.standard.StandardAnalyzer; + +import java.io.*; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.HashMap; +import java.util.zip.GZIPInputStream; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * A QueryMaker using the TPB collection for its input. + * http://thepiratebay.org/tor/3783572/db_dump_and_query_log_from_piratebay.org__summer_of_2006 + *

+ * Config properties:
+ * path - gzipped, tab-separated query log to read (default: tpb/queries.txt.gz)

+ */ +public class TPBQueryMaker extends AbstractQueryMaker { + + public TPBQueryMaker() { + } + + public static final String header = "session\ttime\tcategory\tquery"; + + protected Query[] prepareQueries() throws Exception { + File file = new File(config.get("path", "tpb/queries.txt.gz")); + BufferedReader br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)), "UTF8")); + String line = br.readLine(); + if (!line.startsWith(header)) { + throw new Exception("Incompatible header: " + line + "\n Expected " + header); + } + + Map/**/ boosts = new HashMap/**/(2); + boosts.put(BasicDocMaker.TITLE_FIELD, 1f); + boosts.put(BasicDocMaker.BODY_FIELD, 0.3f); + + MultiFieldQueryParser qp = new MultiFieldQueryParser(new String[]{BasicDocMaker.TITLE_FIELD, BasicDocMaker.BODY_FIELD}, new StandardAnalyzer(), boosts); + + List/**/ queries = new ArrayList/**/(3800000); + + while ((line = br.readLine()) != null) { + String[] args = line.split("\t"); + try { + queries.add(qp.parse(args[3])); + } catch (Exception e) { + // ignore bad formatted query + } + } + br.close(); + + return (Query[])queries.toArray(new Query[0]); + } + +}