diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 08aadf7..e95fe68 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -156,7 +156,11 @@ New Features * LUCENE-4290: Added PostingsHighlighter to the sandbox module. It uses offsets from the postings lists to highlight documents. (Robert Muir) - + +* LUCENE-4628: Added CommonTermsQuery that executes high-frequency terms + in a optional sub-query to prevent slow queries due to "common" terms + like stopwords. (Simon Willnauer) + API Changes * LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries diff --git a/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java new file mode 100644 index 0000000..53ed403 --- /dev/null +++ b/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java @@ -0,0 +1,364 @@ +package org.apache.lucene.queries; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.util.ToStringUtils; + +/** + * A query that executes high-frequency terms in a optional sub-query to prevent + * slow queries due to "common" terms like stopwords. This query basically + * builds 2 queries off the {@link #add(Term) added} terms where low-frequency + * terms are added to a required boolean clause and high-frequency terms are + * added to an optional boolean clause. The optional clause is only executed if + * the required "low-frequency' clause matches. Scores produced by this query + * will be slightly different to plain {@link BooleanQuery} scorer mainly due to + * differences in the {@link Similarity#coord(int,int) number of leave queries} + * in the required boolean clause. In the most cases high-frequency terms are + * unlikely to significantly contribute to the document score unless at least + * one of the low-frequency terms are matched such that this query can improve + * query execution times significantly if applicable. + *

+ * {@link CommonTermsQuery} has several advantages over stopword filtering at + * index or query time since a term can be "classified" based on the actual + * document frequency in the index and can prevent slow queries even across + * domains without specialized stopword files. + *

+ *

+ * Note: if the query only contains high-frequency terms the query is + * rewritten into a plain conjunction query ie. all high-frequency terms need to + * match in order to match a document. + *

+ */ +public class CommonTermsQuery extends Query { + /* + * TODO maybe it would make sense to abstract this even further and allow to + * rewrite to dismax rather than boolean. Yet, this can already be subclassed + * to do so. + */ + protected final List terms = new ArrayList(); + protected final boolean disableCoord; + protected final float maxTermFrequency; + protected final Occur lowFreqOccur; + protected final Occur highFreqOccur; + protected float lowFreqBoost = 1.0f; + protected float highFreqBoost = 1.0f; + protected int minNrShouldMatch = 0; + + /** + * Creates a new {@link CommonTermsQuery} + * + * @param highFreqOccur + * {@link Occur} used for high frequency terms + * @param lowFreqOccur + * {@link Occur} used for low frequency terms + * @param maxTermFrequency + * a value in [0..1] (or absolute number >=1) representing the + * maximum threshold of a terms document frequency to be considered a + * low frequency term. + * @throws IllegalArgumentException + * if {@link Occur#MUST_NOT} is pass as lowFreqOccur or + * highFreqOccur + */ + public CommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, + float maxTermFrequency) { + this(highFreqOccur, lowFreqOccur, maxTermFrequency, false); + } + + /** + * Creates a new {@link CommonTermsQuery} + * + * @param highFreqOccur + * {@link Occur} used for high frequency terms + * @param lowFreqOccur + * {@link Occur} used for low frequency terms + * @param maxTermFrequency + * a value in [0..1] (or absolute number >=1) representing the + * maximum threshold of a terms document frequency to be considered a + * low frequency term. + * @param disableCoord + * disables {@link Similarity#coord(int,int)} in scoring for the low + * / high frequency sub-queries + * @throws IllegalArgumentException + * if {@link Occur#MUST_NOT} is pass as lowFreqOccur or + * highFreqOccur + */ + public CommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, + float maxTermFrequency, boolean disableCoord) { + if (highFreqOccur == Occur.MUST_NOT) { + throw new IllegalArgumentException( + "highFreqOccur should be MUST or SHOULD but was MUST_NOT"); + } + if (lowFreqOccur == Occur.MUST_NOT) { + throw new IllegalArgumentException( + "lowFreqOccur should be MUST or SHOULD but was MUST_NOT"); + } + this.disableCoord = disableCoord; + this.highFreqOccur = highFreqOccur; + this.lowFreqOccur = lowFreqOccur; + this.maxTermFrequency = maxTermFrequency; + } + + /** + * Adds a term to the {@link CommonTermsQuery} + * + * @param term + * the term to add + */ + public void add(Term term) { + if (term == null) { + throw new IllegalArgumentException("Term must not be null"); + } + this.terms.add(term); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + if (this.terms.isEmpty()) { + return new BooleanQuery(); + } else if (this.terms.size() == 1) { + final TermQuery tq = new TermQuery(this.terms.get(0)); + tq.setBoost(getBoost()); + return tq; + } + final List leaves = reader.leaves(); + final int maxDoc = reader.maxDoc(); + final TermContext[] contextArray = new TermContext[terms.size()]; + final Term[] queryTerms = this.terms.toArray(new Term[0]); + collectTermContext(reader, leaves, contextArray, queryTerms); + return buildQuery(maxDoc, contextArray, queryTerms); + } + + protected Query buildQuery(final int maxDoc, + final TermContext[] contextArray, final Term[] queryTerms) { + BooleanQuery lowFreq = new BooleanQuery(disableCoord); + BooleanQuery highFreq = new BooleanQuery(disableCoord); + highFreq.setBoost(highFreqBoost); + lowFreq.setBoost(lowFreqBoost); + if (lowFreqOccur == Occur.SHOULD) { + lowFreq.setMinimumNumberShouldMatch(minNrShouldMatch); + } + BooleanQuery query = new BooleanQuery(true); + for (int i = 0; i < queryTerms.length; i++) { + TermContext termContext = contextArray[i]; + if (termContext == null) { + lowFreq.add(new TermQuery(queryTerms[i]), lowFreqOccur); + } else { + if ((maxTermFrequency >= 1f && termContext.docFreq() > maxTermFrequency) + || (termContext.docFreq() > (int) Math.ceil(maxTermFrequency + * (float) maxDoc))) { + highFreq + .add(new TermQuery(queryTerms[i], termContext), highFreqOccur); + } else { + lowFreq.add(new TermQuery(queryTerms[i], termContext), lowFreqOccur); + } + } + + } + if (lowFreq.clauses().isEmpty()) { + /* + * if lowFreq is empty we rewrite the high freq terms in a conjunction to + * prevent slow queries. + */ + if (highFreqOccur == Occur.MUST) { + highFreq.setBoost(getBoost()); + return highFreq; + } else { + BooleanQuery highFreqConjunction = new BooleanQuery(); + for (BooleanClause booleanClause : highFreq) { + highFreqConjunction.add(booleanClause.getQuery(), Occur.MUST); + } + highFreqConjunction.setBoost(getBoost()); + return highFreqConjunction; + + } + } else if (highFreq.clauses().isEmpty()) { + // only do low freq terms - we don't have high freq terms + lowFreq.setBoost(getBoost()); + return lowFreq; + } else { + query.add(highFreq, Occur.SHOULD); + query.add(lowFreq, Occur.MUST); + query.setBoost(getBoost()); + return query; + } + } + + public void collectTermContext(IndexReader reader, + List leaves, TermContext[] contextArray, + Term[] queryTerms) throws IOException { + TermsEnum termsEnum = null; + for (AtomicReaderContext context : leaves) { + final Fields fields = context.reader().fields(); + if (fields == null) { + // reader has no fields + continue; + } + for (int i = 0; i < queryTerms.length; i++) { + Term term = queryTerms[i]; + TermContext termContext = contextArray[i]; + final Terms terms = fields.terms(term.field()); + if (terms == null) { + // field does not exist + continue; + } + termsEnum = terms.iterator(termsEnum); + assert termsEnum != null; + + if (termsEnum == TermsEnum.EMPTY) continue; + if (termsEnum.seekExact(term.bytes(), false)) { + if (termContext == null) { + contextArray[i] = new TermContext(reader.getContext(), + termsEnum.termState(), context.ord, termsEnum.docFreq(), + termsEnum.totalTermFreq()); + } else { + termContext.register(termsEnum.termState(), context.ord, + termsEnum.docFreq(), termsEnum.totalTermFreq()); + } + + } + + } + } + } + + /** + * Returns true iff {@link Similarity#coord(int,int)} is disabled in scoring + * for the high and low frequency query instance. The top level query will + * always disable coords. + * + * @see #CommonTermsQuery(Occur, Occur, float, boolean) + */ + public boolean isCoordDisabled() { + return disableCoord; + } + + /** + * Specifies a minimum number of the optional BooleanClauses which must be + * satisfied in order to produce a match on the low frequency terms query + * part. + * + *

+ * By default no optional clauses are necessary for a match (unless there are + * no required clauses). If this method is used, then the specified number of + * clauses is required. + *

+ * + * @param min + * the number of optional clauses that must match + */ + public void setMinimumNumberShouldMatch(int min) { + this.minNrShouldMatch = min; + } + + /** + * Gets the minimum number of the optional BooleanClauses which must be + * satisfied. + */ + public int getMinimumNumberShouldMatch() { + return minNrShouldMatch; + } + + @Override + public void extractTerms(Set terms) { + terms.addAll(this.terms); + } + + @Override + public String toString(String field) { + StringBuilder buffer = new StringBuilder(); + boolean needParens = (getBoost() != 1.0) + || (getMinimumNumberShouldMatch() > 0); + if (needParens) { + buffer.append("("); + } + for (int i = 0; i < terms.size(); i++) { + Term t = terms.get(i); + buffer.append(new TermQuery(t).toString()); + + if (i != terms.size() - 1) buffer.append(", "); + } + if (needParens) { + buffer.append(")"); + } + if (getMinimumNumberShouldMatch() > 0) { + buffer.append('~'); + buffer.append(getMinimumNumberShouldMatch()); + } + if (getBoost() != 1.0f) { + buffer.append(ToStringUtils.boost(getBoost())); + } + return buffer.toString(); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + (disableCoord ? 1231 : 1237); + result = prime * result + Float.floatToIntBits(highFreqBoost); + result = prime * result + + ((highFreqOccur == null) ? 0 : highFreqOccur.hashCode()); + result = prime * result + Float.floatToIntBits(lowFreqBoost); + result = prime * result + + ((lowFreqOccur == null) ? 0 : lowFreqOccur.hashCode()); + result = prime * result + Float.floatToIntBits(maxTermFrequency); + result = prime * result + minNrShouldMatch; + result = prime * result + ((terms == null) ? 0 : terms.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (!super.equals(obj)) return false; + if (getClass() != obj.getClass()) return false; + CommonTermsQuery other = (CommonTermsQuery) obj; + if (disableCoord != other.disableCoord) return false; + if (Float.floatToIntBits(highFreqBoost) != Float + .floatToIntBits(other.highFreqBoost)) return false; + if (highFreqOccur != other.highFreqOccur) return false; + if (Float.floatToIntBits(lowFreqBoost) != Float + .floatToIntBits(other.lowFreqBoost)) return false; + if (lowFreqOccur != other.lowFreqOccur) return false; + if (Float.floatToIntBits(maxTermFrequency) != Float + .floatToIntBits(other.maxTermFrequency)) return false; + if (minNrShouldMatch != other.minNrShouldMatch) return false; + if (terms == null) { + if (other.terms != null) return false; + } else if (!terms.equals(other.terms)) return false; + return true; + } + +} diff --git a/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java b/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java new file mode 100644 index 0000000..c551de7 --- /dev/null +++ b/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java @@ -0,0 +1,328 @@ +package org.apache.lucene.queries; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Random; +import java.util.Set; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.SlowCompositeReaderWrapper; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.QueryUtils; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LineFileDocs; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.util._TestUtil; + +public class CommonTermsQueryTest extends LuceneTestCase { + + public void testBasics() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + String[] docs = new String[] {"this is the end of the world right", + "is this it or maybe not", + "this is the end of the universe as we know it", + "there is the famous restaurant at the end of the universe",}; + for (int i = 0; i < docs.length; i++) { + Document doc = new Document(); + doc.add(newStringField("id", "" + i, Field.Store.YES)); + doc.add(newTextField("field", docs[i], Field.Store.NO)); + w.addDocument(doc); + } + + IndexReader r = w.getReader(); + IndexSearcher s = newSearcher(r); + { + CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, + random().nextBoolean() ? 2.0f : 0.5f); + query.add(new Term("field", "is")); + query.add(new Term("field", "this")); + query.add(new Term("field", "end")); + query.add(new Term("field", "world")); + query.add(new Term("field", "universe")); + query.add(new Term("field", "right")); + TopDocs search = s.search(query, 10); + assertEquals(search.totalHits, 3); + assertEquals("0", r.document(search.scoreDocs[0].doc).get("id")); + assertEquals("2", r.document(search.scoreDocs[1].doc).get("id")); + assertEquals("3", r.document(search.scoreDocs[2].doc).get("id")); + } + + { // only high freq + CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, + random().nextBoolean() ? 2.0f : 0.5f); + query.add(new Term("field", "is")); + query.add(new Term("field", "this")); + query.add(new Term("field", "end")); + TopDocs search = s.search(query, 10); + assertEquals(search.totalHits, 2); + assertEquals("0", r.document(search.scoreDocs[0].doc).get("id")); + assertEquals("2", r.document(search.scoreDocs[1].doc).get("id")); + } + + { // low freq is mandatory + CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.MUST, + random().nextBoolean() ? 2.0f : 0.5f); + query.add(new Term("field", "is")); + query.add(new Term("field", "this")); + query.add(new Term("field", "end")); + query.add(new Term("field", "world")); + + TopDocs search = s.search(query, 10); + assertEquals(search.totalHits, 1); + assertEquals("0", r.document(search.scoreDocs[0].doc).get("id")); + } + + { // low freq is mandatory + CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.MUST, + random().nextBoolean() ? 2.0f : 0.5f); + query.add(new Term("field", "restaurant")); + query.add(new Term("field", "universe")); + + TopDocs search = s.search(query, 10); + assertEquals(search.totalHits, 1); + assertEquals("3", r.document(search.scoreDocs[0].doc).get("id")); + + } + r.close(); + w.close(); + dir.close(); + } + + public void testEqualsHashCode() { + CommonTermsQuery query = new CommonTermsQuery(randomOccur(random()), + randomOccur(random()), random().nextFloat(), random().nextBoolean()); + int terms = atLeast(2); + for (int i = 0; i < terms; i++) { + query.add(new Term(_TestUtil.randomRealisticUnicodeString(random()), + _TestUtil.randomRealisticUnicodeString(random()))); + } + QueryUtils.checkHashEquals(query); + QueryUtils.checkUnequal(new CommonTermsQuery(randomOccur(random()), + randomOccur(random()), random().nextFloat(), random().nextBoolean()), + query); + + { + final long seed = random().nextLong(); + Random r = new Random(seed); + CommonTermsQuery left = new CommonTermsQuery(randomOccur(r), + randomOccur(r), r.nextFloat(), r.nextBoolean()); + int leftTerms = atLeast(r, 2); + for (int i = 0; i < leftTerms; i++) { + left.add(new Term(_TestUtil.randomRealisticUnicodeString(r), _TestUtil + .randomRealisticUnicodeString(r))); + } + + r = new Random(seed); + CommonTermsQuery right = new CommonTermsQuery(randomOccur(r), + randomOccur(r), r.nextFloat(), r.nextBoolean()); + int rightTerms = atLeast(r, 2); + for (int i = 0; i < rightTerms; i++) { + right.add(new Term(_TestUtil.randomRealisticUnicodeString(r), _TestUtil + .randomRealisticUnicodeString(r))); + } + QueryUtils.checkEqual(left, right); + } + } + + private static Occur randomOccur(Random random) { + return random.nextBoolean() ? Occur.MUST : Occur.SHOULD; + } + + public void testNullTerm() { + Random random = random(); + CommonTermsQuery query = new CommonTermsQuery(randomOccur(random), + randomOccur(random), random().nextFloat()); + try { + query.add(null); + fail("null values are not supported"); + } catch (IllegalArgumentException ex) { + + } + } + + public void testIllegalOccur() { + Random random = random(); + + try { + new CommonTermsQuery(Occur.MUST_NOT, randomOccur(random), random() + .nextFloat()); + fail("MUST_NOT is not supproted"); + } catch (IllegalArgumentException ex) { + + } + try { + new CommonTermsQuery(randomOccur(random), Occur.MUST_NOT, random() + .nextFloat()); + fail("MUST_NOT is not supproted"); + } catch (IllegalArgumentException ex) { + + } + } + + public void testRandomIndex() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + createRandomIndex(atLeast(50), w, random().nextLong()); + DirectoryReader reader = w.getReader(); + SlowCompositeReaderWrapper wrapper = new SlowCompositeReaderWrapper(reader); + String field = "body"; + Terms terms = wrapper.terms(field); + PriorityQueue lowFreqQueue = new PriorityQueue( + 5) { + + @Override + protected boolean lessThan(TermAndFreq a, TermAndFreq b) { + return a.freq > b.freq; + } + + }; + PriorityQueue highFreqQueue = new PriorityQueue( + 5) { + + @Override + protected boolean lessThan(TermAndFreq a, TermAndFreq b) { + return a.freq < b.freq; + } + + }; + try { + TermsEnum iterator = terms.iterator(null); + while (iterator.next() != null) { + if (highFreqQueue.size() < 5) { + highFreqQueue.add(new TermAndFreq( + BytesRef.deepCopyOf(iterator.term()), iterator.docFreq())); + lowFreqQueue.add(new TermAndFreq( + BytesRef.deepCopyOf(iterator.term()), iterator.docFreq())); + } else { + if (highFreqQueue.top().freq < iterator.docFreq()) { + highFreqQueue.top().freq = iterator.docFreq(); + highFreqQueue.top().term = BytesRef.deepCopyOf(iterator.term()); + highFreqQueue.updateTop(); + } + + if (lowFreqQueue.top().freq > iterator.docFreq()) { + lowFreqQueue.top().freq = iterator.docFreq(); + lowFreqQueue.top().term = BytesRef.deepCopyOf(iterator.term()); + lowFreqQueue.updateTop(); + } + } + } + int lowFreq = lowFreqQueue.top().freq; + int highFreq = highFreqQueue.top().freq; + assumeTrue("unlucky index", highFreq - 1 > lowFreq); + List highTerms = queueToList(highFreqQueue); + List lowTerms = queueToList(lowFreqQueue); + + IndexSearcher searcher = new IndexSearcher(reader); + Occur lowFreqOccur = randomOccur(random()); + BooleanQuery verifyQuery = new BooleanQuery(); + CommonTermsQuery cq = new CommonTermsQuery(randomOccur(random()), + lowFreqOccur, highFreq - 1, random().nextBoolean()); + for (TermAndFreq termAndFreq : lowTerms) { + cq.add(new Term(field, termAndFreq.term)); + verifyQuery.add(new BooleanClause(new TermQuery(new Term(field, + termAndFreq.term)), lowFreqOccur)); + } + for (TermAndFreq termAndFreq : highTerms) { + cq.add(new Term(field, termAndFreq.term)); + } + + TopDocs cqSearch = searcher.search(cq, reader.maxDoc()); + QueryUtils.check(random(), cq, searcher); + + TopDocs verifySearch = searcher.search(verifyQuery, reader.maxDoc()); + assertEquals(verifySearch.totalHits, cqSearch.totalHits); + Set hits = new HashSet(); + for (ScoreDoc doc : verifySearch.scoreDocs) { + hits.add(doc.doc); + } + + for (ScoreDoc doc : cqSearch.scoreDocs) { + assertTrue(hits.remove(doc.doc)); + } + + assertTrue(hits.isEmpty()); + } finally { + reader.close(); + wrapper.close(); + w.close(); + dir.close(); + } + + } + + private static List queueToList(PriorityQueue queue) { + List terms = new ArrayList(); + while (queue.size() > 0) { + terms.add(queue.pop()); + } + return terms; + } + + private static class TermAndFreq { + BytesRef term; + int freq; + + public TermAndFreq(BytesRef term, int freq) { + this.term = term; + this.freq = freq; + + } + + } + + /** + * populates a writer with random stuff. this must be fully reproducable with + * the seed! + */ + public static void createRandomIndex(int numdocs, RandomIndexWriter writer, + long seed) throws IOException { + Random random = new Random(seed); + // primary source for our data is from linefiledocs, its realistic. + LineFileDocs lineFileDocs = new LineFileDocs(random); + + // TODO: we should add other fields that use things like docs&freqs but omit + // positions, + // because linefiledocs doesn't cover all the possibilities. + for (int i = 0; i < numdocs; i++) { + writer.addDocument(lineFileDocs.nextDoc()); + } + + lineFileDocs.close(); + } +}