diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 08aadf7..e95fe68 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -156,7 +156,11 @@ New Features
* LUCENE-4290: Added PostingsHighlighter to the sandbox module. It uses
offsets from the postings lists to highlight documents. (Robert Muir)
-
+
+* LUCENE-4628: Added CommonTermsQuery that executes high-frequency terms
+ in an optional sub-query to prevent slow queries due to "common" terms
+ like stopwords. (Simon Willnauer)
+
API Changes
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java
new file mode 100644
index 0000000..53ed403
--- /dev/null
+++ b/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java
@@ -0,0 +1,364 @@
+package org.apache.lucene.queries;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.util.ToStringUtils;
+
+/**
+ * A query that executes high-frequency terms in an optional sub-query to prevent
+ * slow queries due to "common" terms like stopwords. This query basically
+ * builds 2 queries off the {@link #add(Term) added} terms where low-frequency
+ * terms are added to a required boolean clause and high-frequency terms are
+ * added to an optional boolean clause. The optional clause is only executed if
+ * the required "low-frequency" clause matches. Scores produced by this query
+ * will be slightly different from a plain {@link BooleanQuery} scorer mainly due to
+ * differences in the {@link Similarity#coord(int,int) number of leaf queries}
+ * in the required boolean clause. In most cases high-frequency terms are
+ * unlikely to significantly contribute to the document score unless at least
+ * one of the low-frequency terms is matched, so this query can improve
+ * query execution times significantly if applicable.
+ * <p>
+ * {@link CommonTermsQuery} has several advantages over stopword filtering at
+ * index or query time since a term can be "classified" based on the actual
+ * document frequency in the index and can prevent slow queries even across
+ * domains without specialized stopword files.
+ * </p>
+ * <p>
+ * <b>Note:</b> if the query only contains high-frequency terms the query is
+ * rewritten into a plain conjunction query, i.e. all high-frequency terms need to
+ * match in order to match a document.
+ * </p>
+ */
+public class CommonTermsQuery extends Query {
+ /*
+ * TODO maybe it would make sense to abstract this even further and allow to
+ * rewrite to dismax rather than boolean. Yet, this can already be subclassed
+ * to do so.
+ */
+  protected final List<Term> terms = new ArrayList<Term>(); // in insertion order, filled by add(Term)
+  protected final boolean disableCoord; // passed to the low / high frequency sub-queries
+  protected final float maxTermFrequency; // docFreq threshold: absolute if >= 1, else fraction of maxDoc
+  protected final Occur lowFreqOccur; // occur of the low frequency term clauses
+  protected final Occur highFreqOccur; // occur of the high frequency term clauses
+  protected float lowFreqBoost = 1.0f; // boost set on the low frequency sub-query
+  protected float highFreqBoost = 1.0f; // boost set on the high frequency sub-query
+  protected int minNrShouldMatch = 0; // only applied when lowFreqOccur is SHOULD
+
+ /**
+ * Creates a new {@link CommonTermsQuery} with coord enabled on the sub-queries
+ *
+ * @param highFreqOccur
+ * {@link Occur} used for high frequency terms
+ * @param lowFreqOccur
+ * {@link Occur} used for low frequency terms
+ * @param maxTermFrequency
+ * a fraction of maxDoc in [0..1) (or absolute number >=1) representing
+ * the maximum threshold of a term's document frequency to be considered
+ * a low frequency term.
+ * @throws IllegalArgumentException
+ * if {@link Occur#MUST_NOT} is passed as lowFreqOccur or
+ * highFreqOccur
+ */
+ public CommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur,
+ float maxTermFrequency) {
+ this(highFreqOccur, lowFreqOccur, maxTermFrequency, false);
+ }
+
+ /**
+ * Creates a new {@link CommonTermsQuery}
+ *
+ * @param highFreqOccur
+ * {@link Occur} used for high frequency terms
+ * @param lowFreqOccur
+ * {@link Occur} used for low frequency terms
+ * @param maxTermFrequency
+ * a fraction of maxDoc in [0..1) (or absolute number >=1) representing
+ * the maximum threshold of a term's document frequency to be considered
+ * a low frequency term.
+ * @param disableCoord
+ * disables {@link Similarity#coord(int,int)} in scoring for the low
+ * / high frequency sub-queries
+ * @throws IllegalArgumentException
+ * if {@link Occur#MUST_NOT} is passed as lowFreqOccur or
+ * highFreqOccur
+ */
+ public CommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur,
+ float maxTermFrequency, boolean disableCoord) {
+ if (highFreqOccur == Occur.MUST_NOT) {
+ throw new IllegalArgumentException(
+ "highFreqOccur should be MUST or SHOULD but was MUST_NOT");
+ }
+ if (lowFreqOccur == Occur.MUST_NOT) {
+ throw new IllegalArgumentException(
+ "lowFreqOccur should be MUST or SHOULD but was MUST_NOT");
+ }
+ this.disableCoord = disableCoord;
+ this.highFreqOccur = highFreqOccur;
+ this.lowFreqOccur = lowFreqOccur;
+ this.maxTermFrequency = maxTermFrequency;
+ }
+
+ /**
+ * Adds a term to the {@link CommonTermsQuery}
+ *
+ * @param term the term to add, must not be null
+ * @throws IllegalArgumentException if the term is null
+ */
+ public void add(Term term) {
+ if (term == null) {
+ throw new IllegalArgumentException("Term must not be null");
+ }
+ this.terms.add(term);
+ }
+
+  @Override
+  public Query rewrite(IndexReader reader) throws IOException {
+    // zero or one term: no frequency classification is needed
+    if (this.terms.isEmpty()) {
+      return new BooleanQuery();
+    } else if (this.terms.size() == 1) {
+      final TermQuery tq = new TermQuery(this.terms.get(0));
+      tq.setBoost(getBoost());
+      return tq;
+    }
+    final List<AtomicReaderContext> leaves = reader.leaves();
+    final TermContext[] contextArray = new TermContext[terms.size()];
+    final Term[] queryTerms = this.terms.toArray(new Term[0]);
+    collectTermContext(reader, leaves, contextArray, queryTerms); // fills contextArray per term
+    return buildQuery(reader.maxDoc(), contextArray, queryTerms);
+  }
+
+ protected Query buildQuery(final int maxDoc,
+ final TermContext[] contextArray, final Term[] queryTerms) {
+ BooleanQuery lowFreq = new BooleanQuery(disableCoord);
+ BooleanQuery highFreq = new BooleanQuery(disableCoord);
+ highFreq.setBoost(highFreqBoost);
+ lowFreq.setBoost(lowFreqBoost);
+ if (lowFreqOccur == Occur.SHOULD) {
+ lowFreq.setMinimumNumberShouldMatch(minNrShouldMatch);
+ }
+ BooleanQuery query = new BooleanQuery(true);
+ for (int i = 0; i < queryTerms.length; i++) {
+ TermContext termContext = contextArray[i];
+ if (termContext == null) {
+ lowFreq.add(new TermQuery(queryTerms[i]), lowFreqOccur);
+ } else {
+ if ((maxTermFrequency >= 1f && termContext.docFreq() > maxTermFrequency)
+ || (termContext.docFreq() > (int) Math.ceil(maxTermFrequency
+ * (float) maxDoc))) {
+ highFreq
+ .add(new TermQuery(queryTerms[i], termContext), highFreqOccur);
+ } else {
+ lowFreq.add(new TermQuery(queryTerms[i], termContext), lowFreqOccur);
+ }
+ }
+
+ }
+ if (lowFreq.clauses().isEmpty()) {
+ /*
+ * if lowFreq is empty we rewrite the high freq terms in a conjunction to
+ * prevent slow queries.
+ */
+ if (highFreqOccur == Occur.MUST) {
+ highFreq.setBoost(getBoost());
+ return highFreq;
+ } else {
+ BooleanQuery highFreqConjunction = new BooleanQuery();
+ for (BooleanClause booleanClause : highFreq) {
+ highFreqConjunction.add(booleanClause.getQuery(), Occur.MUST);
+ }
+ highFreqConjunction.setBoost(getBoost());
+ return highFreqConjunction;
+
+ }
+ } else if (highFreq.clauses().isEmpty()) {
+ // only do low freq terms - we don't have high freq terms
+ lowFreq.setBoost(getBoost());
+ return lowFreq;
+ } else {
+ query.add(highFreq, Occur.SHOULD);
+ query.add(lowFreq, Occur.MUST);
+ query.setBoost(getBoost());
+ return query;
+ }
+ }
+
+  /**
+   * Looks up every query term in every leaf and accumulates its term states
+   * and statistics into {@code contextArray} (parallel to {@code queryTerms});
+   * entries stay untouched for terms that are not found in any leaf.
+   */
+  public void collectTermContext(IndexReader reader,
+      List<AtomicReaderContext> leaves, TermContext[] contextArray,
+      Term[] queryTerms) throws IOException {
+    TermsEnum termsEnum = null;
+    for (AtomicReaderContext context : leaves) {
+      final Fields fields = context.reader().fields();
+      if (fields == null) {
+        continue; // reader has no fields
+      }
+      for (int i = 0; i < queryTerms.length; i++) {
+        final Term term = queryTerms[i];
+        final Terms fieldTerms = fields.terms(term.field());
+        if (fieldTerms == null) {
+          continue; // field does not exist
+        }
+        termsEnum = fieldTerms.iterator(termsEnum); // hands the previous enum back for reuse
+        assert termsEnum != null;
+        if (termsEnum == TermsEnum.EMPTY || !termsEnum.seekExact(term.bytes(), false)) {
+          continue;
+        }
+        // the first leaf containing the term creates the context, later leaves register into it
+        if (contextArray[i] == null) {
+          contextArray[i] = new TermContext(reader.getContext(),
+              termsEnum.termState(), context.ord, termsEnum.docFreq(),
+              termsEnum.totalTermFreq());
+        } else {
+          contextArray[i].register(termsEnum.termState(), context.ord,
+              termsEnum.docFreq(), termsEnum.totalTermFreq());
+        }
+      }
+    }
+  }
+
+ /**
+ * Returns true iff {@link Similarity#coord(int,int)} is disabled in scoring
+ * for the high and low frequency query instances. The top level query
+ * always disables coords.
+ *
+ * @see #CommonTermsQuery(Occur, Occur, float, boolean)
+ */
+ public boolean isCoordDisabled() {
+ return disableCoord;
+ }
+
+ /**
+ * Specifies a minimum number of the optional BooleanClauses which must be
+ * satisfied in order to produce a match on the low frequency terms query
+ * part. The value is only applied if the low frequency terms use
+ * {@link Occur#SHOULD}.
+ * <p>
+ * By default no optional clauses are necessary for a match (unless there are
+ * no required clauses). If this method is used, then the specified number of
+ * clauses is required.
+ * </p>
+ *
+ * @param min
+ * the number of optional clauses that must match
+ */
+ public void setMinimumNumberShouldMatch(int min) {
+ this.minNrShouldMatch = min;
+ }
+
+ /**
+ * Gets the minimum number of the optional BooleanClauses which must be
+ * satisfied on the low frequency terms query part.
+ */
+ public int getMinimumNumberShouldMatch() {
+ return minNrShouldMatch;
+ }
+
+  @Override
+  public void extractTerms(Set<Term> terms) {
+    terms.addAll(this.terms); // every added term, regardless of its frequency class
+  }
+
+  @Override
+  public String toString(String field) {
+    // layout: "(t1, t2, ...)~minShouldMatch^boost", parentheses only if a suffix follows
+    final float boost = getBoost();
+    final int minShouldMatch = getMinimumNumberShouldMatch();
+    final boolean wrap = boost != 1.0 || minShouldMatch > 0;
+    final StringBuilder out = new StringBuilder();
+    if (wrap) {
+      out.append("(");
+    }
+    String separator = "";
+    for (Term t : terms) {
+      out.append(separator).append(new TermQuery(t).toString());
+      separator = ", ";
+    }
+    if (wrap) {
+      out.append(")");
+    }
+    if (minShouldMatch > 0) {
+      out.append('~').append(minShouldMatch);
+    }
+    if (boost != 1.0f) {
+      out.append(ToStringUtils.boost(boost));
+    }
+    return out.toString();
+  }
+
+  /**
+   * Combines the superclass hash code with all fields of this class.
+   */
+  @Override
+  public int hashCode() {
+    int h = super.hashCode();
+    h = 31 * h + (disableCoord ? 1231 : 1237);
+    h = 31 * h + Float.floatToIntBits(highFreqBoost);
+    h = 31 * h + (highFreqOccur != null ? highFreqOccur.hashCode() : 0);
+    h = 31 * h + Float.floatToIntBits(lowFreqBoost);
+    h = 31 * h + (lowFreqOccur != null ? lowFreqOccur.hashCode() : 0);
+    h = 31 * h + Float.floatToIntBits(maxTermFrequency);
+    h = 31 * h + minNrShouldMatch;
+    h = 31 * h + (terms != null ? terms.hashCode() : 0);
+    return h;
+  }
+
+  /**
+   * Equal when the superclass state, the runtime class and all fields of this class match.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!super.equals(obj) || getClass() != obj.getClass()) {
+      return false;
+    }
+    final CommonTermsQuery that = (CommonTermsQuery) obj;
+    return disableCoord == that.disableCoord
+        && Float.floatToIntBits(highFreqBoost) == Float.floatToIntBits(that.highFreqBoost)
+        && highFreqOccur == that.highFreqOccur
+        && Float.floatToIntBits(lowFreqBoost) == Float.floatToIntBits(that.lowFreqBoost)
+        && lowFreqOccur == that.lowFreqOccur
+        && Float.floatToIntBits(maxTermFrequency) == Float.floatToIntBits(that.maxTermFrequency)
+        && minNrShouldMatch == that.minNrShouldMatch
+        && (terms == null ? that.terms == null : terms.equals(that.terms));
+  }
+
+}
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java b/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java
new file mode 100644
index 0000000..c551de7
--- /dev/null
+++ b/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java
@@ -0,0 +1,328 @@
+package org.apache.lucene.queries;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.SlowCompositeReaderWrapper;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.QueryUtils;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LineFileDocs;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.lucene.util._TestUtil;
+
+public class CommonTermsQueryTest extends LuceneTestCase {
+
+  public void testBasics() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    String[] docs = new String[] {"this is the end of the world right",
+        "is this it or maybe not",
+        "this is the end of the universe as we know it",
+        "there is the famous restaurant at the end of the universe",};
+    for (int i = 0; i < docs.length; i++) {
+      Document doc = new Document();
+      doc.add(newStringField("id", "" + i, Field.Store.YES));
+      doc.add(newTextField("field", docs[i], Field.Store.NO));
+      w.addDocument(doc);
+    }
+    // with these 4 docs both thresholds (2.0f absolute, 0.5f relative) make docFreq > 2 high freq
+    IndexReader r = w.getReader();
+    IndexSearcher s = newSearcher(r);
+    { // low and high freq terms, both optional
+      CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,
+          random().nextBoolean() ? 2.0f : 0.5f);
+      query.add(new Term("field", "is"));
+      query.add(new Term("field", "this"));
+      query.add(new Term("field", "end"));
+      query.add(new Term("field", "world"));
+      query.add(new Term("field", "universe"));
+      query.add(new Term("field", "right"));
+      TopDocs search = s.search(query, 10);
+      assertEquals(3, search.totalHits);
+      assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
+      assertEquals("2", r.document(search.scoreDocs[1].doc).get("id"));
+      assertEquals("3", r.document(search.scoreDocs[2].doc).get("id"));
+    }
+
+    { // only high freq
+      CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,
+          random().nextBoolean() ? 2.0f : 0.5f);
+      query.add(new Term("field", "is"));
+      query.add(new Term("field", "this"));
+      query.add(new Term("field", "end"));
+      TopDocs search = s.search(query, 10);
+      assertEquals(2, search.totalHits);
+      assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
+      assertEquals("2", r.document(search.scoreDocs[1].doc).get("id"));
+    }
+
+    { // low freq is mandatory
+      CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.MUST,
+          random().nextBoolean() ? 2.0f : 0.5f);
+      query.add(new Term("field", "is"));
+      query.add(new Term("field", "this"));
+      query.add(new Term("field", "end"));
+      query.add(new Term("field", "world"));
+
+      TopDocs search = s.search(query, 10);
+      assertEquals(1, search.totalHits);
+      assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
+    }
+
+    { // low freq is mandatory, no high freq terms
+      CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.MUST,
+          random().nextBoolean() ? 2.0f : 0.5f);
+      query.add(new Term("field", "restaurant"));
+      query.add(new Term("field", "universe"));
+
+      TopDocs search = s.search(query, 10);
+      assertEquals(1, search.totalHits);
+      assertEquals("3", r.document(search.scoreDocs[0].doc).get("id"));
+    }
+
+    r.close();
+    w.close();
+    dir.close();
+  }
+
+ public void testEqualsHashCode() {
+ CommonTermsQuery query = new CommonTermsQuery(randomOccur(random()),
+ randomOccur(random()), random().nextFloat(), random().nextBoolean());
+ int terms = atLeast(2);
+ for (int i = 0; i < terms; i++) {
+ query.add(new Term(_TestUtil.randomRealisticUnicodeString(random()),
+ _TestUtil.randomRealisticUnicodeString(random())));
+ }
+ QueryUtils.checkHashEquals(query);
+ QueryUtils.checkUnequal(new CommonTermsQuery(randomOccur(random()),
+ randomOccur(random()), random().nextFloat(), random().nextBoolean()),
+ query); // a query without terms must differ from one with terms
+ // two queries built from identically seeded randoms must be equal
+ {
+ final long seed = random().nextLong();
+ Random r = new Random(seed);
+ CommonTermsQuery left = new CommonTermsQuery(randomOccur(r),
+ randomOccur(r), r.nextFloat(), r.nextBoolean());
+ int leftTerms = atLeast(r, 2);
+ for (int i = 0; i < leftTerms; i++) {
+ left.add(new Term(_TestUtil.randomRealisticUnicodeString(r), _TestUtil
+ .randomRealisticUnicodeString(r)));
+ }
+
+ r = new Random(seed);
+ CommonTermsQuery right = new CommonTermsQuery(randomOccur(r),
+ randomOccur(r), r.nextFloat(), r.nextBoolean());
+ int rightTerms = atLeast(r, 2);
+ for (int i = 0; i < rightTerms; i++) {
+ right.add(new Term(_TestUtil.randomRealisticUnicodeString(r), _TestUtil
+ .randomRealisticUnicodeString(r)));
+ }
+ QueryUtils.checkEqual(left, right);
+ }
+ }
+
+ private static Occur randomOccur(Random random) { // picks MUST or SHOULD, never MUST_NOT
+ return random.nextBoolean() ? Occur.MUST : Occur.SHOULD;
+ }
+
+  public void testNullTerm() {
+    final Random rnd = random();
+    final CommonTermsQuery query = new CommonTermsQuery(randomOccur(rnd),
+        randomOccur(rnd), random().nextFloat());
+    try {
+      query.add(null);
+      fail("null values are not supported");
+    } catch (IllegalArgumentException expected) {
+      // expected: add(Term) rejects null
+    }
+  }
+
+  public void testIllegalOccur() {
+    Random random = random();
+    // MUST_NOT has to be rejected in either constructor position
+    try {
+      new CommonTermsQuery(Occur.MUST_NOT, randomOccur(random), random()
+          .nextFloat());
+      fail("MUST_NOT is not supported");
+    } catch (IllegalArgumentException ex) {
+      // expected: highFreqOccur must be MUST or SHOULD
+    }
+    try {
+      new CommonTermsQuery(randomOccur(random), Occur.MUST_NOT, random()
+          .nextFloat());
+      fail("MUST_NOT is not supported");
+    } catch (IllegalArgumentException ex) {
+      // expected: lowFreqOccur must be MUST or SHOULD
+    }
+  }
+
+ public void testRandomIndex() throws IOException {
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+ createRandomIndex(atLeast(50), w, random().nextLong());
+ DirectoryReader reader = w.getReader();
+ SlowCompositeReaderWrapper wrapper = new SlowCompositeReaderWrapper(reader);
+ String field = "body";
+ Terms terms = wrapper.terms(field);
+ PriorityQueue lowFreqQueue = new PriorityQueue(
+ 5) {
+
+ @Override
+ protected boolean lessThan(TermAndFreq a, TermAndFreq b) {
+ return a.freq > b.freq;
+ }
+
+ };
+ PriorityQueue highFreqQueue = new PriorityQueue(
+ 5) {
+
+ @Override
+ protected boolean lessThan(TermAndFreq a, TermAndFreq b) {
+ return a.freq < b.freq;
+ }
+
+ };
+ try {
+ TermsEnum iterator = terms.iterator(null);
+ while (iterator.next() != null) {
+ if (highFreqQueue.size() < 5) {
+ highFreqQueue.add(new TermAndFreq(
+ BytesRef.deepCopyOf(iterator.term()), iterator.docFreq()));
+ lowFreqQueue.add(new TermAndFreq(
+ BytesRef.deepCopyOf(iterator.term()), iterator.docFreq()));
+ } else {
+ if (highFreqQueue.top().freq < iterator.docFreq()) {
+ highFreqQueue.top().freq = iterator.docFreq();
+ highFreqQueue.top().term = BytesRef.deepCopyOf(iterator.term());
+ highFreqQueue.updateTop();
+ }
+
+ if (lowFreqQueue.top().freq > iterator.docFreq()) {
+ lowFreqQueue.top().freq = iterator.docFreq();
+ lowFreqQueue.top().term = BytesRef.deepCopyOf(iterator.term());
+ lowFreqQueue.updateTop();
+ }
+ }
+ }
+ int lowFreq = lowFreqQueue.top().freq;
+ int highFreq = highFreqQueue.top().freq;
+ assumeTrue("unlucky index", highFreq - 1 > lowFreq);
+ List highTerms = queueToList(highFreqQueue);
+ List lowTerms = queueToList(lowFreqQueue);
+
+ IndexSearcher searcher = new IndexSearcher(reader);
+ Occur lowFreqOccur = randomOccur(random());
+ BooleanQuery verifyQuery = new BooleanQuery();
+ CommonTermsQuery cq = new CommonTermsQuery(randomOccur(random()),
+ lowFreqOccur, highFreq - 1, random().nextBoolean());
+ for (TermAndFreq termAndFreq : lowTerms) {
+ cq.add(new Term(field, termAndFreq.term));
+ verifyQuery.add(new BooleanClause(new TermQuery(new Term(field,
+ termAndFreq.term)), lowFreqOccur));
+ }
+ for (TermAndFreq termAndFreq : highTerms) {
+ cq.add(new Term(field, termAndFreq.term));
+ }
+
+ TopDocs cqSearch = searcher.search(cq, reader.maxDoc());
+ QueryUtils.check(random(), cq, searcher);
+
+ TopDocs verifySearch = searcher.search(verifyQuery, reader.maxDoc());
+ assertEquals(verifySearch.totalHits, cqSearch.totalHits);
+ Set hits = new HashSet();
+ for (ScoreDoc doc : verifySearch.scoreDocs) {
+ hits.add(doc.doc);
+ }
+
+ for (ScoreDoc doc : cqSearch.scoreDocs) {
+ assertTrue(hits.remove(doc.doc));
+ }
+
+ assertTrue(hits.isEmpty());
+ } finally {
+ reader.close();
+ wrapper.close();
+ w.close();
+ dir.close();
+ }
+
+ }
+
+  private static List<TermAndFreq> queueToList(PriorityQueue<TermAndFreq> queue) {
+    List<TermAndFreq> terms = new ArrayList<TermAndFreq>();
+    while (queue.size() > 0) { // drains the queue, keeping pop() order
+      terms.add(queue.pop());
+    }
+    return terms;
+  }
+
+ /** Mutable pair of a term and its document frequency; entry type of the test's queues. */
+ private static class TermAndFreq {
+ BytesRef term;
+ int freq;
+
+ public TermAndFreq(BytesRef term, int freq) {
+ this.term = term;
+ this.freq = freq;
+ }
+
+ }
+
+  /**
+   * populates a writer with random stuff. this must be fully reproducible with
+   * the seed!
+   */
+  public static void createRandomIndex(int numdocs, RandomIndexWriter writer,
+      long seed) throws IOException {
+    Random random = new Random(seed);
+    // primary source for our data is from linefiledocs, its realistic.
+    LineFileDocs lineFileDocs = new LineFileDocs(random);
+    try {
+      // TODO: we should add other fields that use things like docs&freqs but omit
+      // positions, because linefiledocs doesn't cover all the possibilities.
+      for (int i = 0; i < numdocs; i++) {
+        writer.addDocument(lineFileDocs.nextDoc());
+      }
+    } finally {
+      lineFileDocs.close(); // release the line file docs even if indexing fails
+    }
+  }
+}