Index: lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java =================================================================== --- lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java (revision 953479) +++ lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java (working copy) @@ -26,6 +26,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.MultiReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.store.RAMDirectory; @@ -45,15 +46,7 @@ IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer())); - //Add series of docs with filterable fields : url, text and dates flags - addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101"); - addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102"); - addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101"); - addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101"); - addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102"); - addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101"); - addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101"); - addDoc(writer, "http://lucene.apache.org", "Oops. 
Lucene 2.1 out", "20050102"); + addDocs(writer); writer.close(); reader=IndexReader.open(directory, true); @@ -61,6 +54,21 @@ } + private void addDocs(IndexWriter writer) throws IOException { + // Add series of docs with filterable fields : url, text and dates flags + addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", + "20040101"); + addDoc(writer, "http://lucene.apache.org", "New release pending", + "20040102"); + addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101"); + addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101"); + addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102"); + addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101"); + addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101"); + addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", + "20050102"); + } + @Override protected void tearDown() throws Exception { reader.close(); @@ -69,14 +77,16 @@ super.tearDown(); } - private void addDoc(IndexWriter writer, String url, String text, String date) throws IOException - { - Document doc=new Document(); - doc.add(new Field(KEY_FIELD,url,Field.Store.YES,Field.Index.NOT_ANALYZED)); - doc.add(new Field("text",text,Field.Store.YES,Field.Index.ANALYZED)); - doc.add(new Field("date",date,Field.Store.YES,Field.Index.ANALYZED)); - writer.addDocument(doc); - } + private void addDoc(IndexWriter writer, String url, String text, String date) + throws IOException { + Document doc = new Document(); + doc + .add(new Field(KEY_FIELD, url, Field.Store.YES, + Field.Index.NOT_ANALYZED)); + doc.add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("date", date, Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + } public void testDefaultFilter() throws Throwable { @@ -162,6 +172,36 @@ assertEquals("Duplicate urls should return first doc",lastDoc, hits[i].doc); } } + + public void 
testDuplicateFilterAcrossSegments() throws Exception { + RAMDirectory duplicates = new RAMDirectory(); + IndexWriter writer = new IndexWriter(duplicates, new IndexWriterConfig( + TEST_VERSION_CURRENT, new MockAnalyzer())); + + addDocs(writer); + + writer.close(); + reader = new MultiReader(new IndexReader[] { + IndexReader.open(directory, true), IndexReader.open(duplicates, true)}); + searcher = new IndexSearcher(reader); + + TopDocs docs; + + docs = searcher.search(new MatchAllDocsQuery(), null, 20); + assertEquals("Should be about 16 hits without the filter (just checking)", + 16, docs.totalHits); + + docs = searcher.search(new MatchAllDocsQuery(), new DuplicateFilter( + KEY_FIELD), 20); + assertEquals( + "Should only be two hits even though duplicates exist within and across segments", + 2, docs.totalHits); + + docs = searcher.search(new MatchAllDocsQuery(), new DuplicateFilter( + KEY_FIELD, DuplicateFilter.KM_USE_LAST_OCCURRENCE, + DuplicateFilter.PM_FULL_VALIDATION).setTopLevelReader(reader), 20); + assertEquals("Should only be two hits even though duplicates exist within and across segments", + 2, docs.totalHits); + } - } Index: lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java =================================================================== --- lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java (revision 940113) +++ lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java (working copy) @@ -1,4 +1,5 @@ package org.apache.lucene.search; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -17,214 +18,260 @@ */ import java.io.IOException; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.EmptyDocsEnum; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.OpenBitSet; -import org.apache.lucene.util.Bits; -public class DuplicateFilter extends Filter -{ - - String fieldName; - - /** - * KeepMode determines which document id to consider as the master, all others being - * identified as duplicates. Selecting the "first occurrence" can potentially save on IO. - */ - int keepMode=KM_USE_FIRST_OCCURRENCE; - public static final int KM_USE_FIRST_OCCURRENCE=1; - public static final int KM_USE_LAST_OCCURRENCE=2; - - /** - * "Full" processing mode starts by setting all bits to false and only setting bits - * for documents that contain the given field and are identified as none-duplicates. - - * "Fast" processing sets all bits to true then unsets all duplicate docs found for the - * given field. This approach avoids the need to read TermDocs for terms that are seen - * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially - * faster approach , the downside is that bitsets produced will include bits set for - * documents that do not actually contain the field given. 
- * - */ - int processingMode=PM_FULL_VALIDATION; - public static final int PM_FULL_VALIDATION=1; - public static final int PM_FAST_INVALIDATION=2; - - - - public DuplicateFilter(String fieldName) - { - this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION); - } - - - public DuplicateFilter(String fieldName, int keepMode, int processingMode) - { - this.fieldName = fieldName; - this.keepMode = keepMode; - this.processingMode = processingMode; - } - +public class DuplicateFilter extends StatefulFilter { + + String fieldName; + + /** + * KeepMode determines which document id to consider as the master, all others + * being identified as duplicates. Selecting the "first occurrence" can + * potentially save on IO. + */ + int keepMode = KM_USE_FIRST_OCCURRENCE; + public static final int KM_USE_FIRST_OCCURRENCE = 1; + public static final int KM_USE_LAST_OCCURRENCE = 2; + + /** + * "Full" processing mode starts by setting all bits to false and only setting + * bits for documents that contain the given field and are identified as + * non-duplicates. + * + * "Fast" processing sets all bits to true then unsets all duplicate docs + * found for the given field. This approach avoids the need to read TermDocs + * for terms that are seen to have a document frequency of exactly "1" (i.e. + * no duplicates). While a potentially faster approach, the downside is that + * bitsets produced will include bits set for documents that do not actually + * contain the field given. 
+ * + */ + int processingMode = PM_FULL_VALIDATION; + public static final int PM_FULL_VALIDATION = 1; + public static final int PM_FAST_INVALIDATION = 2; + + public DuplicateFilter(String fieldName) { + this(fieldName, KM_USE_FIRST_OCCURRENCE, PM_FULL_VALIDATION); + } + + public DuplicateFilter(String fieldName, int keepMode, int processingMode) { + this.fieldName = fieldName; + this.keepMode = keepMode; + this.processingMode = processingMode; + } + @Override - public DocIdSet getDocIdSet(IndexReader reader) throws IOException - { - if(processingMode==PM_FAST_INVALIDATION) - { - return fastBits(reader); - } - else - { - return correctBits(reader); - } - } - - private OpenBitSet correctBits(IndexReader reader) throws IOException { - OpenBitSet bits = new OpenBitSet(reader.maxDoc()); //assume all are INvalid + public DocIdSet getStatefulDocIdSet(IndexReader reader, + StatefulTermsEnum termsEnum) throws IOException { + if (processingMode == PM_FAST_INVALIDATION) { + return fastBits(reader, termsEnum); + } else { + return correctBits(reader, termsEnum); + } + } + + private OpenBitSet correctBits(IndexReader reader, StatefulTermsEnum termsEnum) + throws IOException { + OpenBitSet bits = new OpenBitSet(reader.maxDoc()); // assume all are INvalid final Bits delDocs = MultiFields.getDeletedDocs(reader); - Terms terms = reader.fields().terms(fieldName); - if (terms != null) { - TermsEnum termsEnum = terms.iterator(); - DocsEnum docs = null; - while(true) { - BytesRef currTerm = termsEnum.next(); - if (currTerm == null) { - break; - } else { - docs = termsEnum.docs(delDocs, docs); - int doc = docs.nextDoc(); - if (doc != docs.NO_MORE_DOCS) { - if (keepMode == KM_USE_FIRST_OCCURRENCE) { - bits.set(doc); - } else { - int lastDoc = doc; - while (true) { - lastDoc = doc; - doc = docs.nextDoc(); - if (doc == docs.NO_MORE_DOCS) { - break; - } + termsEnum.resetTermsEnum(reader, fieldName); + DocsEnum docs = null; + while (true) { + BytesRef currTerm = termsEnum.next(); + if 
(currTerm == null) { + break; + } else { + docs = termsEnum.docs(delDocs, docs); + int doc = docs.nextDoc(); + if (doc != DocsEnum.NO_MORE_DOCS) { + if (keepMode == KM_USE_FIRST_OCCURRENCE) { + bits.set(doc); + } else { + int lastDoc = doc; + while (true) { + lastDoc = doc; + doc = docs.nextDoc(); + if (doc == DocsEnum.NO_MORE_DOCS) { + break; } - bits.set(lastDoc); } + bits.set(lastDoc); } } } } return bits; } - - private OpenBitSet fastBits(IndexReader reader) throws IOException - { - - OpenBitSet bits=new OpenBitSet(reader.maxDoc()); - bits.set(0,reader.maxDoc()); //assume all are valid + + private OpenBitSet fastBits(IndexReader reader, StatefulTermsEnum termsEnum) + throws IOException { + + OpenBitSet bits = new OpenBitSet(reader.maxDoc()); + bits.set(0, reader.maxDoc()); // assume all are valid final Bits delDocs = MultiFields.getDeletedDocs(reader); - Terms terms = reader.fields().terms(fieldName); - if (terms != null) { - TermsEnum termsEnum = terms.iterator(); - DocsEnum docs = null; - while(true) { - BytesRef currTerm = termsEnum.next(); - if (currTerm == null) { - break; - } else { - if (termsEnum.docFreq() > 1) { - // unset potential duplicates - docs = termsEnum.docs(delDocs, docs); - int doc = docs.nextDoc(); - if (doc != docs.NO_MORE_DOCS) { - if (keepMode == KM_USE_FIRST_OCCURRENCE) { - doc = docs.nextDoc(); - } - } - - int lastDoc = -1; - while (true) { - lastDoc = doc; - bits.clear(lastDoc); + termsEnum.resetTermsEnum(reader, fieldName); + DocsEnum docs = null; + while (true) { + BytesRef currTerm = termsEnum.next(); + if (currTerm == null) { + break; + } else { + if (termsEnum.docFreq() > 1) { + // unset potential duplicates + docs = termsEnum.docs(delDocs, docs); + int doc = docs.nextDoc(); + if (doc != DocsEnum.NO_MORE_DOCS) { + if (keepMode == KM_USE_FIRST_OCCURRENCE) { doc = docs.nextDoc(); - if (doc == docs.NO_MORE_DOCS) { - break; - } } - - if (keepMode==KM_USE_LAST_OCCURRENCE) { - // restore the last bit - bits.set(lastDoc); + } + + int 
lastDoc = -1; + while (true) { + lastDoc = doc; + bits.clear(lastDoc); + doc = docs.nextDoc(); + if (doc == DocsEnum.NO_MORE_DOCS) { + break; } } + + if (keepMode == KM_USE_LAST_OCCURRENCE) { + // restore the last bit + bits.set(lastDoc); + } } } } - + return bits; } - - public String getFieldName() - { - return fieldName; - } - - - public void setFieldName(String fieldName) - { - this.fieldName = fieldName; - } - - - public int getKeepMode() - { - return keepMode; - } - - - public void setKeepMode(int keepMode) - { - this.keepMode = keepMode; - } - - - @Override - public boolean equals(Object obj) - { - if(this == obj) - return true; - if((obj == null) || (obj.getClass() != this.getClass())) - return false; - DuplicateFilter other = (DuplicateFilter)obj; - return keepMode == other.keepMode && - processingMode == other.processingMode && - (fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName))); - } - - - - @Override - public int hashCode() - { - int hash = 217; - hash = 31 * hash + keepMode; - hash = 31 * hash + processingMode; - hash = 31 * hash + fieldName.hashCode(); - return hash; - } - - - public int getProcessingMode() - { - return processingMode; - } - - - public void setProcessingMode(int processingMode) - { - this.processingMode = processingMode; - } - - - + + public String getFieldName() { + return fieldName; + } + + public void setFieldName(String fieldName) { + this.fieldName = fieldName; + } + + public int getKeepMode() { + return keepMode; + } + + public void setKeepMode(int keepMode) { + this.keepMode = keepMode; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if ((obj == null) || (obj.getClass() != this.getClass())) return false; + DuplicateFilter other = (DuplicateFilter) obj; + return keepMode == other.keepMode + && processingMode == other.processingMode + && (fieldName == other.fieldName || (fieldName != null && fieldName + .equals(other.fieldName))); + } + + @Override + 
public int hashCode() { + int hash = 217; + hash = 31 * hash + keepMode; + hash = 31 * hash + processingMode; + hash = 31 * hash + fieldName.hashCode(); + return hash; + } + + public int getProcessingMode() { + return processingMode; + } + + public void setProcessingMode(int processingMode) { + this.processingMode = processingMode; + } + + /** + * @return an instance of a {@link DedupingTermsEnum} that is tied to this + * filter's {@link #fieldName}. + */ + @Override + protected StatefulTermsEnum getTermsEnumModel() { + return new DedupingTermsEnum(getFieldName()); + } + + /** + * The DedupingTermsEnum is a concrete {@link StatefulTermsEnum} + * that accepts a term iff it has not been seen before, either in the current + * segment or ones that came before. + * + * @author Karthick Sankarachary + */ + public class DedupingTermsEnum extends StatefulTermsEnum { + /** + * Construct an instance that is tied to the given field + * + * @param field + * the name of a field + */ + public DedupingTermsEnum(String field) { + super(field); + } + + @Override + public DocsEnum docs(Bits bits, DocsEnum reuse) throws IOException { + switch (keepMode) { + case KM_USE_FIRST_OCCURRENCE: + return super.docs(bits, reuse); + case KM_USE_LAST_OCCURRENCE: + default: + return docFreq() > 0 ? super.docs(bits, reuse) : new EmptyDocsEnum(); + } + } + + @Override + public int docFreq() { + switch (keepMode) { + case KM_USE_FIRST_OCCURRENCE: + return super.docFreq(); + case KM_USE_LAST_OCCURRENCE: + default: { + return (getTotalLowLevelDocFreq() < getTopLevelDocFreq()) ? 0 : super + .docFreq(); + } + } + } + + /** + * Accept a term iff it is "unique" (meaning that it does not exist in this + * instance's memory). In the event it is unique, it is promptly added to + * memory. 
+ * + * @return an {@link AcceptStatus} of YES or NO depending on whether the + * term is unique + */ + @Override + protected AcceptStatus accept(BytesRef term) throws IOException { + switch (keepMode) { + case KM_USE_FIRST_OCCURRENCE: { + boolean uniqueTerm = !isTermMemorized(term); + if (uniqueTerm) { + memorizeTerm(term); + } + return uniqueTerm ? AcceptStatus.YES : AcceptStatus.NO; + } + case KM_USE_LAST_OCCURRENCE: + default: { + return docFreq() > 0 ? AcceptStatus.YES : AcceptStatus.NO; + } + } + } + } + }