Index: src/test/org/apache/solr/search/TestDocSet.java
===================================================================
--- src/test/org/apache/solr/search/TestDocSet.java (revision 758160)
+++ src/test/org/apache/solr/search/TestDocSet.java (working copy)
@@ -88,10 +88,17 @@
     checkEqual(a_and, b1.intersection(b2));
     checkEqual(a_or, b1.union(b2));
     checkEqual(a_andn, b1.andNot(b2));
-    
+
     assertEquals(a_and.cardinality(), b1.intersectionSize(b2));
     assertEquals(a_or.cardinality(), b1.unionSize(b2));
     assertEquals(a_andn.cardinality(), b1.andNotSize(b2));
+
+    checkEqual(a1, NegatedDocSet.negation(NegatedDocSet.negation(b1)));
+    //checkEqual(a_and, b1.andNot(NegatedDocSet.negation(b2)));
+    //checkEqual(a_andn, b1.intersection(NegatedDocSet.negation(b2)));
+    checkEqual(a_andn, NegatedDocSet.negation(b2).intersection(b1));
+    checkEqual(a_or, NegatedDocSet.negation(NegatedDocSet.negation(b1).andNot(b2)));
+    //checkEqual(a_and, NegatedDocSet.negation(NegatedDocSet.negation(b1).union(NegatedDocSet.negation(b2))));
   }
 
 
Index: src/java/org/apache/solr/search/CollapseFilter.java
===================================================================
--- src/java/org/apache/solr/search/CollapseFilter.java (revision 0)
+++ src/java/org/apache/solr/search/CollapseFilter.java (revision 0)
@@ -0,0 +1,451 @@
+package org.apache.solr.search;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.solr.common.params.CollapseParams;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.params.CollapseParams.CollapseFacet;
+import org.apache.solr.common.params.CollapseParams.CollapseType;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.util.OpenBitSet;
+
+/**
+ * Computes the set of documents that survive field collapsing for a query.
+ * The constructor runs the query, walks the sorted hits and keeps at most
+ * the configured threshold of documents per collapse-field value (either
+ * over all hits or only over adjacent hits); {@link #getDocSet()} exposes
+ * the survivors as a filter and {@link #getCollapseInfo} reports how many
+ * documents were collapsed into each surviving document.
+ *
+ * @author Emmanuel Keller keller.emmanuel@gmail.com
+ * @version $Id:$
+ * @since solr 1.3
+ */
+public class CollapseFilter {
+
+  /** Bookkeeping for one collapse-field value seen by normalCollapse. */
+  private static final class CollapsedDocument {
+    /** Last document that was kept for this value. */
+    int documentId;
+    /** Number of documents seen so far with this value. */
+    int totalCount;
+
+    CollapsedDocument(int documentId, int totalCount) {
+      this.documentId = documentId;
+      this.totalCount = totalCount;
+    }
+  }
+
+  /** Parameters *************************************************** */
+
+  /**
+   * Field to use to collapse results. Parameter.
+   */
+  private String collapseField;
+
+  /**
+   * Type of collapsing to do -- collapse all hits or adjacent hits only.
+   * Parameter.
+   */
+  private CollapseType collapseType;
+
+  /**
+   * Facet before or after collapsing.
+   */
+  private CollapseFacet collapseFacet;
+
+  /**
+   * Number of documents with the same value for collapseField after which
+   * collapsing kicks in. Parameter.
+   */
+  private int collapseThreshold;
+
+  /**
+   * Maximum number of documents to process during field collapsing.
+   * Parameter.
+   */
+  private int collapseMaxDocs;
+
+  /**
+   * Return collapse count for each document? Parameter.
+   */
+  private boolean collapseInfoDoc;
+
+  /**
+   * Return collapse count for each field value? Parameter.
+   */
+  private boolean collapseInfoCount;
+
+  /** Collapse State *********************************************** */
+
+  /**
+   * Number of documents that have been collapsed into the key document,
+   * keyed by the key document's id.
+   */
+  private Map<Integer, Integer> collapseCounts;
+
+  /**
+   * Maximum size for a HashDocSet.
+   */
+  private int hashMaxSize;
+
+  /**
+   * Buffer for collecting documents. Gets turned into different types of
+   * DocSet depending on the number of documents we end up with.
+   */
+  private int[] docbuf;
+
+  /**
+   * Number of documents added so far (to docbuf or, later, to docbufBitSet).
+   */
+  private int docbufSize = 0;
+
+  /**
+   * Maximum document id currently in docbuf. Only valid while docbufSize <
+   * hashMaxSize.
+   */
+  private int docbufMaxDoc = 0;
+
+  /**
+   * Bitset representation of docbuf. Gets created when docbufSize >=
+   * hashMaxSize.
+   */
+  private OpenBitSet docbufBitSet;
+
+  /** Debug Information ******************************************** */
+
+  /** Time (ms) spent in normalCollapse or adjacentCollapse. */
+  private long timeCollapse = 0;
+
+  private long timeConvertToBitSet = 0;
+
+  private long timeCreateDocSet = 0;
+
+  private long timeCreateCollapseInfo = 0;
+
+  private String debugDocSetInfo = "unknown";
+
+  /**
+   * Creates a CollapseFilter based on the specified parameters. The query is
+   * executed immediately (up to collapseMaxDocs hits) and the hits are
+   * collapsed according to the configured collapse type.
+   *
+   * @param searcher searcher used to run the query and to read the field cache
+   * @param query main query
+   * @param filters filter queries, handed unchanged to searcher.getDocList
+   * @param sort sort order in which the hits are walked
+   * @param flags flags handed unchanged to searcher.getDocList
+   * @param params request parameters holding the collapse.* settings
+   * @throws IOException propagated from the searcher or the field cache
+   */
+  public CollapseFilter(SolrIndexSearcher searcher, Query query, List<Query> filters, Sort sort, int flags, SolrParams params) throws IOException {
+    parseParameters(searcher, params);
+
+    // Allocate data structures
+    hashMaxSize = searcher.getSchema().getSolrConfig().hashDocSetMaxSize;
+    docbuf = new int[hashMaxSize];
+    collapseCounts = new HashMap<Integer, Integer>();
+
+    DocList docs = searcher.getDocList(query, filters, sort, 0, collapseMaxDocs, flags);
+    String[] values = FieldCache.DEFAULT.getStrings(searcher.getReader(), collapseField);
+
+    if (collapseType == CollapseType.ADJACENT)
+      adjacentCollapse(docs, values);
+    else
+      normalCollapse(docs, values);
+
+  }
+
+  /**
+   * Collapses over the whole result list: for every collapse-field value only
+   * the first collapseThreshold documents are kept; each further document is
+   * counted against the last kept document with that value.
+   */
+  private void normalCollapse(DocList docs, String[] values) {
+    int docCount = 0; // how many documents we have processed
+    // keep track of how many docs with the same collapse value we have processed
+    // so far (HashMap accepts a null key, so docs with a null value share a record)
+    Map<String, CollapsedDocument> collapseDocs = new HashMap<String, CollapsedDocument>();
+
+    long startTime = System.currentTimeMillis();
+
+    for (DocIterator i = docs.iterator(); i.hasNext();) {
+      int currentId = i.nextDoc();
+      String currentValue = values[currentId];
+
+      // Get the last doc. and the total amount of docs. we have seen so
+      // far for this collapsing value
+      CollapsedDocument collapseDoc = collapseDocs.get(currentValue);
+      if (collapseDoc == null) {
+        // new collapsing value => create a new record for it
+        collapseDoc = new CollapsedDocument(currentId, 0);
+        collapseDocs.put(currentValue, collapseDoc);
+      }
+
+      collapseDoc.totalCount++;
+      if (collapseDoc.totalCount <= collapseThreshold) {
+        // we haven't reached the threshold => add the document to the
+        // filter and set the current doc as the last seen for this
+        // collapsing value.
+        addDoc(currentId);
+        collapseDoc.documentId = currentId;
+      } else {
+        // we've already reached the threshold, the document shouldn't
+        // be added. instead, we must update the collapsing count.
+        Integer currentCount = collapseCounts.get(collapseDoc.documentId);
+        Integer newCount = (currentCount == null) ? 1 : currentCount + 1;
+
+        collapseCounts.put(collapseDoc.documentId, newCount);
+      }
+
+      // Stop after collapseMaxDocs documents
+      if (++docCount >= collapseMaxDocs)
+        break;
+    }
+
+    timeCollapse = System.currentTimeMillis() - startTime;
+  }
+
+  /**
+   * Collapses runs of adjacent documents that share a collapse-field value.
+   * The first document of a run is the key document; it and the following
+   * documents below the threshold are kept, the rest of the run is counted
+   * against the key document. A null value is treated like any other value,
+   * so neighbouring documents without a value form a run of their own.
+   */
+  private void adjacentCollapse(DocList docs, String[] values) {
+    int docCount = 0; // how many documents we have processed
+    int repeatCount = 0; // how many times we have seen the same value in a row
+    int collapseCount = 0; // how many documents we have collapsed in this run
+    int collapseId = -1; // the document we're collapsing into, -1 = no run yet
+    String collapseValue = null;
+
+    long startTime = System.currentTimeMillis();
+
+    for (DocIterator i = docs.iterator(); i.hasNext();) {
+      int currentId = i.nextDoc();
+      String currentValue = values[currentId];
+
+      if (collapseId == -1) {
+        // First document: it opens the first run. The run state is keyed on
+        // collapseId rather than on collapseValue because the value itself
+        // may be null, which must not restart the run and drop its key doc.
+        collapseId = currentId;
+        collapseValue = currentValue;
+      } else if (sameValue(collapseValue, currentValue)) {
+        // Collapse the document if the field value is the same
+        // and we have a run of at least collapseThreshold docs.
+        if (++repeatCount >= collapseThreshold) {
+          collapseCount++;
+        } else {
+          addDoc(currentId);
+        }
+      } else {
+        // Value changed: emit the key document of the finished run together
+        // with its collapse count, then start a new run at this document.
+        addDoc(collapseId);
+
+        if (collapseCount > 0)
+          collapseCounts.put(collapseId, collapseCount);
+
+        repeatCount = 0;
+        collapseCount = 0;
+        collapseId = currentId;
+        collapseValue = currentValue;
+      }
+
+      // Stop after collapseMaxDocs documents
+      if (++docCount >= collapseMaxDocs)
+        break;
+    }
+
+    // Flush the run that was still open when the loop ended
+    if (collapseId != -1)
+      addDoc(collapseId);
+
+    if (collapseCount > 0)
+      collapseCounts.put(collapseId, collapseCount);
+
+    timeCollapse = System.currentTimeMillis() - startTime;
+  }
+
+  /**
+   * Null-safe equality for collapse-field values.
+   */
+  private static boolean sameValue(String a, String b) {
+    return (a == null) ? (b == null) : a.equals(b);
+  }
+
+  /**
+   * Adds a document to the internal document buffer.
+   */
+  private void addDoc(int doc) {
+    // If we have less than hashMaxSize documents, just
+    // keep adding them to docbuf. We will turn them into
+    // a HashDocSet later.
+
+    if (docbufSize < hashMaxSize) {
+      docbuf[docbufSize] = doc;
+      if (doc > docbufMaxDoc)
+        docbufMaxDoc = doc;
+    } else {
+      // We have exceeded hashMaxSize. Allocate a bit set
+      // if we don't have one yet, then add to that.
+      if (docbufBitSet == null) {
+        long startTime = System.currentTimeMillis();
+        docbufBitSet = new OpenBitSet(docbufMaxDoc + 1);
+        for (int i = 0; i < docbufSize; i++)
+          docbufBitSet.fastSet(docbuf[i]);
+        timeConvertToBitSet = System.currentTimeMillis() - startTime;
+      }
+      docbufBitSet.set(doc);
+    }
+    docbufSize++;
+  }
+
+  /**
+   * Creates a DocSet representation of the internal document buffer.
+   *
+   * @return a BitDocSet if the bit set has been allocated, otherwise a
+   *         HashDocSet over the buffered document ids
+   */
+  public DocSet getDocSet() {
+    long startTime = System.currentTimeMillis();
+    DocSet result = (docbufBitSet != null) ? new BitDocSet(docbufBitSet) : new HashDocSet(docbuf, 0, docbufSize);
+    timeCreateDocSet = System.currentTimeMillis() - startTime;
+    debugDocSetInfo = result.getClass().getSimpleName() + "(" + docbufSize + ")";
+    return result;
+  }
+
+  /**
+   * Returns timing information for inclusion in the result.
+   */
+  private String getDebugInfo() {
+    return debugDocSetInfo + " Time(ms): " + timeCollapse + "/" + timeCreateCollapseInfo + "/" + timeConvertToBitSet + "/" + timeCreateDocSet;
+  }
+
+  /**
+   * Returns collapse counts for all documents in the specified docList.
+   *
+   * @param searcher searcher providing the schema, the index reader and the field cache
+   * @param docs documents to report on
+   * @return a list holding "field", optionally "doc" (count per unique key)
+   *         and "count" (count per field value), and a "debug" entry
+   * @throws IOException propagated from reading stored documents or the field cache
+   */
+  public NamedList getCollapseInfo(SolrIndexSearcher searcher, DocList docs) throws IOException {
+    long startTime = System.currentTimeMillis();
+
+    NamedList result = new NamedList();
+    result.add("field", collapseField);
+
+    IndexSchema schema = searcher.getSchema();
+    FieldType collapseFieldType = schema.getField(collapseField).getType();
+    SchemaField uniqueKeyField = schema.getUniqueKeyField();
+    String uniqueKeyName = (uniqueKeyField != null) ? uniqueKeyField.getName() : null;
+
+    if (collapseInfoDoc || collapseInfoCount) {
+      NamedList resDoc = null;
+      NamedList resCount = null;
+      String[] values = null;
+      IndexReader reader = null;
+
+      if (collapseInfoDoc) {
+        resDoc = new NamedList();
+        result.add("doc", resDoc);
+        reader = searcher.getReader();
+      }
+
+      if (collapseInfoCount) {
+        resCount = new NamedList();
+        result.add("count", resCount);
+        values = FieldCache.DEFAULT.getStrings(searcher.getReader(), collapseField);
+      }
+
+      for (DocIterator i = docs.iterator(); i.hasNext();) {
+        int id = i.nextDoc();
+        Integer count = collapseCounts.get(id);
+        if (count != null) {
+          if (collapseInfoDoc && uniqueKeyName != null)
+            resDoc.add(reader.document(id).get(uniqueKeyName), count);
+          if (collapseInfoCount) {
+            String value = values[id];
+            // values[id] may be null; report it under a null name instead of passing null to the FieldType
+            resCount.add(value == null ? null : collapseFieldType.indexedToReadable(value), count);
+          }
+        }
+      }
+    }
+
+    timeCreateCollapseInfo = System.currentTimeMillis() - startTime;
+    result.add("debug", getDebugInfo());
+    return result;
+  }
+
+  /**
+   * Reads the collapse.* request parameters into the corresponding fields.
+   * The collapse field is required; the threshold falls back to the older
+   * COLLAPSE_MAX parameter and then to 1; a non-positive COLLAPSE_MAXDOCS
+   * is replaced by searcher.maxDoc().
+   */
+  private void parseParameters(SolrIndexSearcher searcher, SolrParams params) throws IOException {
+    collapseField = params.required().get(CollapseParams.COLLAPSE_FIELD);
+    String type = params.get(CollapseParams.COLLAPSE_TYPE);
+    collapseType = (type != null) ? CollapseType.get(type) : CollapseType.NORMAL;
+
+    String facet = params.get(CollapseParams.COLLAPSE_FACET);
+    collapseFacet = (facet != null) ? CollapseFacet.get(facet) : CollapseFacet.AFTER;
+
+    Integer ct = params.getInt(CollapseParams.COLLAPSE_THRESHOLD);
+    if (ct == null)
+      ct = params.getInt(CollapseParams.COLLAPSE_MAX);
+    collapseThreshold = (ct != null) ? ct.intValue() : 1;
+
+    collapseMaxDocs = params.getInt(CollapseParams.COLLAPSE_MAXDOCS, 0);
+    if (collapseMaxDocs <= 0)
+      collapseMaxDocs = searcher.maxDoc();
+
+    collapseInfoDoc = params.getBool(CollapseParams.COLLAPSE_INFO_DOC, true);
+    collapseInfoCount = params.getBool(CollapseParams.COLLAPSE_INFO_COUNT, true);
+  }
+
+  public CollapseFacet getCollapseFacet() {
+    return collapseFacet;
+  }
+
+  public String getCollapseField() {
+    return collapseField;
+  }
+
+  public boolean isCollapseInfoCount() {
+    return collapseInfoCount;
+  }
+
+  public boolean isCollapseInfoDoc() {
+    return collapseInfoDoc;
+  }
+
+  public int getCollapseMaxDocs() {
+    return collapseMaxDocs;
+  }
+
+  /** Returns the collapse threshold (the misspelled method name is kept so existing callers still compile). */
+  public int getCollapseTreshold() {
+    return collapseThreshold;
+  }
+
+  public CollapseType getCollapseType() {
+    return collapseType;
+  }
+}
Index: src/java/org/apache/solr/search/DocSet.java
===================================================================
--- src/java/org/apache/solr/search/DocSet.java (revision 758160)
+++ src/java/org/apache/solr/search/DocSet.java (working copy)
@@ -195,7 +195,7 @@
     if (other instanceof HashDocSet) {
       return other.intersection(this);
     }
-    
+
     // Default... handle with bitsets.
OpenBitSet newbits = (OpenBitSet)(this.getBits().clone()); newbits.and(other.getBits()); Index: src/java/org/apache/solr/search/NegatedDocSet.java =================================================================== --- src/java/org/apache/solr/search/NegatedDocSet.java (revision 0) +++ src/java/org/apache/solr/search/NegatedDocSet.java (revision 0) @@ -0,0 +1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.search; + +import org.apache.solr.common.SolrException; + + +/** + * DocSet that contains all documents that are not in the underlying source DocSet. + * Only usable as a filter DocSet as it is not a full implementation. + */ +public class NegatedDocSet extends DocSetBase { + private DocSet source; + + private NegatedDocSet(DocSet source) { + this.source = source; + } + + /** + * Returns the negation of the specified DocSet. This will be an instance of + * NegatedDocSet, unless the source is already a NegatedDocSet in which case + * it's underlying DocSet will be returned. + */ + public static DocSet negation(DocSet source) { + return (source instanceof NegatedDocSet) + ? 
((NegatedDocSet) source).source + : new NegatedDocSet(source); + } + + public boolean exists(int docid) { + return !source.exists(docid); + } + + public DocSet intersection(DocSet other) { + // this & other = !source & other = other & !source + return other.andNot(source); + } + + public int intersectionSize(DocSet other) { + return other.andNotSize(source); + } + + public DocSet andNot(DocSet other) { + // this & !other = !source & !other = !(source | other) + return new NegatedDocSet(source.union(other)); + } + + public int andNotSize(DocSet other) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unsupported Operation"); + } + + public DocSet union(DocSet other) { + // this | other = !source | other = !(source & !other) + return new NegatedDocSet(source.andNot(other)); + } + + public int unionSize(DocSet other) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unsupported Operation"); + } + + public DocIterator iterator() { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unsupported Operation"); + } + + public long memSize() { + return 0; + } + + public int size() { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unsupported Operation"); + } +} Index: src/java/org/apache/solr/search/SolrIndexSearcher.java =================================================================== --- src/java/org/apache/solr/search/SolrIndexSearcher.java (revision 758160) +++ src/java/org/apache/solr/search/SolrIndexSearcher.java (working copy) @@ -1357,6 +1357,43 @@ } /** + * Returns documents matching both query and the intersection + * of filterList, sorted by sort. + * Also returns the compete set of documents + * matching query and filter + * (regardless of offset and len). + *

+   * This method is cache aware and may retrieve filter from
+   * the cache or make an insertion into the cache as a result of this call.
+   *
+   * FUTURE: The returned DocList may be retrieved from a cache.
+   *
+   * The DocList and DocSet returned should not be modified.
+   *
+   * @param query main query
+   * @param filterList may be null
+   * @param docSet filter docSet
+   * @param lsort criteria by which to sort (if null, query relevance is used)
+   * @param offset offset into the list of documents to return
+   * @param len maximum number of documents to return
+   * @param flags user supplied flags for the result set
+   * @return DocListAndSet meeting the specified criteria, should not be modified by the caller.
+   * @throws IOException propagated from getDocListC
+   */
+  public DocListAndSet getDocListAndSet(Query query, List<Query> filterList, DocSet docSet, Sort lsort, int offset, int len, int flags) throws IOException {
+    // OR GET_DOCSET into a local instead of reassigning the parameter inside the
+    // call; presumably this flag is what makes getDocListC collect the DocSet.
+    int docSetFlags = flags | GET_DOCSET;
+    QueryCommand qc = new QueryCommand();
+    qc.setQuery(query).setFilterList(filterList).setFilter(docSet);
+    qc.setSort(lsort).setOffset(offset).setLen(len).setFlags(docSetFlags);
+
+    QueryResult result = new QueryResult();
+    getDocListC(result, qc);
+    return result.getDocListAndSet();
+  }
+
+  /**
    * Returns documents matching both query and filter
    * and sorted by sort.  Also returns the compete set of documents
    * matching query and filter (regardless of offset and len).
Index: src/java/org/apache/solr/common/params/CollapseParams.java
===================================================================
--- src/java/org/apache/solr/common/params/CollapseParams.java (revision 0)
+++ src/java/org/apache/solr/common/params/CollapseParams.java (revision 0)
@@ -0,0 +1,81 @@
+package org.apache.solr.common.params;
+
+import org.apache.solr.common.SolrException;
+
+public interface CollapseParams {
+
+  /**
+   * The field to collapse results on.
+   */
+  public static final String COLLAPSE_FIELD = "collapse.field";
+
+  /**
+   * Type of collapsing to perform: "normal" or "adjacent".
+   */
+  public static final String COLLAPSE_TYPE = "collapse.type";
+  /** Collapsing modes accepted as values of {@link #COLLAPSE_TYPE}. */
+  public enum CollapseType {
+    NORMAL, ADJACENT;
+
+    @Override
+    public String toString() {
+      return super.toString().toLowerCase(java.util.Locale.ENGLISH); // e.g. "adjacent"
+    }
+
+    public static CollapseType get(String label) {
+      try {
+        return valueOf(label.toUpperCase(java.util.Locale.ENGLISH)); // fixed locale, not the JVM default
+      } catch (IllegalArgumentException e) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, label + " is not a valid type of field collapsing", e);
+      }
+    }
+  }
+
+  /**
+   * Apply faceting before or after collapsing.
+   */
+  public static final String COLLAPSE_FACET = "collapse.facet";
+  /** Faceting modes accepted as values of {@link #COLLAPSE_FACET}. */
+  public enum CollapseFacet {
+    BEFORE, AFTER;
+
+    @Override
+    public String toString() {
+      return super.toString().toLowerCase(java.util.Locale.ENGLISH); // e.g. "after"
+    }
+
+    public static CollapseFacet get(String label) {
+      try {
+        return valueOf(label.toUpperCase(java.util.Locale.ENGLISH)); // fixed locale, not the JVM default
+      } catch (IllegalArgumentException e) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, label + " is not a valid faceting mode for field collapsing", e);
+      }
+    }
+  }
+
+  /**
+   * Number of documents with the same collapse.field value after which collapsing kicks in.
+   */
+  public static final String COLLAPSE_THRESHOLD = "collapse.threshold";
+
+  /**
+   * @deprecated Deprecated in favour of collapse.threshold.
+   */
+  @Deprecated
+  public static final String COLLAPSE_MAX = "collapse.max";
+
+  /**
+   * Maximum number of documents to process during field collapsing.
+   */
+  public static final String COLLAPSE_MAXDOCS = "collapse.maxdocs";
+
+  /**
+   * Return collapse count for each document? Defaults to true.
+   */
+  public static final String COLLAPSE_INFO_DOC = "collapse.info.doc";
+
+  /**
+   * Return collapse count for each field value? Defaults to true.
+   */
+  public static final String COLLAPSE_INFO_COUNT = "collapse.info.count";
+}
Index: src/java/org/apache/solr/handler/component/CollapseComponent.java
===================================================================
--- src/java/org/apache/solr/handler/component/CollapseComponent.java (revision 0)
+++ src/java/org/apache/solr/handler/component/CollapseComponent.java (revision 0)
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.component;
+
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.params.CollapseParams;
+import org.apache.solr.common.params.CollapseParams.CollapseFacet;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.request.SolrQueryResponse;
+import org.apache.solr.search.*;
+
+
+import java.io.IOException;
+import java.net.URL;
+
+/**
+ * TODO!
+ * Query component that collapses the main query's results when the collapse.field parameter is set.
+ * @version $Id$
+ * @since solr 1.3
+ */
+public class CollapseComponent extends QueryComponent {
+  /**
+   * Runs the query; with collapse.field set the hits are collapsed and "collapse_counts" is added.
+   */
+  @Override
+  public void process(ResponseBuilder builder) throws IOException {
+    SolrQueryRequest req = builder.req;
+    SolrQueryResponse rsp = builder.rsp;
+    SolrIndexSearcher searcher = req.getSearcher();
+    SolrParams params = req.getParams();
+
+    DocSet collapseFilterDocSet = null;
+    CollapseFilter collapseFilter = null;
+    boolean facetAfterCollapse = true;
+
+    if (params.get(CollapseParams.COLLAPSE_FIELD) != null) {
+      collapseFilter = new CollapseFilter(searcher, builder.getQuery(), builder.getFilters(),
+          builder.getSortSpec().getSort(), builder.getFieldFlags(), params);
+      collapseFilterDocSet = collapseFilter.getDocSet();
+      facetAfterCollapse = (collapseFilter.getCollapseFacet() == CollapseFacet.AFTER);
+    }
+    // When collapsing, the collapse DocSet (built above with the filter queries) replaces the filter list.
+    DocListAndSet results = searcher.getDocListAndSet(builder.getQuery(),
+        collapseFilterDocSet == null ? builder.getFilters() : null, collapseFilterDocSet,
+        builder.getSortSpec().getSort(), builder.getSortSpec().getOffset(),
+        builder.getSortSpec().getCount(), builder.getFieldFlags());
+
+    // For facet counts BEFORE collapsing the DocSet must not be restricted by
+    // the collapse filter, but it still has to honour the filter queries.
+    if (!facetAfterCollapse) {
+      DocSet uncollapsed = searcher.getDocSet(builder.getQuery());
+      java.util.List<org.apache.lucene.search.Query> filters = builder.getFilters();
+      if (filters != null && !filters.isEmpty()) {
+        uncollapsed = uncollapsed.intersection(searcher.getDocSet(filters));
+      }
+      results.docSet = uncollapsed;
+    }
+
+    builder.setResults(results);
+    if (null != collapseFilter) {
+      rsp.add("collapse_counts", collapseFilter.getCollapseInfo(searcher, results.docList));
+    }
+
+    rsp.add("response", results.docList);
+  }
+
+  /////////////////////////////////////////////
+  ///  SolrInfoMBean
+  ////////////////////////////////////////////
+  @Override
+  public String getDescription() {
+    return "Field Collapsing";
+  }
+
+  @Override
+  public String getVersion() {
+    return "";
+  }
+
+  @Override
+  public String getSourceId() {
+    return "";
+  }
+
+  @Override
+  public String getSource() {
+    return "";
+  }
+
+  @Override
+  public URL[] getDocs() {
+    return null;
+  }
+}