Index: lucene/facet/src/java/org/apache/lucene/facet/search/CountingFacetsCollector.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/search/CountingFacetsCollector.java	(revision 1437012)
+++ lucene/facet/src/java/org/apache/lucene/facet/search/CountingFacetsCollector.java	(working copy)
@@ -6,28 +6,31 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map.Entry;
+import java.util.Map;
 
 import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
 import org.apache.lucene.facet.index.params.CategoryListParams;
 import org.apache.lucene.facet.index.params.FacetIndexingParams;
 import org.apache.lucene.facet.search.params.CountFacetRequest;
-import org.apache.lucene.facet.search.params.FacetRequest;
 import org.apache.lucene.facet.search.params.FacetRequest.SortBy;
 import org.apache.lucene.facet.search.params.FacetRequest.SortOrder;
+import org.apache.lucene.facet.search.params.FacetRequest;
 import org.apache.lucene.facet.search.params.FacetSearchParams;
 import org.apache.lucene.facet.search.results.FacetResult;
 import org.apache.lucene.facet.search.results.FacetResultNode;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 import org.apache.lucene.facet.taxonomy.directory.ParallelTaxonomyArrays;
 import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.DocValues.Source;
 import org.apache.lucene.index.DocValues;
-import org.apache.lucene.index.DocValues.Source;
+import org.apache.lucene.index.SegmentReader;
 import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.Scorer;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.encoding.DGapVInt8IntDecoder;
+import org.apache.lucene.util.packed.PackedInts;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -89,11 +92,106 @@
   private final int[] counts;
   private final String facetsField;
   private final boolean useDirectSource;
-  private final HashMap<Source,FixedBitSet> matchingDocs = new HashMap<Source,FixedBitSet>();
+  private final HashMap<CachedInts,FixedBitSet> matchingDocs = new HashMap<CachedInts,FixedBitSet>();
   private DocValues facetsValues;
   private FixedBitSet bits;
-
+  private static int totalBytes;
+
+  private static class CachedInts {
+    //final PackedInts.Reader docOffset;
+    //final PackedInts.Reader values;
+    final int[] docOffset;
+    final int[] values;
+
+    public CachedInts(Source source, int maxDoc) {
+      System.out.println("build cache maxDoc=" + maxDoc);
+      BytesRef buf = new BytesRef();
+
+      // First pass: just count how many ords we have:
+
+      // nocommit this could require long?:
+      int totOrds = 0;
+      int maxOrd = -1;
+
+      for(int docID=0;docID<maxDoc;docID++) {
+        source.getBytes(docID, buf);
+        if (buf.length > 0) {
+          // this document has facets
+          int upto = buf.offset + buf.length;
+          int ord = 0;
+          int offset = buf.offset;
+          int prev = 0;
+          while (offset < upto) {
+            byte b = buf.bytes[offset++];
+            if (b >= 0) {
+              prev = ord = ((ord << 7) | b) + prev;
+              maxOrd = Math.max(maxOrd, ord);
+              totOrds++;
+              ord = 0;
+            } else {
+              ord = (ord << 7) | (b & 0x7F);
+            }
+          }
+        }
+      }
+
+      // Second pass: encode to packed ints:
+      /*
+      PackedInts.Mutable docOffset = PackedInts.getMutable(maxDoc+1, PackedInts.bitsRequired(totOrds), PackedInts.DEFAULT);
+      PackedInts.Mutable values = PackedInts.getMutable(totOrds, PackedInts.bitsRequired(maxOrd), PackedInts.DEFAULT);
+      */
+      System.out.println(" docOffset = " + (4*(maxDoc+1)) + " bytes");
+      System.out.println(" values = " + (4*totOrds) + " bytes");
+      docOffset = new int[maxDoc+1];
+      values = new int[totOrds];
+      totalBytes += 4*(maxDoc+1+totOrds);
+      System.out.println(" total=" + (totalBytes/1024) + " KB");
+      totOrds = 0;
+      for(int docID=0;docID<maxDoc;docID++) {
+        source.getBytes(docID, buf);
+        //docOffset.set(docID, totOrds);
+        docOffset[docID] = totOrds;
+        if (buf.length > 0) {
+          // this document has facets
+          int upto = buf.offset + buf.length;
+          int ord = 0;
+          int offset = buf.offset;
+          int prev = 0;
+          while (offset < upto) {
+            byte b = buf.bytes[offset++];
+            if (b >= 0) {
+              prev = ord = ((ord << 7) | b) + prev;
+              //values.set(totOrds, ord);
+              values[totOrds] = ord;
+              totOrds++;
+              ord = 0;
+            } else {
+              ord = (ord << 7) | (b & 0x7F);
+            }
+          }
+        }
+      }
+      //docOffset.set(maxDoc, totOrds);
+      docOffset[maxDoc] = totOrds;
+
+      //this.docOffset = docOffset;
+      //this.values = values;
+    }
+  }
+
+  private static final Map<Source,CachedInts> sourceCache = new HashMap<Source,CachedInts>();
+
+  private static synchronized CachedInts getCachedInts(Source source, int maxDoc, SegmentReader r) {
+    CachedInts ci = sourceCache.get(source);
+    if (ci == null) {
+      System.out.println("r=" + r);
+      ci = new CachedInts(source, maxDoc);
+      sourceCache.put(source, ci);
+    }
+    return ci;
+  }
+
   public CountingFacetsCollector(FacetSearchParams fsp, TaxonomyReader taxoReader) {
     this(fsp, taxoReader, new FacetArrays(taxoReader.getSize()), false);
   }
@@ -162,7 +260,7 @@
     if (facetsValues != null) {
       Source facetSource = useDirectSource ? facetsValues.getDirectSource() : facetsValues.getSource();
       bits = new FixedBitSet(context.reader().maxDoc());
-      matchingDocs.put(facetSource, bits);
+      matchingDocs.put(getCachedInts(facetSource, context.reader().maxDoc(), (SegmentReader) context.reader()), bits);
     }
   }
 
@@ -176,31 +274,25 @@
   }
 
   private void countFacets() {
-    for (Entry<Source,FixedBitSet> entry : matchingDocs.entrySet()) {
-      Source facetsSource = entry.getKey();
+    for (Entry<CachedInts,FixedBitSet> entry : matchingDocs.entrySet()) {
+      //PackedInts.Reader docOffset = entry.getKey().docOffset;
+      //PackedInts.Reader values = entry.getKey().values;
+      int[] docOffset = entry.getKey().docOffset;
+      int[] values = entry.getKey().values;
       FixedBitSet bits = entry.getValue();
       int doc = 0;
       int length = bits.length();
       while (doc < length && (doc = bits.nextSetBit(doc)) != -1) {
-        facetsSource.getBytes(doc, buf);
-        if (buf.length > 0) {
-          // this document has facets
-          int upto = buf.offset + buf.length;
-          int ord = 0;
-          int offset = buf.offset;
-          int prev = 0;
-          while (offset < upto) {
-            byte b = buf.bytes[offset++];
-            if (b >= 0) {
-              prev = ord = ((ord << 7) | b) + prev;
-              counts[ord]++;
-              ord = 0;
-            } else {
-              ord = (ord << 7) | (b & 0x7F);
-            }
-          }
+        //int start = (int) docOffset.get(doc);
+        //int end = (int) docOffset.get(doc+1);
+        int start = docOffset[doc];
+        int end = docOffset[doc+1];
+        // nocommit use bulk read api:
+        for(int i=start;i<end;i++) {
+          counts[values[i]]++;
+        }