Index: lucene/CHANGES.txt
--- lucene/CHANGES.txt Wed Mar 30 18:44:50 2011 -0400
+++ lucene/CHANGES.txt Wed Mar 30 19:19:07 2011 -0400
@@ -337,6 +337,13 @@
* LUCENE-3001: Added TrieFieldHelper to write solr compatible numeric
fields without the solr dependency. (ryan)
+* LUCENE-3003: Added new expert class oal.index.DocTermOrds,
+ refactored from Solr's UnInvertedField, for accessing term ords for
+ multi-valued fields, per document. This is similar to FieldCache in
+ that it inverts the index to compute the ords, but differs in that
+ it's able to handle multi-valued fields and does not hold the term
+ bytes in RAM. (Mike McCandless)
+
Optimizations
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
Index: lucene/src/java/org/apache/lucene/index/DocTermOrds.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ lucene/src/java/org/apache/lucene/index/DocTermOrds.java Wed Mar 30 19:19:07 2011 -0400
@@ -0,0 +1,799 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import org.apache.lucene.util.PagedBytes;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Bits;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Comparator;
+
+/**
+ * This class enables fast access to multiple term ords for
+ * a specified field across all docIDs.
+ *
+ * Like FieldCache, it uninverts the index and holds a
+ * packed data structure in RAM to enable fast access.
+ * Unlike FieldCache, it can handle multi-valued fields,
+ * and, it does not hold the term bytes in RAM. Rather, you
+ * must obtain a TermsEnum from the {@link #getOrdTermsEnum}
+ * method, and then seek-by-ord to get the term's bytes.
+ *
+ * While term ords are normally of type long, in this API they
+ * are ints, since the internal representation here cannot address
+ * more than MAX_INT unique terms. Also, typically this
+ * class is used on fields with relatively few unique terms
+ * vs the number of documents. In addition, there is an
+ * internal limit (16 MB) on how many bytes each chunk of
+ * documents may consume. If you trip this limit you'll hit
+ * an IllegalStateException.
+ *
+ * Deleted documents are skipped during uninversion, and if
+ * you look them up you'll get 0 ords.
+ *
+ * The returned per-document ords do not retain their
+ * original order in the document. Instead they are returned
+ * in sorted (by ord, ie term's BytesRef comparator) order. They
+ * are also de-dup'd (ie if doc has same term more than once
+ * in this field, you'll only get that ord back once).
+ *
+ * This class tests whether the provided reader is able to
+ * retrieve terms by ord (ie, it's single segment, and it
+ * uses an ord-capable terms index). If not, this class
+ * will create its own term index internally, allowing it to
+ * create a wrapped TermsEnum that can handle ord. The
+ * {@link #getOrdTermsEnum} method then provides this
+ * wrapped enum, if necessary.
+ *
+ * The RAM consumption of this class can be high!
+ *
+ * @lucene.experimental
+ */
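+
+/*
+ * Usage sketch (hypothetical caller, not part of this patch; assumes "reader" is
+ * an already-open IndexReader, "category" is a multi-valued field, and "docID"
+ * is a document of interest):
+ *
+ *   DocTermOrds dto = new DocTermOrds(reader, "category");
+ *   TermsEnum te = dto.getOrdTermsEnum(reader);        // null if nothing was uninverted
+ *   DocTermOrds.TermOrdsIterator iter = dto.lookup(docID, null);
+ *   final int[] buffer = new int[5];
+ *   for (;;) {
+ *     final int count = iter.read(buffer);
+ *     for (int i = 0; i < count; i++) {
+ *       te.seek(buffer[i]);                             // seek by ord
+ *       BytesRef termBytes = te.term();                 // resolve ord -> term bytes
+ *     }
+ *     if (count < buffer.length) {
+ *       break;                                          // short read marks the end
+ *     }
+ *   }
+ */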
+
+/*
+ * Final form of the un-inverted field:
+ * Each document points to a list of term numbers that are contained in that document.
+ *
+ * Term numbers are in sorted order, and are encoded as variable-length deltas from the
+ * previous term number. Real term numbers start at 2 since 0 and 1 are reserved. A
+ * term number of 0 signals the end of the termNumber list.
+ *
+ * There is a single int[maxDoc()] which either contains a pointer into a byte[] for
+ * the termNumber lists, or directly contains the termNumber list if it fits in the 4
+ * bytes of an integer. If the first byte in the integer is 1, the next 3 bytes
+ * are a pointer into a byte[] where the termNumber list starts.
+ *
+ * There are actually 256 byte arrays, to compensate for the fact that the pointers
+ * into the byte arrays are only 3 bytes long. The correct byte array for a document
+ * is a function of its id.
+ *
+ * To save space and speed up faceting, any term that matches enough documents will
+ * not be un-inverted... it will be skipped while building the un-inverted field structure,
+ * and will use a set intersection method during faceting.
+ *
+ * To further save memory, the terms (the actual string values) are not all stored in
+ * memory, but a TermIndex is used to convert term numbers to term values only
+ * for the terms needed after faceting has completed. Only every 128th term value
+ * is stored, along with its corresponding term number, and this is used as an
+ * index to find the closest term and iterate until the desired number is hit (very
+ * much like Lucene's own internal term index).
+ *
+ */
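+
+/*
+ * Worked example (hypothetical numbers, not from this patch): if a document
+ * contains the terms whose term numbers are 3 and 10, the stored deltas are
+ * 3-0+TNUM_OFFSET=5 and 10-3+TNUM_OFFSET=9, followed by a terminating 0.
+ * All three values fit in single vInt bytes (0x05, 0x09, 0x00), so the list is
+ * packed directly into that document's int[] entry; a longer or larger-valued
+ * list would instead spill into one of the 256 shared byte arrays, with the int
+ * then holding the marker byte 1 in its low byte and a 3-byte offset above it.
+ */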
+
+public class DocTermOrds {
+
+ // Term ords are shifted by this, internally, to reserve
+ // values 0 (end term) and 1 (index is a pointer into byte array)
+ private final static int TNUM_OFFSET = 2;
+
+ // Default: every 128th term is indexed
+ public final static int DEFAULT_INDEX_INTERVAL_BITS = 7; // decrease to a low number like 2 for testing
+
+ private int indexIntervalBits;
+ private int indexIntervalMask;
+ private int indexInterval;
+
+ protected final int maxTermDocFreq;
+
+ protected final String field;
+
+ protected int numTermsInField;
+ protected long termInstances; // total number of references to term numbers
+ private long memsz;
+ protected int total_time; // total time to uninvert the field
+ protected int phase1_time; // time for phase1 of the uninvert process
+
+ protected int[] index;
+ protected byte[][] tnums = new byte[256][];
+ protected long sizeOfIndexedStrings;
+ protected BytesRef[] indexedTermsArray;
+ protected BytesRef prefix;
+ protected int ordBase;
+
+ public long ramUsedInBytes() {
+ // can cache the mem size since it shouldn't change
+ if (memsz!=0) return memsz;
+ long sz = 8*8 + 32; // local fields
+ if (index != null) sz += index.length * 4;
+ if (tnums!=null) {
+ for (byte[] arr : tnums)
+ if (arr != null) sz += arr.length;
+ }
+ memsz = sz;
+ return sz;
+ }
+
+ /** Inverts all terms */
+ public DocTermOrds(IndexReader reader, String field) throws IOException {
+ this(reader, field, null, Integer.MAX_VALUE);
+ }
+
+ /** Inverts only terms starting w/ prefix */
+ public DocTermOrds(IndexReader reader, String field, BytesRef termPrefix) throws IOException {
+ this(reader, field, termPrefix, Integer.MAX_VALUE);
+ }
+
+ /** Inverts only terms starting w/ prefix, and only terms
+ * whose docFreq (not taking deletions into account) is
+ * <= maxTermDocFreq */
+ public DocTermOrds(IndexReader reader, String field, BytesRef termPrefix, int maxTermDocFreq) throws IOException {
+ this(reader, field, termPrefix, maxTermDocFreq, DEFAULT_INDEX_INTERVAL_BITS);
+ }
+
+ /** Inverts only terms starting w/ prefix, and only terms
+ * whose docFreq (not taking deletions into account) is
+ * <= maxTermDocFreq, with a custom indexing interval
+ * (default is every 128th term). */
+ public DocTermOrds(IndexReader reader, String field, BytesRef termPrefix, int maxTermDocFreq, int indexIntervalBits) throws IOException {
+ this(field, maxTermDocFreq, indexIntervalBits);
+ uninvert(reader, termPrefix);
+ }
+
+ /** Subclass inits w/ this, but be sure you then call
+ * uninvert, only once */
+ protected DocTermOrds(String field, int maxTermDocFreq, int indexIntervalBits) throws IOException {
+ //System.out.println("DTO init field=" + field + " maxTDFreq=" + maxTermDocFreq);
+ this.field = field;
+ this.maxTermDocFreq = maxTermDocFreq;
+ this.indexIntervalBits = indexIntervalBits;
+ indexIntervalMask = 0xffffffff >>> (32-indexIntervalBits);
+ indexInterval = 1 << indexIntervalBits;
+ }
+
+ /** Returns a TermsEnum that implements ord. If the
+ * provided reader supports ord, we just return its
+ * TermsEnum; if it does not, we build a "private" terms
+ * index internally (WARNING: consumes RAM) and use that
+ * index to implement ord. This also enables ord on top
+ * of a composite reader. The returned TermsEnum is
+ * unpositioned. This returns null if there are no terms.
+ *
+ *
+ * NOTE: you must pass the same reader that was
+ * used when creating this class */
+ public TermsEnum getOrdTermsEnum(IndexReader reader) throws IOException {
+ if (termInstances == 0) {
+ return null;
+ }
+ if (indexedTermsArray == null) {
+ //System.out.println("GET normal enum");
+ final Terms terms = MultiFields.getTerms(reader, field);
+ if (terms != null) {
+ return terms.iterator();
+ } else {
+ return null;
+ }
+ } else {
+ //System.out.println("GET wrapped enum ordBase=" + ordBase);
+ return new OrdWrappedTermsEnum(reader);
+ }
+ }
+
+ /** Subclass can override this */
+ protected void visitTerm(TermsEnum te, int termNum) throws IOException {
+ }
+
+ protected void setActualDocFreq(int termNum, int df) throws IOException {
+ }
+
+ // Call this only once (if you subclass!)
+ protected void uninvert(final IndexReader reader, final BytesRef termPrefix) throws IOException {
+ //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
+ final long startTime = System.currentTimeMillis();
+ prefix = termPrefix == null ? null : new BytesRef(termPrefix);
+
+ final int maxDoc = reader.maxDoc();
+ final int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
+ final int[] lastTerm = new int[maxDoc]; // last term we saw for this document
+ final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)
+
+ final Terms terms = MultiFields.getTerms(reader, field);
+ if (terms == null) {
+ // No terms
+ return;
+ }
+
+ final TermsEnum te = terms.iterator();
+ final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
+ //System.out.println("seekStart=" + seekStart.utf8ToString());
+ if (te.seek(seekStart) == TermsEnum.SeekStatus.END) {
+ // No terms match
+ return;
+ }
+
+ // If we need our "term index wrapper", these will be
+ // init'd below:
+ List<BytesRef> indexedTerms = null;
+ PagedBytes indexedTermsBytes = null;
+
+ boolean testedOrd = false;
+
+ final Bits delDocs = MultiFields.getDeletedDocs(reader);
+
+ // we need a minimum of 9 bytes, but round up to 12 since the space would
+ // be wasted with most allocators anyway.
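+ // (worst case: up to 4 bytes already inlined in the int plus up to 5 bytes
+ // for one more vInt delta, hence 9)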
+ byte[] tempArr = new byte[12];
+
+ //
+ // enumerate all terms, and build an intermediate form of the un-inverted field.
+ //
+ // During this intermediate form, every document has a (potential) byte[]
+ // and the int[maxDoc()] array either contains the termNumber list directly
+ // or the *end* offset of the termNumber list in its byte array (for faster
+ // appending and faster creation of the final form).
+ //
+ // idea... if things are too large while building, we could do a range of docs
+ // at a time (but it would be a fair amount slower to build)
+ // could also do ranges in parallel to take advantage of multiple CPUs
+
+ // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
+ // values. This requires going over the field first to find the most
+ // frequent terms ahead of time.
+
+ int termNum = 0;
+ DocsEnum docsEnum = null;
+
+ // Loop begins with te positioned to first term (we call
+ // seek above):
+ for (;;) {
+ final BytesRef t = te.term();
+ if (t == null || (termPrefix != null && !t.startsWith(termPrefix))) {
+ break;
+ }
+ //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);
+
+ if (!testedOrd) {
+ try {
+ ordBase = (int) te.ord();
+ //System.out.println("got ordBase=" + ordBase);
+ } catch (UnsupportedOperationException uoe) {
+ // Reader cannot provide ord support, so we wrap
+ // our own support by creating our own terms index:
+ indexedTerms = new ArrayList<BytesRef>();
+ indexedTermsBytes = new PagedBytes(15);
+ //System.out.println("NO ORDS");
+ }
+ testedOrd = true;
+ }
+
+ visitTerm(te, termNum);
+
+ if (indexedTerms != null && (termNum & indexIntervalMask) == 0) {
+ // Index this term
+ sizeOfIndexedStrings += t.length;
+ BytesRef indexedTerm = new BytesRef();
+ indexedTermsBytes.copy(t, indexedTerm);
+ // TODO: really should 1) strip off useless suffix,
+ // and 2) use FST not array/PagedBytes
+ indexedTerms.add(indexedTerm);
+ }
+
+ final int df = te.docFreq();
+ if (df <= maxTermDocFreq) {
+
+ docsEnum = te.docs(delDocs, docsEnum);
+
+ final DocsEnum.BulkReadResult bulkResult = docsEnum.getBulkResult();
+
+ // dF, but takes deletions into account
+ int actualDF = 0;
+
+ for (;;) {
+ int chunk = docsEnum.read();
+ if (chunk <= 0) {
+ break;
+ }
+ //System.out.println(" chunk=" + chunk + " docs");
+
+ actualDF += chunk;
+
+ for (int i=0; i>>=8;
+ }
+ // point at the end index in the byte[]
+ index[doc] = (endPos<<8) | 1;
+ bytes[doc] = tempArr;
+ tempArr = new byte[12];
+ }
+ }
+ }
+ }
+ setActualDocFreq(termNum, actualDF);
+ }
+
+ termNum++;
+ if (te.next() == null) {
+ break;
+ }
+ }
+
+ numTermsInField = termNum;
+
+ long midPoint = System.currentTimeMillis();
+
+ if (termInstances == 0) {
+ // we didn't invert anything
+ // lower memory consumption.
+ tnums = null;
+ } else {
+
+ this.index = index;
+
+ //
+ // transform intermediate form into the final form, building a single byte[]
+ // at a time, and releasing the intermediate byte[]s as we go to avoid
+ // increasing the memory footprint.
+ //
+
+ for (int pass = 0; pass<256; pass++) {
+ byte[] target = tnums[pass];
+ int pos=0; // end in target;
+ if (target != null) {
+ pos = target.length;
+ } else {
+ target = new byte[4096];
+ }
+
+ // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
+ // where pp is the pass (which array we are building), and xx is all values.
+ // each pass shares the same byte[] for termNumber lists.
+ for (int docbase = pass<<16; docbase maxDoc)
+ break;
+ }
+
+ if (indexedTerms != null) {
+ indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);
+ }
+ }
+
+ long endTime = System.currentTimeMillis();
+
+ total_time = (int)(endTime-startTime);
+ phase1_time = (int)(midPoint-startTime);
+ }
+
+ /** Number of bytes to represent an unsigned int as a vint. */
+ private static int vIntSize(int x) {
+ if ((x & (0xffffffff << (7*1))) == 0 ) {
+ return 1;
+ }
+ if ((x & (0xffffffff << (7*2))) == 0 ) {
+ return 2;
+ }
+ if ((x & (0xffffffff << (7*3))) == 0 ) {
+ return 3;
+ }
+ if ((x & (0xffffffff << (7*4))) == 0 ) {
+ return 4;
+ }
+ return 5;
+ }
+
+ // todo: if we know the size of the vInt already, we could do
+ // a single switch on the size
+ private static int writeInt(int x, byte[] arr, int pos) {
+ int a;
+ a = (x >>> (7*4));
+ if (a != 0) {
+ arr[pos++] = (byte)(a | 0x80);
+ }
+ a = (x >>> (7*3));
+ if (a != 0) {
+ arr[pos++] = (byte)(a | 0x80);
+ }
+ a = (x >>> (7*2));
+ if (a != 0) {
+ arr[pos++] = (byte)(a | 0x80);
+ }
+ a = (x >>> (7*1));
+ if (a != 0) {
+ arr[pos++] = (byte)(a | 0x80);
+ }
+ arr[pos++] = (byte)(x & 0x7f);
+ return pos;
+ }
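+
+ // Worked example (hypothetical values, not part of this patch): writeInt(5, arr, 0)
+ // emits the single byte 0x05 and returns 1, while writeInt(200, arr, 0) emits
+ // 0x81 0x48 (most-significant 7-bit group first, high bit set on every byte but
+ // the last) and returns 2. TermOrdsIterator.read decodes this same
+ // most-significant-first order; note it differs from Lucene's standard vInt,
+ // which stores the least-significant group first.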
+
+ public class TermOrdsIterator {
+ private int tnum;
+ private int upto;
+ private byte[] arr;
+
+ /** Buffer must be at least 5 ints long. Returns number
+ * of term ords placed into buffer; if this count is
+ * less than buffer.length then that is the end. */
+ public int read(int[] buffer) {
+ int bufferUpto = 0;
+ if (arr == null) {
+ // code is inlined into upto
+ //System.out.println("inlined");
+ int code = upto;
+ int delta = 0;
+ for (;;) {
+ delta = (delta << 7) | (code & 0x7f);
+ if ((code & 0x80)==0) {
+ if (delta==0) break;
+ tnum += delta - TNUM_OFFSET;
+ buffer[bufferUpto++] = ordBase+tnum;
+ //System.out.println(" tnum=" + tnum);
+ delta = 0;
+ }
+ code >>>= 8;
+ }
+ } else {
+ // code is a pointer
+ for(;;) {
+ int delta = 0;
+ for(;;) {
+ byte b = arr[upto++];
+ delta = (delta << 7) | (b & 0x7f);
+ //System.out.println(" cycle: upto=" + upto + " delta=" + delta + " b=" + b);
+ if ((b & 0x80) == 0) break;
+ }
+ //System.out.println(" delta=" + delta);
+ if (delta == 0) break;
+ tnum += delta - TNUM_OFFSET;
+ //System.out.println(" tnum=" + tnum);
+ buffer[bufferUpto++] = ordBase+tnum;
+ if (bufferUpto == buffer.length) {
+ break;
+ }
+ }
+ }
+
+ return bufferUpto;
+ }
+
+ public TermOrdsIterator reset(int docID) {
+ //System.out.println(" reset docID=" + docID);
+ tnum = 0;
+ final int code = index[docID];
+ if ((code & 0xff)==1) {
+ // a pointer
+ upto = code>>>8;
+ //System.out.println(" pointer! upto=" + upto);
+ int whichArray = (docID >>> 16) & 0xff;
+ arr = tnums[whichArray];
+ } else {
+ //System.out.println(" inline!");
+ arr = null;
+ upto = code;
+ }
+ return this;
+ }
+ }
+
+ /** Returns an iterator to step through the term ords for
+ * this document. It's also possible to subclass this
+ * class and directly access members. */
+ public TermOrdsIterator lookup(int doc, TermOrdsIterator reuse) {
+ final TermOrdsIterator ret;
+ if (reuse != null) {
+ ret = reuse;
+ } else {
+ ret = new TermOrdsIterator();
+ }
+ return ret.reset(doc);
+ }
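+
+ // Reuse sketch (hypothetical caller, not part of this patch): when visiting many
+ // documents, pass the previous iterator back in to avoid a per-document allocation:
+ //
+ //   TermOrdsIterator iter = null;
+ //   final int[] buffer = new int[5];
+ //   for (int doc = 0; doc < maxDoc; doc++) {
+ //     iter = dto.lookup(doc, iter);
+ //     int count = iter.read(buffer);
+ //     // ... consume buffer[0..count), reading again while count == buffer.length ...
+ //   }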
+
+ /* Only used if original IndexReader doesn't implement
+ * ord; in this case we "wrap" our own terms index
+ * around it. */
+ private final class OrdWrappedTermsEnum extends TermsEnum {
+ private final IndexReader reader;
+ private final TermsEnum termsEnum;
+ private BytesRef term;
+ private long ord = -indexInterval-1; // force "real" seek
+
+ public OrdWrappedTermsEnum(IndexReader reader) throws IOException {
+ this.reader = reader;
+ assert indexedTermsArray != null;
+ termsEnum = MultiFields.getTerms(reader, field).iterator();
+ }
+
+ @Override
+ public Comparator<BytesRef> getComparator() throws IOException {
+ return termsEnum.getComparator();
+ }
+
+ @Override
+ public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
+ return termsEnum.docs(skipDocs, reuse);
+ }
+
+ @Override
+ public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+ return termsEnum.docsAndPositions(skipDocs, reuse);
+ }
+
+ @Override
+ public BytesRef term() {
+ return term;
+ }
+
+ @Override
+ public BytesRef next() throws IOException {
+ ord++;
+ if (termsEnum.next() == null) {
+ term = null;
+ return null;
+ }
+ return setTerm(); // this is extra work if we know we are in bounds...
+ }
+
+ @Override
+ public int docFreq() throws IOException {
+ return termsEnum.docFreq();
+ }
+
+ @Override
+ public long totalTermFreq() throws IOException {
+ return termsEnum.totalTermFreq();
+ }
+
+ @Override
+ public long ord() throws IOException {
+ return ordBase + ord;
+ }
+
+ @Override
+ public SeekStatus seek(BytesRef target, boolean useCache) throws IOException {
+
+ // already here
+ if (term != null && term.equals(target)) {
+ return SeekStatus.FOUND;
+ }
+
+ int startIdx = Arrays.binarySearch(indexedTermsArray, target);
+
+ if (startIdx >= 0) {
+ // we hit the term exactly... lucky us!
+ TermsEnum.SeekStatus seekStatus = termsEnum.seek(target);
+ assert seekStatus == TermsEnum.SeekStatus.FOUND;
+ ord = startIdx << indexIntervalBits;
+ setTerm();
+ assert term != null;
+ return SeekStatus.FOUND;
+ }
+
+ // we didn't hit the term exactly
+ startIdx = -startIdx-1;
+
+ if (startIdx == 0) {
+ // our target occurs *before* the first term
+ TermsEnum.SeekStatus seekStatus = termsEnum.seek(target);
+ assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
+ ord = 0;
+ setTerm();
+ assert term != null;
+ return SeekStatus.NOT_FOUND;
+ }
+
+ // back up to the start of the block
+ startIdx--;
+
+ if ((ord >> indexIntervalBits) == startIdx && term != null && term.compareTo(target) <= 0) {
+ // we are already in the right block and the current term is before the term we want,
+ // so we don't need to seek.
+ } else {
+ // seek to the right block
+ TermsEnum.SeekStatus seekStatus = termsEnum.seek(indexedTermsArray[startIdx]);
+ assert seekStatus == TermsEnum.SeekStatus.FOUND;
+ ord = startIdx << indexIntervalBits;
+ setTerm();
+ assert term != null; // should be non-null since it's in the index
+ }
+
+ while (term != null && term.compareTo(target) < 0) {
+ next();
+ }
+
+ if (term == null) {
+ return SeekStatus.END;
+ } else if (term.compareTo(target) == 0) {
+ return SeekStatus.FOUND;
+ } else {
+ return SeekStatus.NOT_FOUND;
+ }
+ }
+
+ @Override
+ public SeekStatus seek(long targetOrd) throws IOException {
+ int delta = (int) (targetOrd - ordBase - ord);
+ //System.out.println(" seek(ord) targetOrd=" + targetOrd + " delta=" + delta + " ord=" + ord);
+ if (delta < 0 || delta > indexInterval) {
+ final int idx = (int) (targetOrd >>> indexIntervalBits);
+ final BytesRef base = indexedTermsArray[idx];
+ //System.out.println(" do seek term=" + base.utf8ToString());
+ ord = idx << indexIntervalBits;
+ delta = (int) (targetOrd - ord);
+ final TermsEnum.SeekStatus seekStatus = termsEnum.seek(base, true);
+ assert seekStatus == TermsEnum.SeekStatus.FOUND;
+ } else {
+ //System.out.println("seek w/in block");
+ }
+
+ while (--delta >= 0) {
+ BytesRef br = termsEnum.next();
+ if (br == null) {
+ term = null;
+ return null;
+ }
+ ord++;
+ }
+
+ setTerm();
+ return term == null ? SeekStatus.END : SeekStatus.FOUND;
+ //System.out.println(" return term=" + term.utf8ToString());
+ }
+
+ private BytesRef setTerm() throws IOException {
+ term = termsEnum.term();
+ //System.out.println(" setTerm() term=" + term.utf8ToString() + " vs prefix=" + (prefix == null ? "null" : prefix.utf8ToString()));
+ if (prefix != null && !term.startsWith(prefix)) {
+ term = null;
+ }
+ return term;
+ }
+ }
+
+ public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException {
+ TermsEnum.SeekStatus status = termsEnum.seek(ord);
+ assert status == TermsEnum.SeekStatus.FOUND;
+ return termsEnum.term();
+ }
+}
Index: lucene/src/java/org/apache/lucene/index/IndexReader.java
--- lucene/src/java/org/apache/lucene/index/IndexReader.java Wed Mar 30 18:44:50 2011 -0400
+++ lucene/src/java/org/apache/lucene/index/IndexReader.java Wed Mar 30 19:19:07 2011 -0400
@@ -919,6 +919,22 @@
}
}
+ /**
+ * Returns true if an index exists at the specified directory.
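+ * <p>Usage sketch (hypothetical caller; assumes "dir" is an existing Directory
+ * and "cp" is the CodecProvider the index was written with):
+ * <pre>
+ * if (IndexReader.indexExists(dir, cp)) {
+ *   // safe to open a reader against dir using cp
+ * }
+ * </pre>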
+ * @param directory the directory to check for an index
+ * @param codecProvider provides a CodecProvider in case the index uses non-core codecs
+ * @return true if an index exists; false otherwise
+ * @throws IOException if there is a problem with accessing the index
+ */
+ public static boolean indexExists(Directory directory, CodecProvider codecProvider) throws IOException {
+ try {
+ new SegmentInfos().read(directory, codecProvider);
+ return true;
+ } catch (IOException ioe) {
+ return false;
+ }
+ }
+
/** Returns the number of documents in this index. */
public abstract int numDocs();
Index: lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java
--- lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java Wed Mar 30 18:44:50 2011 -0400
+++ lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java Wed Mar 30 19:19:07 2011 -0400
@@ -181,7 +181,7 @@
System.out.println("RIW.getReader: open new reader");
}
w.commit();
- return IndexReader.open(w.getDirectory(), new KeepOnlyLastCommitDeletionPolicy(), r.nextBoolean(), _TestUtil.nextInt(r, 1, 10));
+ return IndexReader.open(w.getDirectory(), new KeepOnlyLastCommitDeletionPolicy(), r.nextBoolean(), _TestUtil.nextInt(r, 1, 10), w.getConfig().getCodecProvider());
}
}
Index: lucene/src/test-framework/org/apache/lucene/store/MockDirectoryWrapper.java
--- lucene/src/test-framework/org/apache/lucene/store/MockDirectoryWrapper.java Wed Mar 30 18:44:50 2011 -0400
+++ lucene/src/test-framework/org/apache/lucene/store/MockDirectoryWrapper.java Wed Mar 30 19:19:07 2011 -0400
@@ -32,6 +32,7 @@
import java.util.Set;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
@@ -419,12 +420,27 @@
throw new RuntimeException("MockDirectoryWrapper: cannot close: there are still open files: " + openFiles, cause);
}
open = false;
- if (checkIndexOnClose && IndexReader.indexExists(this)) {
- _TestUtil.checkIndex(this);
+ if (checkIndexOnClose) {
+ if (codecProvider != null) {
+ if (IndexReader.indexExists(this, codecProvider)) {
+ _TestUtil.checkIndex(this, codecProvider);
+ }
+ } else {
+ if (IndexReader.indexExists(this)) {
+ _TestUtil.checkIndex(this);
+ }
+ }
}
delegate.close();
}
+ private CodecProvider codecProvider;
+
+ // We pass this CodecProvider to checkIndex when dir is closed...
+ public void setCodecProvider(CodecProvider cp) {
+ codecProvider = cp;
+ }
+
boolean open = true;
public synchronized boolean isOpen() {
Index: lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java
--- lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java Wed Mar 30 18:44:50 2011 -0400
+++ lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java Wed Mar 30 19:19:07 2011 -0400
@@ -157,6 +157,19 @@
return start + r.nextInt(end-start+1);
}
+ public static String randomSimpleString(Random r) {
+ final int end = r.nextInt(10);
+ if (end == 0) {
+ // allow 0 length
+ return "";
+ }
+ final char[] buffer = new char[end];
+ for (int i = 0; i < end; i++) {
+ buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
+ }
+ return new String(buffer, 0, end);
+ }
+
/** Returns random string, including full unicode range. */
public static String randomUnicodeString(Random r) {
return randomUnicodeString(r, 20);
Index: lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java Wed Mar 30 19:19:07 2011 -0400
@@ -0,0 +1,517 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericField;
+import org.apache.lucene.index.DocTermOrds.TermOrdsIterator;
+import org.apache.lucene.index.codecs.BlockTermsReader;
+import org.apache.lucene.index.codecs.BlockTermsWriter;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.codecs.CoreCodecProvider;
+import org.apache.lucene.index.codecs.FieldsConsumer;
+import org.apache.lucene.index.codecs.FieldsProducer;
+import org.apache.lucene.index.codecs.FixedGapTermsIndexReader;
+import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter;
+import org.apache.lucene.index.codecs.PostingsReaderBase;
+import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermsIndexReaderBase;
+import org.apache.lucene.index.codecs.TermsIndexWriterBase;
+import org.apache.lucene.index.codecs.standard.StandardPostingsReader;
+import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MockDirectoryWrapper;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+// TODO:
+// - test w/ del docs
+// - test prefix
+// - test w/ cutoff
+// - crank docs way up so we get some merging sometimes
+
+public class TestDocTermOrds extends LuceneTestCase {
+
+ public void testSimple() throws Exception {
+ Directory dir = newDirectory();
+ final RandomIndexWriter w = new RandomIndexWriter(random, dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(newInOrderLogMergePolicy()));
+ Document doc = new Document();
+ Field field = newField("field", "", Field.Index.ANALYZED);
+ doc.add(field);
+ field.setValue("a b c");
+ w.addDocument(doc);
+
+ field.setValue("d e f");
+ w.addDocument(doc);
+
+ field.setValue("a f");
+ w.addDocument(doc);
+
+ final IndexReader r = w.getReader();
+ w.close();
+
+ final DocTermOrds dto = new DocTermOrds(r, "field");
+
+ TermOrdsIterator iter = dto.lookup(0, null);
+ final int[] buffer = new int[5];
+ assertEquals(3, iter.read(buffer));
+ assertEquals(0, buffer[0]);
+ assertEquals(1, buffer[1]);
+ assertEquals(2, buffer[2]);
+
+ iter = dto.lookup(1, iter);
+ assertEquals(3, iter.read(buffer));
+ assertEquals(3, buffer[0]);
+ assertEquals(4, buffer[1]);
+ assertEquals(5, buffer[2]);
+
+ iter = dto.lookup(2, iter);
+ assertEquals(2, iter.read(buffer));
+ assertEquals(0, buffer[0]);
+ assertEquals(5, buffer[1]);
+
+ r.close();
+ dir.close();
+ }
+
+ private static class StandardCodecWithOrds extends Codec {
+ public StandardCodecWithOrds() {
+ name = "StandardOrds";
+ }
+
+ @Override
+ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+ PostingsWriterBase docs = new StandardPostingsWriter(state);
+
+ // TODO: should we make the terms index more easily
+ // pluggable? Ie so that this codec would record which
+ // index impl was used, and switch on loading?
+ // Or... you must make a new Codec for this?
+ TermsIndexWriterBase indexWriter;
+ boolean success = false;
+ try {
+ indexWriter = new FixedGapTermsIndexWriter(state);
+ success = true;
+ } finally {
+ if (!success) {
+ docs.close();
+ }
+ }
+
+ success = false;
+ try {
+ FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs);
+ success = true;
+ return ret;
+ } finally {
+ if (!success) {
+ try {
+ docs.close();
+ } finally {
+ indexWriter.close();
+ }
+ }
+ }
+ }
+
+ public final static int TERMS_CACHE_SIZE = 1024;
+
+ @Override
+ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+ PostingsReaderBase postings = new StandardPostingsReader(state.dir, state.segmentInfo, state.readBufferSize, state.codecId);
+ TermsIndexReaderBase indexReader;
+
+ boolean success = false;
+ try {
+ indexReader = new FixedGapTermsIndexReader(state.dir,
+ state.fieldInfos,
+ state.segmentInfo.name,
+ state.termsIndexDivisor,
+ BytesRef.getUTF8SortedAsUnicodeComparator(),
+ state.codecId);
+ success = true;
+ } finally {
+ if (!success) {
+ postings.close();
+ }
+ }
+
+ success = false;
+ try {
+ FieldsProducer ret = new BlockTermsReader(indexReader,
+ state.dir,
+ state.fieldInfos,
+ state.segmentInfo.name,
+ postings,
+ state.readBufferSize,
+ TERMS_CACHE_SIZE,
+ state.codecId);
+ success = true;
+ return ret;
+ } finally {
+ if (!success) {
+ try {
+ postings.close();
+ } finally {
+ indexReader.close();
+ }
+ }
+ }
+ }
+
+ /** Extension of freq postings file */
+ static final String FREQ_EXTENSION = "frq";
+
+ /** Extension of prox postings file */
+ static final String PROX_EXTENSION = "prx";
+
+ @Override
+ public void files(Directory dir, SegmentInfo segmentInfo, String id, Set<String> files) throws IOException {
+ StandardPostingsReader.files(dir, segmentInfo, id, files);
+ BlockTermsReader.files(dir, segmentInfo, id, files);
+ FixedGapTermsIndexReader.files(dir, segmentInfo, id, files);
+ }
+
+ @Override
+ public void getExtensions(Set<String> extensions) {
+ getStandardExtensions(extensions);
+ }
+
+ public static void getStandardExtensions(Set<String> extensions) {
+ extensions.add(FREQ_EXTENSION);
+ extensions.add(PROX_EXTENSION);
+ BlockTermsReader.getExtensions(extensions);
+ FixedGapTermsIndexReader.getIndexExtensions(extensions);
+ }
+ }
+
+ public void testRandom() throws Exception {
+ MockDirectoryWrapper dir = newDirectory();
+
+ final int NUM_TERMS = 100 * RANDOM_MULTIPLIER;
+ final Set<BytesRef> terms = new HashSet<BytesRef>();
+ while(terms.size() < NUM_TERMS) {
+ final String s = _TestUtil.randomRealisticUnicodeString(random);
+ //final String s = _TestUtil.randomSimpleString(random);
+ if (s.length() > 0) {
+ terms.add(new BytesRef(s));
+ }
+ }
+ final BytesRef[] termsArray = terms.toArray(new BytesRef[terms.size()]);
+ Arrays.sort(termsArray);
+
+ final int NUM_DOCS = 1000 * RANDOM_MULTIPLIER;
+
+ IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer());
+
+ // Sometimes swap in codec that impls ord():
+ if (random.nextInt(10) == 7) {
+ // Make sure terms index has ords:
+ CoreCodecProvider cp = new CoreCodecProvider();
+ cp.register(new StandardCodecWithOrds());
+ cp.setDefaultFieldCodec("StandardOrds");
+
+ // So checkIndex on close works
+ dir.setCodecProvider(cp);
+ conf.setCodecProvider(cp);
+ }
+
+ final RandomIndexWriter w = new RandomIndexWriter(random, dir, conf);
+
+ final int[][] idToOrds = new int[NUM_DOCS][];
+ final Set<Integer> ordsForDocSet = new HashSet<Integer>();
+
+ for(int id=0;id prefixes = new HashSet();
+ final int numPrefix = _TestUtil.nextInt(random, 2, 7);
+ if (VERBOSE) {
+ System.out.println("TEST: use " + numPrefix + " prefixes");
+ }
+ while(prefixes.size() < numPrefix) {
+ prefixes.add(_TestUtil.randomRealisticUnicodeString(random));
+ //prefixes.add(_TestUtil.randomSimpleString(random));
+ }
+ final String[] prefixesArray = prefixes.toArray(new String[prefixes.size()]);
+
+ final int NUM_TERMS = 100 * RANDOM_MULTIPLIER;
+ final Set<BytesRef> terms = new HashSet<BytesRef>();
+ while(terms.size() < NUM_TERMS) {
+ final String s = prefixesArray[random.nextInt(prefixesArray.length)] + _TestUtil.randomRealisticUnicodeString(random);
+ //final String s = prefixesArray[random.nextInt(prefixesArray.length)] + _TestUtil.randomSimpleString(random);
+ if (s.length() > 0) {
+ terms.add(new BytesRef(s));
+ }
+ }
+ final BytesRef[] termsArray = terms.toArray(new BytesRef[terms.size()]);
+ Arrays.sort(termsArray);
+
+ final int NUM_DOCS = 1000 * RANDOM_MULTIPLIER;
+
+ IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer());
+
+ // Sometimes swap in codec that impls ord():
+ if (random.nextInt(10) == 7) {
+ // Make sure terms index has ords:
+ CoreCodecProvider cp = new CoreCodecProvider();
+ cp.register(new StandardCodecWithOrds());
+ cp.setDefaultFieldCodec("StandardOrds");
+
+ // So checkIndex on close works
+ dir.setCodecProvider(cp);
+ conf.setCodecProvider(cp);
+ }
+
+ final RandomIndexWriter w = new RandomIndexWriter(random, dir, conf);
+
+ final int[][] idToOrds = new int[NUM_DOCS][];
+ final Set<Integer> ordsForDocSet = new HashSet<Integer>();
+
+ for(int id=0;id (1<<24)*.9) {
- SolrCore.log.warn("Approaching too many values for UnInvertedField faceting on field '"+field+"' : bucket size=" + target.length);
- }
- }
-
- tnums[pass] = target;
-
- if ((pass << 16) > maxDoc)
- break;
- }
}
- long endTime = System.currentTimeMillis();
-
- total_time = (int)(endTime-startTime);
- phase1_time = (int)(midPoint-startTime);
-
SolrCore.log.info("UnInverted multi-valued field " + toString());
+ //System.out.println("CREATED: " + toString() + " ti.index=" + ti.index);
}
-
-
+ public int getNumTerms() {
+ return numTermsInField;
+ }
public NamedList getCounts(SolrIndexSearcher searcher, DocSet baseDocs, int offset, int limit, Integer mincount, boolean missing, String sort, String prefix) throws IOException {
use.incrementAndGet();
@@ -468,6 +206,7 @@
int baseSize = docs.size();
int maxDoc = searcher.maxDoc();
+ //System.out.println("GET COUNTS field=" + field + " baseSize=" + baseSize + " minCount=" + mincount + " maxDoc=" + maxDoc + " numTermsInField=" + numTermsInField);
if (baseSize >= mincount) {
final int[] index = this.index;
@@ -481,14 +220,20 @@
int startTerm = 0;
int endTerm = numTermsInField; // one past the end
- NumberedTermsEnum te = ti.getEnumerator(searcher.getIndexReader());
+ TermsEnum te = getOrdTermsEnum(searcher.getIndexReader());
if (prefix != null && prefix.length() > 0) {
BytesRef prefixBr = new BytesRef(prefix);
- te.skipTo(prefixBr);
- startTerm = te.getTermNumber();
+ if (te.seek(prefixBr, true) == TermsEnum.SeekStatus.END) {
+ startTerm = numTermsInField;
+ } else {
+ startTerm = (int) te.ord();
+ }
prefixBr.append(ByteUtils.bigTerm);
- te.skipTo(prefixBr);
- endTerm = te.getTermNumber();
+ if (te.seek(prefixBr, true) == TermsEnum.SeekStatus.END) {
+ endTerm = numTermsInField;
+ } else {
+ endTerm = (int) te.ord();
+ }
}
/***********
@@ -514,13 +259,18 @@
docs = new BitDocSet(bs, maxDoc - baseSize);
// simply negating will mean that we have deleted docs in the set.
// that should be OK, as their entries in our table should be empty.
+ //System.out.println(" NEG");
}
// For the biggest terms, do straight set intersections
for (TopTerm tt : bigTerms.values()) {
+ //System.out.println(" do big termNum=" + tt.termNum + " term=" + tt.term.utf8ToString());
// TODO: counts could be deferred if sorted==false
if (tt.termNum >= startTerm && tt.termNum < endTerm) {
- counts[tt.termNum] = searcher.numDocs(new TermQuery(new Term(ti.field, tt.term)), docs);
+ counts[tt.termNum] = searcher.numDocs(new TermQuery(new Term(field, tt.term)), docs);
+ //System.out.println(" count=" + counts[tt.termNum]);
+ } else {
+ //System.out.println("SKIP term=" + tt.termNum);
}
}
@@ -537,9 +287,11 @@
DocIterator iter = docs.iterator();
while (iter.hasNext()) {
int doc = iter.nextDoc();
+ //System.out.println("iter doc=" + doc);
int code = index[doc];
if ((code & 0xff)==1) {
+ //System.out.println(" ptr");
int pos = code>>>8;
int whichArray = (doc >>> 16) & 0xff;
byte[] arr = tnums[whichArray];
@@ -553,9 +305,11 @@
}
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
+ //System.out.println(" tnum=" + tnum);
counts[tnum]++;
}
} else {
+ //System.out.println(" inlined");
int tnum = 0;
int delta = 0;
for (;;) {
@@ -563,6 +317,7 @@
if ((code & 0x80)==0) {
if (delta==0) break;
tnum += delta - TNUM_OFFSET;
+ //System.out.println(" tnum=" + tnum);
counts[tnum]++;
delta = 0;
}
@@ -583,6 +338,7 @@
LongPriorityQueue queue = new LongPriorityQueue(Math.min(maxsize,1000), maxsize, Long.MIN_VALUE);
int min=mincount-1; // the smallest value in the top 'N' values
+ //System.out.println("START=" + startTerm + " END=" + endTerm);
for (int i=startTerm; imin) {
@@ -641,11 +397,14 @@
}
});
- // convert the term numbers to term values and set as the label
+ // convert the term numbers to term values and set
+ // as the label
+ //System.out.println("sortStart=" + sortedIdxStart + " end=" + sortedIdxEnd);
for (int i=sortedIdxStart; i getComparator() throws IOException {
- return tenum.getComparator();
- }
-
- public DocsEnum getDocsEnum() throws IOException {
- docsEnum = tenum.docs(deletedDocs, docsEnum);
- return docsEnum;
- }
-
- protected BytesRef setTerm() throws IOException {
- termText = tenum.term();
- if (tindex.prefix != null && !termText.startsWith(tindex.prefix)) {
- termText = null;
- }
- return termText;
- }
-
- @Override
- public BytesRef next() throws IOException {
- pos++;
- if (tenum.next() == null) {
- termText = null;
- return null;
- }
- return setTerm(); // this is extra work if we know we are in bounds...
- }
-
- @Override
- public BytesRef term() {
- return termText;
- }
-
- @Override
- public int docFreq() throws IOException {
- return tenum.docFreq();
- }
-
- @Override
- public long totalTermFreq() throws IOException {
- return tenum.totalTermFreq();
- }
-
- public BytesRef skipTo(BytesRef target) throws IOException {
-
- // already here
- if (termText != null && termText.equals(target)) return termText;
-
- if (tenum == null) {
- return null;
- }
-
- int startIdx = Arrays.binarySearch(tindex.index,target);
-
- if (startIdx >= 0) {
- // we hit the term exactly... lucky us!
- TermsEnum.SeekStatus seekStatus = tenum.seek(target);
- assert seekStatus == TermsEnum.SeekStatus.FOUND;
- pos = startIdx << tindex.intervalBits;
- return setTerm();
- }
-
- // we didn't hit the term exactly
- startIdx=-startIdx-1;
-
- if (startIdx == 0) {
- // our target occurs *before* the first term
- TermsEnum.SeekStatus seekStatus = tenum.seek(target);
- assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
- pos = 0;
- return setTerm();
- }
-
- // back up to the start of the block
- startIdx--;
-
- if ((pos >> tindex.intervalBits) == startIdx && termText != null && termText.compareTo(target)<=0) {
- // we are already in the right block and the current term is before the term we want,
- // so we don't need to seek.
- } else {
- // seek to the right block
- TermsEnum.SeekStatus seekStatus = tenum.seek(tindex.index[startIdx]);
- assert seekStatus == TermsEnum.SeekStatus.FOUND;
- pos = startIdx << tindex.intervalBits;
- setTerm(); // should be non-null since it's in the index
- }
-
- while (termText != null && termText.compareTo(target) < 0) {
- next();
- }
-
- return termText;
- }
-
- public BytesRef skipTo(int termNumber) throws IOException {
- int delta = termNumber - pos;
- if (delta < 0 || delta > tindex.interval || tenum==null) {
- int idx = termNumber >>> tindex.intervalBits;
- BytesRef base = tindex.index[idx];
- pos = idx << tindex.intervalBits;
- delta = termNumber - pos;
- TermsEnum.SeekStatus seekStatus = tenum.seek(base);
- assert seekStatus == TermsEnum.SeekStatus.FOUND;
- }
- while (--delta >= 0) {
- BytesRef br = tenum.next();
- if (br == null) {
- termText = null;
- return null;
- }
- ++pos;
- }
- return setTerm();
- }
-
- protected void close() throws IOException {
- // no-op, needed so the anon subclass that does indexing
- // can build its index
- }
-
- /** The current term number, starting at 0.
- * Only valid if the previous call to next() or skipTo() returned true.
- */
- public int getTermNumber() {
- return pos;
- }
-
- @Override
- public long ord() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public SeekStatus seek(long ord) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public SeekStatus seek(BytesRef target, boolean useCache) {
- throw new UnsupportedOperationException();
- }
-}
-
-
-/**
- * Class to save memory by only storing every nth term (for random access), while
- * numbering the terms, allowing them to be retrieved later by number.
- * This is only valid when used with the IndexReader it was created with.
- * The IndexReader is not actually stored to facilitate caching by using it as a key in
- * a weak hash map.
- */
-class TermIndex {
- final static int intervalBits = 7; // decrease to a low number like 2 for testing
- final static int intervalMask = 0xffffffff >>> (32-intervalBits);
- final static int interval = 1 << intervalBits;
-
- final String field;
- final BytesRef prefix;
- BytesRef[] index;
- int nTerms;
- long sizeOfStrings;
-
- TermIndex(String field) {
- this(field, null);
- }
-
- TermIndex(String field, String prefix) {
- this.field = field;
- this.prefix = prefix == null ? null : new BytesRef(prefix);
- }
-
- NumberedTermsEnum getEnumerator(IndexReader reader, int termNumber) throws IOException {
- NumberedTermsEnum te = new NumberedTermsEnum(reader, this);
- te.skipTo(termNumber);
- return te;
- }
-
- /* The first time an enumerator is requested, it should be used
- with next() to fully traverse all of the terms so the index
- will be built.
- */
- NumberedTermsEnum getEnumerator(IndexReader reader) throws IOException {
- if (index==null) return new NumberedTermsEnum(reader,this, prefix==null?new BytesRef():prefix, 0) {
- ArrayList<BytesRef> lst;
- PagedBytes bytes;
-
- @Override
- protected BytesRef setTerm() throws IOException {
- BytesRef br = super.setTerm();
- if (br != null && (pos & intervalMask)==0) {
- sizeOfStrings += br.length;
- if (lst==null) {
- lst = new ArrayList<BytesRef>();
- bytes = new PagedBytes(15);
- }
- BytesRef out = new BytesRef();
- bytes.copy(br, out);
- lst.add(out);
- }
- return br;
- }
-
- @Override
- public BytesRef skipTo(int termNumber) throws IOException {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public void close() throws IOException {
- nTerms=pos;
- super.close();
- index = lst!=null ? lst.toArray(new BytesRef[lst.size()]) : new BytesRef[0];
- }
- };
- else return new NumberedTermsEnum(reader,this,new BytesRef(),0);
- }
-
-
- /**
- * Returns the approximate amount of memory taken by this TermIndex.
- * This is only an approximation and doesn't take into account java object overhead.
- *
- * @return
- * the approximate memory consumption in bytes
- */
- public long memSize() {
- // assume 8 byte references?
- return 8+8+8+8+(index.length<<3)+sizeOfStrings;
- }
-}
-
Index: solr/src/test/org/apache/solr/request/TestFaceting.java
--- solr/src/test/org/apache/solr/request/TestFaceting.java Wed Mar 30 18:44:50 2011 -0400
+++ solr/src/test/org/apache/solr/request/TestFaceting.java Wed Mar 30 19:19:07 2011 -0400
@@ -17,14 +17,17 @@
package org.apache.solr.request;
+import java.util.Locale;
+import java.util.Random;
+
+import org.apache.lucene.index.DocTermOrds;
import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.After;
import org.junit.BeforeClass;
import org.junit.Test;
-import java.util.Locale;
-import java.util.Random;
/**
* @version $Id$
@@ -62,43 +65,47 @@
}
void doTermEnum(int size) throws Exception {
+ //System.out.println("doTermEnum size=" + size);
close();
createIndex(size);
req = lrf.makeRequest("q","*:*");
- TermIndex ti = new TermIndex(proto.field());
- NumberedTermsEnum te = ti.getEnumerator(req.getSearcher().getIndexReader());
+ UnInvertedField uif = new UnInvertedField(proto.field(), req.getSearcher());
- // iterate through first
- while(te.term() != null) te.next();
- assertEquals(size, te.getTermNumber());
- te.close();
+ assertEquals(size, uif.getNumTerms());
- te = ti.getEnumerator(req.getSearcher().getIndexReader());
+ TermsEnum te = uif.getOrdTermsEnum(req.getSearcher().getIndexReader());
+ assertEquals(size == 0, te == null);
Random r = new Random(size);
// test seeking by term string
for (int i=0; i0, te.skipTo(new BytesRef("000")) != null);
- assertEquals(0, te.getTermNumber());
if (size>0) {
+ assertEquals(size>0, te.seek(new BytesRef("000"), true) != TermsEnum.SeekStatus.END);
+ assertEquals(0, te.ord());
assertEquals(t(0), te.term().utf8ToString());
- } else {
- assertEquals(null, te.term());
}
if (size>0) {
@@ -106,9 +113,10 @@
for (int i=0; i