Index: lucene/src/java/org/apache/lucene/index/DocTermOrds.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ lucene/src/java/org/apache/lucene/index/DocTermOrds.java Tue Mar 29 15:49:08 2011 -0400
@@ -0,0 +1,745 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import org.apache.lucene.util.PagedBytes;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Bits;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Comparator;
+
+// nocommit TODO
+// - jdoc explain int vs long ord
+// - allow specifying interval
+// - jdoc how del docs handled
+// - jdoc 2.1B unique term limit
+// - should we pull in this "emulated terms index"
+// here...?
+// - NOTE that a doc's ords come back in sorted order, ie,
+// not in the order the terms occurred in the doc
+
+/**
+ *
+ * Final form of the un-inverted field:
+ * Each document points to a list of term numbers that are contained in that document.
+ *
+ * Term numbers are in sorted order, and are encoded as variable-length deltas from the
+ * previous term number. Real term numbers start at 2 since 0 and 1 are reserved. A
+ * term number of 0 signals the end of the termNumber list.
+ *
+ * There is a single int[maxDoc()] which either contains a pointer into a byte[] for
+ * the termNumber lists, or directly contains the termNumber list if it fits in the 4
+ * bytes of an integer. If the first byte in the integer is 1, the next 3 bytes
+ * are a pointer into a byte[] where the termNumber list starts.
+ *
+ * There are actually 256 byte arrays, to compensate for the fact that the pointers
+ * into the byte arrays are only 3 bytes long. The correct byte array for a document
+ * is a function of its id.
+ *
+ * To save space and speed up faceting, any term that matches enough documents will
+ * not be un-inverted... it will be skipped while building the un-inverted field structure,
+ * and will use a set intersection method during faceting.
+ *
+ * To further save memory, the terms (the actual string values) are not all stored in
+ * memory, but a TermIndex is used to convert term numbers to term values only
+ * for the terms needed after faceting has completed. Only every 128th term value
+ * is stored, along with its corresponding term number, and this is used as an
+ * index to find the closest term and iterate until the desired number is hit (very
+ * much like Lucene's own internal term index).
+ *
+ * @lucene.experimental
+ */
+
+public class DocTermOrds {
+
+ // Term ords are shifted by this, internally, to reserve
+ // values 0 (end term) and 1 (index is a pointer into byte array)
+ private final static int TNUM_OFFSET = 2;
+ public final static int INDEX_INTERVAL_BITS = 7; // decrease to a low number like 2 for testing
+ public final static int INDEX_INTERVAL_MASK = 0xffffffff >>> (32-INDEX_INTERVAL_BITS);
+ public final static int INDEX_INTERVAL = 1 << INDEX_INTERVAL_BITS;
+
+ protected final int maxTermDocFreq;
+
+ protected final String field;
+
+ protected int numTermsInField;
+ protected long termInstances; // total number of references to term numbers
+ private long memsz;
+ protected int total_time; // total time to uninvert the field
+ protected int phase1_time; // time for phase1 of the uninvert process
+
+ protected int[] index;
+ protected byte[][] tnums = new byte[256][];
+ protected long sizeOfIndexedStrings;
+ protected BytesRef[] indexedTermsArray;
+ protected BytesRef prefix;
+ protected int ordBase;
+
+ public long ramUsedInBytes() {
+ // can cache the mem size since it shouldn't change
+ if (memsz!=0) return memsz;
+ long sz = 8*8 + 32; // local fields
+ if (index != null) sz += index.length * 4;
+ if (tnums!=null) {
+ for (byte[] arr : tnums)
+ if (arr != null) sz += arr.length;
+ }
+ memsz = sz;
+ return sz;
+ }
+
+ public DocTermOrds(IndexReader reader, String field) throws IOException {
+ this(reader, field, null, Integer.MAX_VALUE);
+ }
+
+ public DocTermOrds(IndexReader reader, String field, BytesRef termPrefix) throws IOException {
+ this(reader, field, termPrefix, Integer.MAX_VALUE);
+ }
+
+ /** If a term's docFreq is > maxTermDocFreq, then it is skipped. */
+ public DocTermOrds(IndexReader reader, String field, BytesRef termPrefix, int maxTermDocFreq) throws IOException {
+ this(field, maxTermDocFreq);
+ uninvert(reader, termPrefix);
+ }
+
+ /** Subclass inits w/ this, but be sure you then call
+ * uninvert! */
+ protected DocTermOrds(String field, int maxTermDocFreq) throws IOException {
+ this.field = field;
+ this.maxTermDocFreq = maxTermDocFreq;
+ }
+
+ /** Returns a TermsEnum that implements ord. If the
+ * provided reader supports ord, we just return its
+ * TermsEnum; if it does not, we build a "private" terms
+ * index internally (WARNING: consumes RAM) and use that
+ * index to implement ord. This also enables ord on top
+ * of a composite reader. The returned TermsEnum is
+ * unpositioned. Returns null if there are no terms.
+ *
+ *
+ * NOTE: you must pass the same reader that was
+ * used when creating this class */
+ public TermsEnum getOrdTermsEnum(IndexReader reader) throws IOException {
+ if (indexedTermsArray == null) {
+ //System.out.println("GET normal enum");
+ // nocommit -- is this wrong for the prefix case?
+ final Terms terms = MultiFields.getTerms(reader, field);
+ if (terms != null) {
+ return terms.iterator();
+ } else {
+ return null;
+ }
+ } else if (termInstances > 0) {
+ //System.out.println("GET wrapped enum");
+ return new OrdWrappedTermsEnum(reader);
+ } else {
+ return null;
+ }
+ }
+
+ /** Subclass can override this */
+ protected void visitTerm(TermsEnum te, int termNum) throws IOException {
+ }
+
+ protected void setActualDocFreq(int termNum, int df) throws IOException {
+ }
+
+ // Call this only once (if you subclass!)
+ protected void uninvert(final IndexReader reader, final BytesRef termPrefix) throws IOException {
+ //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
+ final long startTime = System.currentTimeMillis();
+ prefix = termPrefix == null ? null : new BytesRef(termPrefix);
+
+ final int maxDoc = reader.maxDoc();
+ final int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
+ final int[] lastTerm = new int[maxDoc]; // last term we saw for this document
+ final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)
+
+ final Terms terms = MultiFields.getTerms(reader, field);
+ if (terms == null) {
+ // No terms
+ return;
+ }
+
+ final TermsEnum te = terms.iterator();
+ final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
+ if (te.seek(seekStart) == TermsEnum.SeekStatus.END) {
+ // No terms match
+ // nocommit must test this
+ return;
+ }
+
+ // If we need our "term index wrapper", these will be
+ // init'd below:
+ List<BytesRef> indexedTerms = null;
+ PagedBytes indexedTermsBytes = null;
+
+ boolean testedOrd = false;
+
+ final Bits delDocs = MultiFields.getDeletedDocs(reader);
+
+ // we need a minimum of 9 bytes, but round up to 12 since the space would
+ // be wasted with most allocators anyway.
+ byte[] tempArr = new byte[12];
+
+ //
+ // enumerate all terms, and build an intermediate form of the un-inverted field.
+ //
+ // During this intermediate form, every document has a (potential) byte[]
+ // and the int[maxDoc()] array either contains the termNumber list directly
+ // or the *end* offset of the termNumber list in its byte array (for faster
+ // appending and faster creation of the final form).
+ //
+ // idea... if things are too large while building, we could do a range of docs
+ // at a time (but it would be a fair amount slower to build)
+ // could also do ranges in parallel to take advantage of multiple CPUs
+
+ // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
+ // values. This requires going over the field first to find the most
+ // frequent terms ahead of time.
+
+ int termNum = 0;
+ DocsEnum docsEnum = null;
+
+ // Loop begins with te positioned to first term (we call
+ // seek above):
+ for (;;) {
+ final BytesRef t = te.term();
+ if (t == null || (termPrefix != null && !t.startsWith(termPrefix))) {
+ break;
+ }
+ //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);
+
+ if (!testedOrd) {
+ try {
+ ordBase = (int) te.ord();
+ } catch (UnsupportedOperationException uoe) {
+ // Reader cannot provide ord support, so we wrap
+ // our own support by creating our own terms index:
+ indexedTerms = new ArrayList<BytesRef>();
+ indexedTermsBytes = new PagedBytes(15);
+ }
+ testedOrd = true;
+ }
+
+ visitTerm(te, termNum);
+
+ if (indexedTerms != null && (termNum & INDEX_INTERVAL_MASK) == 0) {
+ // Index this term
+ sizeOfIndexedStrings += t.length;
+ BytesRef indexedTerm = new BytesRef();
+ indexedTermsBytes.copy(t, indexedTerm);
+ // TODO: really should 1) strip off useless suffix,
+ // and 2) use FST not array/PagedBytes
+ indexedTerms.add(indexedTerm);
+ }
+
+ final int df = te.docFreq();
+ if (df <= maxTermDocFreq) {
+
+ docsEnum = te.docs(delDocs, docsEnum);
+
+ final DocsEnum.BulkReadResult bulkResult = docsEnum.getBulkResult();
+
+ // dF, but takes deletions into account
+ int actualDF = 0;
+
+ for (;;) {
+ int chunk = docsEnum.read();
+ if (chunk <= 0) {
+ break;
+ }
+ //System.out.println(" chunk=" + chunk + " docs");
+
+ actualDF += chunk;
+
+ for (int i=0; i<chunk; i++) {
+ termInstances++;
+ final int doc = bulkResult.docs.ints[i];
+ // add TNUM_OFFSET to the delta to make room for the
+ // reserved values 0 (end term) and 1 (pointer follows)
+ int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
+ lastTerm[doc] = termNum;
+ int val = index[doc];
+
+ if ((val & 0xff)==1) {
+ // index into byte array (actually the end of
+ // the doc-specific byte[] when building)
+ int pos = val >>> 8;
+ int ilen = vIntSize(delta);
+ byte[] arr = bytes[doc];
+ int newend = pos+ilen;
+ if (newend > arr.length) {
+ // We avoid a doubling strategy to lower memory usage.
+ // this faceting method isn't for docs with many terms.
+ int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
+ byte[] newarr = new byte[newLen];
+ System.arraycopy(arr, 0, newarr, 0, pos);
+ arr = newarr;
+ bytes[doc] = newarr;
+ }
+ pos = writeInt(delta, arr, pos);
+ index[doc] = (pos<<8) | 1; // update pointer to end index in byte[]
+ } else {
+ // OK, this int has data in it... find the end (a zero starting byte - not
+ // part of another number, hence not following a byte with the high bit set).
+ int ipos;
+ if (val==0) {
+ ipos=0;
+ } else if ((val & 0x0000ff80)==0) {
+ ipos=1;
+ } else if ((val & 0x00ff8000)==0) {
+ ipos=2;
+ } else if ((val & 0xff800000)==0) {
+ ipos=3;
+ } else {
+ ipos=4;
+ }
+
+ int endPos = writeInt(delta, tempArr, ipos);
+ if (endPos <= 4) {
+ // value will fit in the integer... move bytes back
+ for (int j=ipos; j<endPos; j++) {
+ val |= (tempArr[j] & 0xff) << (j<<3);
+ }
+ index[doc] = val;
+ } else {
+ // value won't fit... move integer into byte[]
+ for (int j=0; j<ipos; j++) {
+ tempArr[j] = (byte)val;
+ val >>>=8;
+ }
+ // point at the end index in the byte[]
+ index[doc] = (endPos<<8) | 1;
+ bytes[doc] = tempArr;
+ tempArr = new byte[12];
+ }
+ }
+ }
+ }
+ setActualDocFreq(termNum, actualDF);
+ }
+
+ termNum++;
+ if (te.next() == null) {
+ break;
+ }
+ }
+
+ numTermsInField = termNum;
+
+ long midPoint = System.currentTimeMillis();
+
+ if (termInstances == 0) {
+ // we didn't invert anything
+ // lower memory consumption.
+ tnums = null;
+ } else {
+
+ this.index = index;
+
+ //
+ // transform intermediate form into the final form, building a single byte[]
+ // at a time, and releasing the intermediate byte[]s as we go to avoid
+ // increasing the memory footprint.
+ //
+
+ for (int pass = 0; pass<256; pass++) {
+ byte[] target = tnums[pass];
+ int pos=0; // end in target;
+ if (target != null) {
+ pos = target.length;
+ } else {
+ target = new byte[4096];
+ }
+
+ // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
+ // where pp is the pass (which array we are building), and xx is all values.
+ // each pass shares the same byte[] for termNumber lists.
+ for (int docbase = pass<<16; docbase<maxDoc; docbase+=(1<<24)) {
+ int lim = Math.min(docbase + (1<<16), maxDoc);
+ for (int doc=docbase; doc<lim; doc++) {
+ int val = index[doc];
+ if ((val&0xff) == 1) {
+ int len = val >>> 8;
+ index[doc] = (pos<<8)|1; // change index to point to start of array
+ if ((pos & 0xff000000) != 0) {
+ // we only have 24 bits for the array index
+ throw new IllegalStateException("Too many values for UnInvertedField faceting on field "+field);
+ }
+ byte[] arr = bytes[doc];
+ bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
+ if (target.length <= pos + len) {
+ int newlen = target.length;
+ while (newlen <= pos + len) newlen<<=1; // doubling strategy
+ byte[] newtarget = new byte[newlen];
+ System.arraycopy(target, 0, newtarget, 0, pos);
+ target = newtarget;
+ }
+ System.arraycopy(arr, 0, target, pos, len);
+ pos += len + 1; // skip single byte at end and leave it 0 for terminator
+ }
+ }
+ }
+
+ // shrink array
+ if (pos < target.length) {
+ byte[] newtarget = new byte[pos];
+ System.arraycopy(target, 0, newtarget, 0, pos);
+ target = newtarget;
+ }
+
+ tnums[pass] = target;
+
+ if ((pass << 16) > maxDoc)
+ break;
+ }
+
+ if (indexedTerms != null) {
+ indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);
+ }
+ }
+
+ long endTime = System.currentTimeMillis();
+
+ total_time = (int)(endTime-startTime);
+ phase1_time = (int)(midPoint-startTime);
+ }
+
+ /** Number of bytes to represent an unsigned int as a vint. */
+ private static int vIntSize(int x) {
+ if ((x & (0xffffffff << (7*1))) == 0 ) {
+ return 1;
+ }
+ if ((x & (0xffffffff << (7*2))) == 0 ) {
+ return 2;
+ }
+ if ((x & (0xffffffff << (7*3))) == 0 ) {
+ return 3;
+ }
+ if ((x & (0xffffffff << (7*4))) == 0 ) {
+ return 4;
+ }
+ return 5;
+ }
+
+ // todo: if we know the size of the vInt already, we could do
+ // a single switch on the size
+ private static int writeInt(int x, byte[] arr, int pos) {
+ int a;
+ a = (x >>> (7*4));
+ if (a != 0) {
+ arr[pos++] = (byte)(a | 0x80);
+ }
+ a = (x >>> (7*3));
+ if (a != 0) {
+ arr[pos++] = (byte)(a | 0x80);
+ }
+ a = (x >>> (7*2));
+ if (a != 0) {
+ arr[pos++] = (byte)(a | 0x80);
+ }
+ a = (x >>> (7*1));
+ if (a != 0) {
+ arr[pos++] = (byte)(a | 0x80);
+ }
+ arr[pos++] = (byte)(x & 0x7f);
+ return pos;
+ }
+
+ // nocommit -- test the 0 case (doc w/ no terms)
+ public class TermOrdsIterator {
+ private int tnum;
+ private int upto;
+ private byte[] arr;
+
+ /** Buffer must be at least 5 ints long. Returns number
+ * of term ords placed into buffer; if this count is
+ * less than buffer.length then that is the end. */
+ public int read(int[] buffer) {
+ int bufferUpto = 0;
+ if (arr == null) {
+ // code is inlined into upto
+ //System.out.println("inlined");
+ int code = upto;
+ int delta = 0;
+ for (;;) {
+ delta = (delta << 7) | (code & 0x7f);
+ if ((code & 0x80)==0) {
+ if (delta==0) break;
+ tnum += delta - TNUM_OFFSET;
+ buffer[bufferUpto++] = ordBase+tnum;
+ //System.out.println(" tnum=" + tnum);
+ delta = 0;
+ }
+ code >>>= 8;
+ }
+ } else {
+ // code is a pointer
+ for(;;) {
+ int delta = 0;
+ for(;;) {
+ byte b = arr[upto++];
+ delta = (delta << 7) | (b & 0x7f);
+ //System.out.println(" cycle: upto=" + upto + " delta=" + delta + " b=" + b);
+ if ((b & 0x80) == 0) break;
+ }
+ //System.out.println(" delta=" + delta);
+ if (delta == 0) break;
+ tnum += delta - TNUM_OFFSET;
+ //System.out.println(" tnum=" + tnum);
+ buffer[bufferUpto++] = ordBase+tnum;
+ if (bufferUpto == buffer.length) {
+ break;
+ }
+ }
+ }
+
+ return bufferUpto;
+ }
+
+ public TermOrdsIterator reset(int docID) {
+ //System.out.println(" reset docID=" + docID);
+ tnum = 0;
+ final int code = index[docID];
+ if ((code & 0xff)==1) {
+ // a pointer
+ upto = code>>>8;
+ //System.out.println(" pointer! upto=" + upto);
+ int whichArray = (docID >>> 16) & 0xff;
+ arr = tnums[whichArray];
+ } else {
+ //System.out.println(" inline!");
+ arr = null;
+ upto = code;
+ }
+ return this;
+ }
+ }
+
+ /** Returns an iterator to step through the term ords for
+ * this document. It's also possible to subclass this
+ * class and directly access members. */
+ public TermOrdsIterator lookup(int doc, TermOrdsIterator reuse) {
+ final TermOrdsIterator ret;
+ if (reuse != null) {
+ ret = reuse;
+ } else {
+ ret = new TermOrdsIterator();
+ }
+ return ret.reset(doc);
+ }
+
+ /* Only used if original IndexReader doesn't implement
+ * ord; in this case we "wrap" our own terms index
+ * around it. */
+ private final class OrdWrappedTermsEnum extends TermsEnum {
+ private final IndexReader reader;
+ private final TermsEnum termsEnum;
+ private BytesRef term;
+ private long ord = -1;
+
+ public OrdWrappedTermsEnum(IndexReader reader) throws IOException {
+ this.reader = reader;
+ assert indexedTermsArray != null;
+ termsEnum = MultiFields.getTerms(reader, field).iterator();
+ }
+
+ @Override
+ public Comparator<BytesRef> getComparator() throws IOException {
+ return termsEnum.getComparator();
+ }
+
+ @Override
+ public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
+ return termsEnum.docs(skipDocs, reuse);
+ }
+
+ @Override
+ public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+ return termsEnum.docsAndPositions(skipDocs, reuse);
+ }
+
+ @Override
+ public BytesRef term() {
+ return term;
+ }
+
+ @Override
+ public BytesRef next() throws IOException {
+ ord++;
+ if (termsEnum.next() == null) {
+ term = null;
+ return null;
+ }
+ return setTerm(); // this is extra work if we know we are in bounds...
+ }
+
+ @Override
+ public int docFreq() throws IOException {
+ return termsEnum.docFreq();
+ }
+
+ @Override
+ public long totalTermFreq() throws IOException {
+ return termsEnum.totalTermFreq();
+ }
+
+ @Override
+ public long ord() throws IOException {
+ return ord;
+ }
+
+ @Override
+ public SeekStatus seek(BytesRef target, boolean useCache) throws IOException {
+
+ // already here
+ if (term != null && term.equals(target)) {
+ return SeekStatus.FOUND;
+ }
+
+ int startIdx = Arrays.binarySearch(indexedTermsArray, target);
+
+ if (startIdx >= 0) {
+ // we hit the term exactly... lucky us!
+ TermsEnum.SeekStatus seekStatus = termsEnum.seek(target);
+ assert seekStatus == TermsEnum.SeekStatus.FOUND;
+ ord = startIdx << INDEX_INTERVAL_BITS;
+ setTerm();
+ assert term != null;
+ return SeekStatus.FOUND;
+ }
+
+ // we didn't hit the term exactly
+ startIdx = -startIdx-1;
+
+ if (startIdx == 0) {
+ // our target occurs *before* the first term
+ TermsEnum.SeekStatus seekStatus = termsEnum.seek(target);
+ assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
+ ord = 0;
+ setTerm();
+ assert term != null;
+ return SeekStatus.NOT_FOUND;
+ }
+
+ // back up to the start of the block
+ startIdx--;
+
+ if ((ord >> INDEX_INTERVAL_BITS) == startIdx && term != null && term.compareTo(target) <= 0) {
+ // we are already in the right block and the current term is before the term we want,
+ // so we don't need to seek.
+ } else {
+ // seek to the right block
+ TermsEnum.SeekStatus seekStatus = termsEnum.seek(indexedTermsArray[startIdx]);
+ assert seekStatus == TermsEnum.SeekStatus.FOUND;
+ ord = startIdx << INDEX_INTERVAL_BITS;
+ setTerm();
+ assert term != null; // should be non-null since it's in the index
+ }
+
+ while (term != null && term.compareTo(target) < 0) {
+ next();
+ }
+
+ if (term == null) {
+ return SeekStatus.END;
+ } else if (term.compareTo(target) == 0) {
+ return SeekStatus.FOUND;
+ } else {
+ return SeekStatus.NOT_FOUND;
+ }
+ }
+
+ @Override
+ public SeekStatus seek(long targetOrd) throws IOException {
+ int delta = (int) (targetOrd - ord);
+ //System.out.println(" seek(ord) targetOrd=" + targetOrd + " delta=" + delta + " ord=" + ord);
+ if (delta < 0 || delta > INDEX_INTERVAL) {
+ final int idx = (int) (targetOrd >>> INDEX_INTERVAL_BITS);
+ final BytesRef base = indexedTermsArray[idx];
+ //System.out.println(" do seek term=" + base.utf8ToString());
+ ord = idx << INDEX_INTERVAL_BITS;
+ delta = (int) (targetOrd - ord);
+ final TermsEnum.SeekStatus seekStatus = termsEnum.seek(base, true);
+ assert seekStatus == TermsEnum.SeekStatus.FOUND;
+ }
+
+ while (--delta >= 0) {
+ BytesRef br = termsEnum.next();
+ if (br == null) {
+ term = null;
+ return SeekStatus.END;
+ }
+ ord++;
+ }
+
+ setTerm();
+ assert term != null;
+ //System.out.println(" return term=" + term.utf8ToString());
+ return SeekStatus.FOUND;
+ }
+
+ private BytesRef setTerm() throws IOException {
+ term = termsEnum.term();
+ if (prefix != null && !term.startsWith(prefix)) {
+ term = null;
+ }
+ return term;
+ }
+ }
+
+ public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException {
+ TermsEnum.SeekStatus status = termsEnum.seek(ord);
+ assert status == TermsEnum.SeekStatus.FOUND;
+ return termsEnum.term();
+ }
+}
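
The new class above only shows up piecemeal in the hunks that follow, so here is a minimal usage sketch (not part of the patch; the DocTermOrdsUsage class name and the println output are illustrative). It mirrors TestDocTermOrds.testSimple further down: build the ords once per reader, then walk each document's term ords (which come back sorted, not in occurrence order) and map them back to terms via the ord-capable enum.

import java.io.IOException;

import org.apache.lucene.index.DocTermOrds;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class DocTermOrdsUsage {
  /** Print every (doc, ord, term) triple for the given field. */
  public static void dumpOrds(IndexReader reader, String field) throws IOException {
    DocTermOrds dto = new DocTermOrds(reader, field);
    TermsEnum te = dto.getOrdTermsEnum(reader);    // null if the field has no terms
    if (te == null) {
      return;
    }
    DocTermOrds.TermOrdsIterator iter = null;
    final int[] buffer = new int[5];               // read() requires at least 5 ints
    for (int docID = 0; docID < reader.maxDoc(); docID++) {
      iter = dto.lookup(docID, iter);              // reuse the iterator across docs
      for (;;) {
        final int count = iter.read(buffer);
        for (int i = 0; i < count; i++) {
          // ords are sorted per doc, not in the order the terms occurred
          BytesRef term = dto.lookupTerm(te, buffer[i]);
          System.out.println("doc=" + docID + " ord=" + buffer[i] + " term=" + term.utf8ToString());
        }
        if (count < buffer.length) {
          break;                                   // a short read signals the end of the list
        }
      }
    }
  }
}
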
Index: lucene/src/java/org/apache/lucene/index/IndexReader.java
--- lucene/src/java/org/apache/lucene/index/IndexReader.java Tue Mar 29 18:45:54 2011 +0000
+++ lucene/src/java/org/apache/lucene/index/IndexReader.java Tue Mar 29 15:49:08 2011 -0400
@@ -919,6 +919,16 @@
}
}
+ // nocommit jdoc
+ public static boolean indexExists(Directory directory, CodecProvider cp) throws IOException {
+ try {
+ new SegmentInfos().read(directory, cp);
+ return true;
+ } catch (IOException ioe) {
+ return false;
+ }
+ }
+
/** Returns the number of documents in this index. */
public abstract int numDocs();
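
A quick sketch of what the new indexExists overload does (not part of the patch; the class name and the RAMDirectory choice are illustrative): it simply answers whether a SegmentInfos can be read from the directory with the given CodecProvider, returning false instead of throwing when nothing is there.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.CoreCodecProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class IndexExistsSketch {
  public static void main(String[] args) throws IOException {
    Directory dir = new RAMDirectory();            // empty directory, nothing written yet
    CodecProvider cp = new CoreCodecProvider();    // provider the index would be read with
    // The overload just tries to read SegmentInfos with the given provider:
    System.out.println(IndexReader.indexExists(dir, cp));  // expected to print "false" here
  }
}
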
Index: lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java
--- lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java Tue Mar 29 18:45:54 2011 +0000
+++ lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java Tue Mar 29 15:49:08 2011 -0400
@@ -181,7 +181,7 @@
System.out.println("RIW.getReader: open new reader");
}
w.commit();
- return IndexReader.open(w.getDirectory(), new KeepOnlyLastCommitDeletionPolicy(), r.nextBoolean(), _TestUtil.nextInt(r, 1, 10));
+ return IndexReader.open(w.getDirectory(), new KeepOnlyLastCommitDeletionPolicy(), r.nextBoolean(), _TestUtil.nextInt(r, 1, 10), w.getConfig().getCodecProvider());
}
}
Index: lucene/src/test-framework/org/apache/lucene/store/MockDirectoryWrapper.java
--- lucene/src/test-framework/org/apache/lucene/store/MockDirectoryWrapper.java Tue Mar 29 18:45:54 2011 +0000
+++ lucene/src/test-framework/org/apache/lucene/store/MockDirectoryWrapper.java Tue Mar 29 15:49:08 2011 -0400
@@ -32,6 +32,7 @@
import java.util.Set;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
@@ -419,12 +420,27 @@
throw new RuntimeException("MockDirectoryWrapper: cannot close: there are still open files: " + openFiles, cause);
}
open = false;
- if (checkIndexOnClose && IndexReader.indexExists(this)) {
- _TestUtil.checkIndex(this);
+ if (checkIndexOnClose) {
+ if (codecProvider != null) {
+ if (IndexReader.indexExists(this, codecProvider)) {
+ _TestUtil.checkIndex(this, codecProvider);
+ }
+ } else {
+ if (IndexReader.indexExists(this)) {
+ _TestUtil.checkIndex(this);
+ }
+ }
}
delegate.close();
}
+ private CodecProvider codecProvider;
+
+ // We pass this CodecProvider to checkIndex when dir is closed...
+ public void setCodecProvider(CodecProvider cp) {
+ codecProvider = cp;
+ }
+
boolean open = true;
public synchronized boolean isOpen() {
Index: lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java
--- lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java Tue Mar 29 18:45:54 2011 +0000
+++ lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java Tue Mar 29 15:49:08 2011 -0400
@@ -157,6 +157,19 @@
return start + r.nextInt(end-start+1);
}
+ public static String simpleRandomString(Random r) {
+ final int end = r.nextInt(10);
+ if (end == 0) {
+ // allow 0 length
+ return "";
+ }
+ final char[] buffer = new char[end];
+ for (int i = 0; i < end; i++) {
+ buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
+ }
+ return new String(buffer, 0, end);
+ }
+
/** Returns random string, including full unicode range. */
public static String randomUnicodeString(Random r) {
return randomUnicodeString(r, 20);
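
For reference, a tiny driver (not part of the patch) showing what simpleRandomString produces: strings of length 0 to 9 drawn from the alphabet 'a'..'f', so duplicate terms and shared prefixes are common, which is what the ord tests below want.

import java.util.Random;

import org.apache.lucene.util._TestUtil;

public class SimpleRandomStringDemo {
  public static void main(String[] args) {
    Random r = new Random(42);
    for (int i = 0; i < 5; i++) {
      // each call returns 0-9 chars, each in the range 'a'..'f'
      System.out.println("\"" + _TestUtil.simpleRandomString(r) + "\"");
    }
  }
}
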
Index: lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java Tue Mar 29 15:49:08 2011 -0400
@@ -0,0 +1,358 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericField;
+import org.apache.lucene.index.DocTermOrds.TermOrdsIterator;
+import org.apache.lucene.index.codecs.BlockTermsReader;
+import org.apache.lucene.index.codecs.BlockTermsWriter;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.codecs.CoreCodecProvider;
+import org.apache.lucene.index.codecs.FieldsConsumer;
+import org.apache.lucene.index.codecs.FieldsProducer;
+import org.apache.lucene.index.codecs.FixedGapTermsIndexReader;
+import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter;
+import org.apache.lucene.index.codecs.PostingsReaderBase;
+import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermsIndexReaderBase;
+import org.apache.lucene.index.codecs.TermsIndexWriterBase;
+import org.apache.lucene.index.codecs.standard.StandardPostingsReader;
+import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MockDirectoryWrapper;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+// TODO:
+// - test w/ del docs
+// - test prefix
+// - test w/ cutoff
+// - crank docs way up so we get some merging sometimes
+
+public class TestDocTermOrds extends LuceneTestCase {
+
+ public void testSimple() throws Exception {
+ Directory dir = newDirectory();
+ final RandomIndexWriter w = new RandomIndexWriter(random, dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(newInOrderLogMergePolicy()));
+ Document doc = new Document();
+ Field field = newField("field", "", Field.Index.ANALYZED);
+ doc.add(field);
+ field.setValue("a b c");
+ w.addDocument(doc);
+
+ field.setValue("d e f");
+ w.addDocument(doc);
+
+ field.setValue("a f");
+ w.addDocument(doc);
+
+ final IndexReader r = w.getReader();
+ w.close();
+
+ final DocTermOrds dto = new DocTermOrds(r, "field");
+
+ TermOrdsIterator iter = dto.lookup(0, null);
+ final int[] buffer = new int[5];
+ assertEquals(3, iter.read(buffer));
+ assertEquals(0, buffer[0]);
+ assertEquals(1, buffer[1]);
+ assertEquals(2, buffer[2]);
+
+ iter = dto.lookup(1, iter);
+ assertEquals(3, iter.read(buffer));
+ assertEquals(3, buffer[0]);
+ assertEquals(4, buffer[1]);
+ assertEquals(5, buffer[2]);
+
+ iter = dto.lookup(2, iter);
+ assertEquals(2, iter.read(buffer));
+ assertEquals(0, buffer[0]);
+ assertEquals(5, buffer[1]);
+
+ r.close();
+ dir.close();
+ }
+
+ private static class StandardCodecWithOrds extends Codec {
+ public StandardCodecWithOrds() {
+ name = "StandardOrds";
+ }
+
+ @Override
+ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+ PostingsWriterBase docs = new StandardPostingsWriter(state);
+
+ // TODO: should we make the terms index more easily
+ // pluggable? Ie so that this codec would record which
+ // index impl was used, and switch on loading?
+ // Or... you must make a new Codec for this?
+ TermsIndexWriterBase indexWriter;
+ boolean success = false;
+ try {
+ indexWriter = new FixedGapTermsIndexWriter(state);
+ success = true;
+ } finally {
+ if (!success) {
+ docs.close();
+ }
+ }
+
+ success = false;
+ try {
+ FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs);
+ success = true;
+ return ret;
+ } finally {
+ if (!success) {
+ try {
+ docs.close();
+ } finally {
+ indexWriter.close();
+ }
+ }
+ }
+ }
+
+ public final static int TERMS_CACHE_SIZE = 1024;
+
+ @Override
+ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+ PostingsReaderBase postings = new StandardPostingsReader(state.dir, state.segmentInfo, state.readBufferSize, state.codecId);
+ TermsIndexReaderBase indexReader;
+
+ boolean success = false;
+ try {
+ indexReader = new FixedGapTermsIndexReader(state.dir,
+ state.fieldInfos,
+ state.segmentInfo.name,
+ state.termsIndexDivisor,
+ BytesRef.getUTF8SortedAsUnicodeComparator(),
+ state.codecId);
+ success = true;
+ } finally {
+ if (!success) {
+ postings.close();
+ }
+ }
+
+ success = false;
+ try {
+ FieldsProducer ret = new BlockTermsReader(indexReader,
+ state.dir,
+ state.fieldInfos,
+ state.segmentInfo.name,
+ postings,
+ state.readBufferSize,
+ TERMS_CACHE_SIZE,
+ state.codecId);
+ success = true;
+ return ret;
+ } finally {
+ if (!success) {
+ try {
+ postings.close();
+ } finally {
+ indexReader.close();
+ }
+ }
+ }
+ }
+
+ /** Extension of freq postings file */
+ static final String FREQ_EXTENSION = "frq";
+
+ /** Extension of prox postings file */
+ static final String PROX_EXTENSION = "prx";
+
+ @Override
+ public void files(Directory dir, SegmentInfo segmentInfo, String id, Set<String> files) throws IOException {
+ StandardPostingsReader.files(dir, segmentInfo, id, files);
+ BlockTermsReader.files(dir, segmentInfo, id, files);
+ FixedGapTermsIndexReader.files(dir, segmentInfo, id, files);
+ }
+
+ @Override
+ public void getExtensions(Set<String> extensions) {
+ getStandardExtensions(extensions);
+ }
+
+ public static void getStandardExtensions(Set<String> extensions) {
+ extensions.add(FREQ_EXTENSION);
+ extensions.add(PROX_EXTENSION);
+ BlockTermsReader.getExtensions(extensions);
+ FixedGapTermsIndexReader.getIndexExtensions(extensions);
+ }
+ }
+
+ public void testRandom() throws Exception {
+ MockDirectoryWrapper dir = newDirectory();
+
+ final int NUM_TERMS = 100 * RANDOM_MULTIPLIER;
+ final Set<BytesRef> terms = new HashSet<BytesRef>();
+ while(terms.size() < NUM_TERMS) {
+ // nocommit
+ //final String s = _TestUtil.randomRealisticUnicodeString(random);
+ final String s = _TestUtil.simpleRandomString(random);
+ if (s.length() > 0) {
+ terms.add(new BytesRef(s));
+ }
+ }
+ final BytesRef[] termsArray = terms.toArray(new BytesRef[terms.size()]);
+ Arrays.sort(termsArray);
+
+ final int NUM_DOCS = 1000 * RANDOM_MULTIPLIER;
+
+ IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer());
+
+ // Sometimes swap in codec that impls ord():
+ if (random.nextInt(10) == 7) {
+ // Make sure terms index has ords:
+ CoreCodecProvider cp = new CoreCodecProvider();
+ cp.register(new StandardCodecWithOrds());
+ cp.setDefaultFieldCodec("StandardOrds");
+
+ // So checkIndex on close works
+ dir.setCodecProvider(cp);
+ conf.setCodecProvider(cp);
+ }
+
+ final RandomIndexWriter w = new RandomIndexWriter(random, dir, conf);
+
+ final int[][] idToOrds = new int[NUM_DOCS][];
+ final Set<Integer> ordsForDocSet = new HashSet<Integer>();
+
+ for(int id=0;id<NUM_DOCS;id++) {
Index: solr/src/java/org/apache/solr/request/UnInvertedField.java
--- solr/src/java/org/apache/solr/request/UnInvertedField.java Tue Mar 29 18:45:54 2011 +0000
+++ solr/src/java/org/apache/solr/request/UnInvertedField.java Tue Mar 29 15:49:08 2011 -0400
- if (target.length > (1<<24)*.9) {
- SolrCore.log.warn("Approaching too many values for UnInvertedField faceting on field '"+field+"' : bucket size=" + target.length);
- }
- }
-
- tnums[pass] = target;
-
- if ((pass << 16) > maxDoc)
- break;
- }
}
- long endTime = System.currentTimeMillis();
-
- total_time = (int)(endTime-startTime);
- phase1_time = (int)(midPoint-startTime);
-
SolrCore.log.info("UnInverted multi-valued field " + toString());
+ //System.out.println("CREATED: " + toString() + " ti.index=" + ti.index);
}
-
-
+ public int getNumTerms() {
+ return numTermsInField;
+ }
public NamedList getCounts(SolrIndexSearcher searcher, DocSet baseDocs, int offset, int limit, Integer mincount, boolean missing, String sort, String prefix) throws IOException {
use.incrementAndGet();
@@ -468,6 +209,7 @@
int baseSize = docs.size();
int maxDoc = searcher.maxDoc();
+ //System.out.println("GET COUNTS field=" + field + " baseSize=" + baseSize + " minCount=" + mincount + " maxDoc=" + maxDoc + " numTermsInField=" + numTermsInField);
if (baseSize >= mincount) {
final int[] index = this.index;
@@ -481,14 +223,15 @@
int startTerm = 0;
int endTerm = numTermsInField; // one past the end
- NumberedTermsEnum te = ti.getEnumerator(searcher.getIndexReader());
+ TermsEnum te = getOrdTermsEnum(searcher.getIndexReader());
+ //System.out.println("GOT enum " + te);
if (prefix != null && prefix.length() > 0) {
BytesRef prefixBr = new BytesRef(prefix);
- te.skipTo(prefixBr);
- startTerm = te.getTermNumber();
+ te.seek(prefixBr, true);
+ startTerm = (int) te.ord();
prefixBr.append(ByteUtils.bigTerm);
- te.skipTo(prefixBr);
- endTerm = te.getTermNumber();
+ te.seek(prefixBr, true);
+ endTerm = (int) te.ord();
}
/***********
@@ -514,13 +257,18 @@
docs = new BitDocSet(bs, maxDoc - baseSize);
// simply negating will mean that we have deleted docs in the set.
// that should be OK, as their entries in our table should be empty.
+ //System.out.println(" NEG");
}
// For the biggest terms, do straight set intersections
for (TopTerm tt : bigTerms.values()) {
+ //System.out.println(" do big termNum=" + tt.termNum + " term=" + tt.term.utf8ToString());
// TODO: counts could be deferred if sorted==false
if (tt.termNum >= startTerm && tt.termNum < endTerm) {
- counts[tt.termNum] = searcher.numDocs(new TermQuery(new Term(ti.field, tt.term)), docs);
+ counts[tt.termNum] = searcher.numDocs(new TermQuery(new Term(field, tt.term)), docs);
+ //System.out.println(" count=" + counts[tt.termNum]);
+ } else {
+ //System.out.println("SKIP term=" + tt.termNum);
}
}
@@ -540,6 +288,7 @@
int code = index[doc];
if ((code & 0xff)==1) {
+ //System.out.println(" ptr");
int pos = code>>>8;
int whichArray = (doc >>> 16) & 0xff;
byte[] arr = tnums[whichArray];
@@ -553,9 +302,11 @@
}
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
+ //System.out.println(" tnum=" + tnum);
counts[tnum]++;
}
} else {
+ //System.out.println(" inlined");
int tnum = 0;
int delta = 0;
for (;;) {
@@ -563,6 +314,7 @@
if ((code & 0x80)==0) {
if (delta==0) break;
tnum += delta - TNUM_OFFSET;
+ //System.out.println(" tnum=" + tnum);
counts[tnum]++;
delta = 0;
}
@@ -668,8 +420,6 @@
res.add(label, c);
}
}
-
- te.close();
}
@@ -678,6 +428,8 @@
res.add(null, SimpleFacets.getFieldMissingCount(searcher, baseDocs, field));
}
+ //System.out.println(" res=" + res);
+
return res;
}
@@ -731,8 +483,7 @@
final int[] index = this.index;
final int[] counts = new int[numTermsInField];//keep track of the number of times we see each word in the field for all the documents in the docset
- NumberedTermsEnum te = ti.getEnumerator(searcher.getIndexReader());
-
+ TermsEnum te = getOrdTermsEnum(searcher.getIndexReader());
boolean doNegative = false;
if (finfo.length == 0) {
@@ -755,7 +506,7 @@
for (TopTerm tt : bigTerms.values()) {
// TODO: counts could be deferred if sorted==false
if (tt.termNum >= 0 && tt.termNum < numTermsInField) {
- final Term t = new Term(ti.field, tt.term);
+ final Term t = new Term(field, tt.term);
if (finfo.length == 0) {
counts[tt.termNum] = searcher.numDocs(new TermQuery(t), docs);
} else {
@@ -836,7 +587,6 @@
f.accumulateTermNum(i, value);
}
}
- te.close();
int c = missing.size();
allstats.addMissing(c);
@@ -870,23 +620,26 @@
}
/** may return a reused BytesRef */
- BytesRef getTermValue(NumberedTermsEnum te, int termNum) throws IOException {
+ BytesRef getTermValue(TermsEnum te, int termNum) throws IOException {
+ //System.out.println("getTermValue termNum=" + termNum + " this=" + this + " numTerms=" + numTermsInField);
if (bigTerms.size() > 0) {
// see if the term is one of our big terms.
TopTerm tt = bigTerms.get(termNum);
if (tt != null) {
+ //System.out.println(" return big " + tt.term);
return tt.term;
}
}
- return te.skipTo(termNum);
+ return lookupTerm(te, termNum);
}
@Override
public String toString() {
+ final long indexSize = indexedTermsArray == null ? 0 : (8+8+8+8+(indexedTermsArray.length<<3)+sizeOfIndexedStrings); // assume 8 byte references?
return "{field=" + field
+ ",memSize="+memSize()
- + ",tindexSize="+ti.memSize()
+ + ",tindexSize="+indexSize
+ ",time="+total_time
+ ",phase1="+phase1_time
+ ",nTerms="+numTermsInField
@@ -896,7 +649,6 @@
+ "}";
}
-
//////////////////////////////////////////////////////////////////
//////////////////////////// caching /////////////////////////////
//////////////////////////////////////////////////////////////////
@@ -920,287 +672,3 @@
return uif;
}
}
-
-
-// How to share TermDocs (int[] score[])???
-// Hot to share TermPositions?
-/***
-class TermEnumListener {
- void doTerm(Term t) {
- }
- void done() {
- }
-}
-***/
-
-
-class NumberedTermsEnum extends TermsEnum {
- protected final IndexReader reader;
- protected final TermIndex tindex;
- protected TermsEnum tenum;
- protected int pos=-1;
- protected BytesRef termText;
- protected DocsEnum docsEnum;
- protected Bits deletedDocs;
-
-
- NumberedTermsEnum(IndexReader reader, TermIndex tindex) throws IOException {
- this.reader = reader;
- this.tindex = tindex;
- }
-
-
- NumberedTermsEnum(IndexReader reader, TermIndex tindex, BytesRef termValue, int pos) throws IOException {
- this.reader = reader;
- this.tindex = tindex;
- this.pos = pos;
- Terms terms = MultiFields.getTerms(reader, tindex.field);
- deletedDocs = MultiFields.getDeletedDocs(reader);
- if (terms != null) {
- tenum = terms.iterator();
- tenum.seek(termValue);
- setTerm();
- }
- }
-
- @Override
- public Comparator<BytesRef> getComparator() throws IOException {
- return tenum.getComparator();
- }
-
- public DocsEnum getDocsEnum() throws IOException {
- docsEnum = tenum.docs(deletedDocs, docsEnum);
- return docsEnum;
- }
-
- protected BytesRef setTerm() throws IOException {
- termText = tenum.term();
- if (tindex.prefix != null && !termText.startsWith(tindex.prefix)) {
- termText = null;
- }
- return termText;
- }
-
- @Override
- public BytesRef next() throws IOException {
- pos++;
- if (tenum.next() == null) {
- termText = null;
- return null;
- }
- return setTerm(); // this is extra work if we know we are in bounds...
- }
-
- @Override
- public BytesRef term() {
- return termText;
- }
-
- @Override
- public int docFreq() throws IOException {
- return tenum.docFreq();
- }
-
- @Override
- public long totalTermFreq() throws IOException {
- return tenum.totalTermFreq();
- }
-
- public BytesRef skipTo(BytesRef target) throws IOException {
-
- // already here
- if (termText != null && termText.equals(target)) return termText;
-
- if (tenum == null) {
- return null;
- }
-
- int startIdx = Arrays.binarySearch(tindex.index,target);
-
- if (startIdx >= 0) {
- // we hit the term exactly... lucky us!
- TermsEnum.SeekStatus seekStatus = tenum.seek(target);
- assert seekStatus == TermsEnum.SeekStatus.FOUND;
- pos = startIdx << tindex.intervalBits;
- return setTerm();
- }
-
- // we didn't hit the term exactly
- startIdx=-startIdx-1;
-
- if (startIdx == 0) {
- // our target occurs *before* the first term
- TermsEnum.SeekStatus seekStatus = tenum.seek(target);
- assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
- pos = 0;
- return setTerm();
- }
-
- // back up to the start of the block
- startIdx--;
-
- if ((pos >> tindex.intervalBits) == startIdx && termText != null && termText.compareTo(target)<=0) {
- // we are already in the right block and the current term is before the term we want,
- // so we don't need to seek.
- } else {
- // seek to the right block
- TermsEnum.SeekStatus seekStatus = tenum.seek(tindex.index[startIdx]);
- assert seekStatus == TermsEnum.SeekStatus.FOUND;
- pos = startIdx << tindex.intervalBits;
- setTerm(); // should be non-null since it's in the index
- }
-
- while (termText != null && termText.compareTo(target) < 0) {
- next();
- }
-
- return termText;
- }
-
- public BytesRef skipTo(int termNumber) throws IOException {
- int delta = termNumber - pos;
- if (delta < 0 || delta > tindex.interval || tenum==null) {
- int idx = termNumber >>> tindex.intervalBits;
- BytesRef base = tindex.index[idx];
- pos = idx << tindex.intervalBits;
- delta = termNumber - pos;
- TermsEnum.SeekStatus seekStatus = tenum.seek(base);
- assert seekStatus == TermsEnum.SeekStatus.FOUND;
- }
- while (--delta >= 0) {
- BytesRef br = tenum.next();
- if (br == null) {
- termText = null;
- return null;
- }
- ++pos;
- }
- return setTerm();
- }
-
- protected void close() throws IOException {
- // no-op, needed so the anon subclass that does indexing
- // can build its index
- }
-
- /** The current term number, starting at 0.
- * Only valid if the previous call to next() or skipTo() returned true.
- */
- public int getTermNumber() {
- return pos;
- }
-
- @Override
- public long ord() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public SeekStatus seek(long ord) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public SeekStatus seek(BytesRef target, boolean useCache) {
- throw new UnsupportedOperationException();
- }
-}
-
-
-/**
- * Class to save memory by only storing every nth term (for random access), while
- * numbering the terms, allowing them to be retrieved later by number.
- * This is only valid when used with the IndexReader it was created with.
- * The IndexReader is not actually stored to facilitate caching by using it as a key in
- * a weak hash map.
- */
-class TermIndex {
- final static int intervalBits = 7; // decrease to a low number like 2 for testing
- final static int intervalMask = 0xffffffff >>> (32-intervalBits);
- final static int interval = 1 << intervalBits;
-
- final String field;
- final BytesRef prefix;
- BytesRef[] index;
- int nTerms;
- long sizeOfStrings;
-
- TermIndex(String field) {
- this(field, null);
- }
-
- TermIndex(String field, String prefix) {
- this.field = field;
- this.prefix = prefix == null ? null : new BytesRef(prefix);
- }
-
- NumberedTermsEnum getEnumerator(IndexReader reader, int termNumber) throws IOException {
- NumberedTermsEnum te = new NumberedTermsEnum(reader, this);
- te.skipTo(termNumber);
- return te;
- }
-
- /* The first time an enumerator is requested, it should be used
- with next() to fully traverse all of the terms so the index
- will be built.
- */
- NumberedTermsEnum getEnumerator(IndexReader reader) throws IOException {
- if (index==null) return new NumberedTermsEnum(reader,this, prefix==null?new BytesRef():prefix, 0) {
- ArrayList<BytesRef> lst;
- PagedBytes bytes;
-
- @Override
- protected BytesRef setTerm() throws IOException {
- BytesRef br = super.setTerm();
- if (br != null && (pos & intervalMask)==0) {
- sizeOfStrings += br.length;
- if (lst==null) {
- lst = new ArrayList<BytesRef>();
- bytes = new PagedBytes(15);
- }
- BytesRef out = new BytesRef();
- bytes.copy(br, out);
- lst.add(out);
- }
- return br;
- }
-
- @Override
- public BytesRef skipTo(int termNumber) throws IOException {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public void close() throws IOException {
- nTerms=pos;
- super.close();
- index = lst!=null ? lst.toArray(new BytesRef[lst.size()]) : new BytesRef[0];
- }
- };
- else return new NumberedTermsEnum(reader,this,new BytesRef(),0);
- }
-
-
- /**
- * Returns the approximate amount of memory taken by this TermIndex.
- * This is only an approximation and doesn't take into account java object overhead.
- *
- * @return
- * the approximate memory consumption in bytes
- */
- public long memSize() {
- // assume 8 byte references?
- return 8+8+8+8+(index.length<<3)+sizeOfStrings;
- }
-}
-
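
Both the removed TermIndex above and the new OrdWrappedTermsEnum in DocTermOrds lean on the same arithmetic: store every 128th term, shift the ord right by INDEX_INTERVAL_BITS to pick the block, seek to that block's first term, then scan forward. A condensed sketch (not part of the patch; names are illustrative):

import java.io.IOException;

import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

class OrdLookupSketch {
  // indexedTerms holds every 128th term in sorted order, like the removed
  // TermIndex.index above and the new DocTermOrds.indexedTermsArray.
  static BytesRef termForOrd(BytesRef[] indexedTerms, TermsEnum tenum, long targetOrd) throws IOException {
    final int intervalBits = 7;                          // INDEX_INTERVAL = 1 << 7 = 128
    final int idx = (int) (targetOrd >>> intervalBits);  // which indexed block the ord falls in
    tenum.seek(indexedTerms[idx], true);                 // jump straight to the block's first term
    long ord = idx << intervalBits;
    while (ord < targetOrd) {                            // then scan forward at most 127 terms
      if (tenum.next() == null) {
        return null;                                     // targetOrd is past the last term
      }
      ord++;
    }
    return tenum.term();
  }
}
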
Index: solr/src/test/org/apache/solr/request/TestFaceting.java
--- solr/src/test/org/apache/solr/request/TestFaceting.java Tue Mar 29 18:45:54 2011 +0000
+++ solr/src/test/org/apache/solr/request/TestFaceting.java Tue Mar 29 15:49:08 2011 -0400
@@ -17,14 +17,17 @@
package org.apache.solr.request;
+import java.util.Locale;
+import java.util.Random;
+
+import org.apache.lucene.index.DocTermOrds;
import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.After;
import org.junit.BeforeClass;
import org.junit.Test;
-import java.util.Locale;
-import java.util.Random;
/**
* @version $Id$
@@ -62,43 +65,51 @@
}
void doTermEnum(int size) throws Exception {
+ //System.out.println("doTermEnum size=" + size);
close();
createIndex(size);
req = lrf.makeRequest("q","*:*");
- TermIndex ti = new TermIndex(proto.field());
- NumberedTermsEnum te = ti.getEnumerator(req.getSearcher().getIndexReader());
+ UnInvertedField uif = new UnInvertedField(proto.field(), req.getSearcher());
- // iterate through first
- while(te.term() != null) te.next();
- assertEquals(size, te.getTermNumber());
- te.close();
+ assertEquals(size, uif.getNumTerms());
- te = ti.getEnumerator(req.getSearcher().getIndexReader());
+ TermsEnum te = uif.getOrdTermsEnum(req.getSearcher().getIndexReader());
+ assertEquals(size == 0, te == null);
Random r = new Random(size);
// test seeking by term string
- for (int i=0; i<size*2+10; i++) {
- assertEquals(size>0, te.skipTo(new BytesRef("000")) != null);
- assertEquals(0, te.getTermNumber());
if (size>0) {
+ assertEquals(size>0, te.seek(new BytesRef("000"), true) != TermsEnum.SeekStatus.END);
+ assertEquals(0, te.ord());
assertEquals(t(0), te.term().utf8ToString());
- } else {
- assertEquals(null, te.term());
}
if (size>0) {
@@ -106,9 +117,10 @@
for (int i=0; i<size*2+10; i++) {