Index: lucene/src/java/org/apache/lucene/index/DocTermOrds.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/index/DocTermOrds.java Tue Mar 29 15:49:08 2011 -0400 @@ -0,0 +1,745 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.index; + +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Bits; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Comparator; + +// nocommit TODO +// - jdoc explain int vs long ord +// - allow specifying interval +// - jdoc how del docs handled +// - jdoc 2.1B unique term limit +// - should we pull in this "emulated terms index" +// here...? +// - NOTE that doc's ords become sorted, ie, they are not +// in order + +/** + * + * Final form of the un-inverted field: + * Each document points to a list of term numbers that are contained in that document. + * + * Term numbers are in sorted order, and are encoded as variable-length deltas from the + * previous term number. Real term numbers start at 2 since 0 and 1 are reserved. A + * term number of 0 signals the end of the termNumber list. + * + * There is a single int[maxDoc()] which either contains a pointer into a byte[] for + * the termNumber lists, or directly contains the termNumber list if it fits in the 4 + * bytes of an integer. If the first byte in the integer is 1, the next 3 bytes + * are a pointer into a byte[] where the termNumber list starts. + * + * There are actually 256 byte arrays, to compensate for the fact that the pointers + * into the byte arrays are only 3 bytes long. The correct byte array for a document + * is a function of its id. + * + * To save space and speed up faceting, any term that matches enough documents will + * not be un-inverted... it will be skipped while building the un-inverted field structure, + * and will use a set intersection method during faceting. + * + * To further save memory, the terms (the actual string values) are not all stored in + * memory, but an internal term index is used to convert term numbers to term values only + * for the terms needed after faceting has completed. Only every 128th term value + * is stored, along with its corresponding term number, and this is used as an + * index to find the closest term and iterate until the desired number is hit (very + * much like Lucene's own internal term index). 
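(Editor's note: the snippet below is not part of the patch. It is a minimal sketch of how the per-document layout just described is interpreted, mirroring what TermOrdsIterator.reset() does further down in this file; the method name describeEntry is an assumed, hypothetical name, and it would live inside DocTermOrds so that index and tnums are in scope.)

    // Hypothetical helper showing how index[docID] is interpreted (mirrors TermOrdsIterator.reset()).
    private void describeEntry(int docID) {
      final int code = index[docID];
      if ((code & 0xff) == 1) {
        // Low byte == 1: the upper 24 bits are an offset into one of the 256 byte
        // arrays; bits 16-23 of the doc id pick which array holds this doc's list.
        final int whichArray = (docID >>> 16) & 0xff;
        final byte[] arr = tnums[whichArray];
        final int start = code >>> 8;
        System.out.println("doc " + docID + ": list in tnums[" + whichArray + "] at offset " + start + " (array length " + arr.length + ")");
      } else {
        // Otherwise the int itself holds the (short) delta-coded term number list.
        System.out.println("doc " + docID + ": list inlined as 0x" + Integer.toHexString(code));
      }
    }
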
+ * + * @lucene.experimental + */ + +public class DocTermOrds { + + // Term ords are shifted by this, internally, to reserve + // values 0 (end term) and 1 (index is a pointer into byte array) + private final static int TNUM_OFFSET = 2; + public final static int INDEX_INTERVAL_BITS = 7; // decrease to a low number like 2 for testing + public final static int INDEX_INTERVAL_MASK = 0xffffffff >>> (32-INDEX_INTERVAL_BITS); + public final static int INDEX_INTERVAL = 1 << INDEX_INTERVAL_BITS; + + protected final int maxTermDocFreq; + + protected final String field; + + protected int numTermsInField; + protected long termInstances; // total number of references to term numbers + private long memsz; + protected int total_time; // total time to uninvert the field + protected int phase1_time; // time for phase1 of the uninvert process + + protected int[] index; + protected byte[][] tnums = new byte[256][]; + protected long sizeOfIndexedStrings; + protected BytesRef[] indexedTermsArray; + protected BytesRef prefix; + protected int ordBase; + + public long ramUsedInBytes() { + // can cache the mem size since it shouldn't change + if (memsz!=0) return memsz; + long sz = 8*8 + 32; // local fields + if (index != null) sz += index.length * 4; + if (tnums!=null) { + for (byte[] arr : tnums) + if (arr != null) sz += arr.length; + } + memsz = sz; + return sz; + } + + public DocTermOrds(IndexReader reader, String field) throws IOException { + this(reader, field, null, Integer.MAX_VALUE); + } + + public DocTermOrds(IndexReader reader, String field, BytesRef termPrefix) throws IOException { + this(reader, field, termPrefix, Integer.MAX_VALUE); + } + + /** If a term's docFreq is > maxTermDocFreq, then it's skipped. */ + public DocTermOrds(IndexReader reader, String field, BytesRef termPrefix, int maxTermDocFreq) throws IOException { + this(field, maxTermDocFreq); + uninvert(reader, termPrefix); + } + + /** Subclass inits w/ this, but be sure you then call + * uninvert! */ + protected DocTermOrds(String field, int maxTermDocFreq) throws IOException { + this.field = field; + this.maxTermDocFreq = maxTermDocFreq; + } + + /** Returns a TermsEnum that implements ord. If the + * provided reader supports ord, we just return its + * TermsEnum; if it does not, we build a "private" terms + * index internally (WARNING: consumes RAM) and use that + * index to implement ord. This also enables ord on top + * of a composite reader. The returned TermsEnum is + * unpositioned. Returns null if there are no terms. + * + *

NOTE: you must pass the same reader that was + * used when creating this class */ + public TermsEnum getOrdTermsEnum(IndexReader reader) throws IOException { + if (indexedTermsArray == null) { + //System.out.println("GET normal enum"); + // nocommit -- is this wrong for the prefix case? + final Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + return terms.iterator(); + } else { + return null; + } + } else if (termInstances > 0) { + //System.out.println("GET wrapped enum"); + return new OrdWrappedTermsEnum(reader); + } else { + return null; + } + } + + /** Subclass can override this */ + protected void visitTerm(TermsEnum te, int termNum) throws IOException { + } + + protected void setActualDocFreq(int termNum, int df) throws IOException { + } + + // Call this only once (if you subclass!) + protected void uninvert(final IndexReader reader, final BytesRef termPrefix) throws IOException { + //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix); + final long startTime = System.currentTimeMillis(); + prefix = termPrefix == null ? null : new BytesRef(termPrefix); + + final int maxDoc = reader.maxDoc(); + final int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number + final int[] lastTerm = new int[maxDoc]; // last term we saw for this document + final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts) + + final Terms terms = MultiFields.getTerms(reader, field); + if (terms == null) { + // No terms + return; + } + + final TermsEnum te = terms.iterator(); + final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef(); + if (te.seek(seekStart) == TermsEnum.SeekStatus.END) { + // No terms match + // nocommit must test this + return; + } + + // If we need our "term index wrapper", these will be + // init'd below: + List<BytesRef> indexedTerms = null; + PagedBytes indexedTermsBytes = null; + + boolean testedOrd = false; + + final Bits delDocs = MultiFields.getDeletedDocs(reader); + + // we need a minimum of 9 bytes, but round up to 12 since the space would + // be wasted with most allocators anyway. + byte[] tempArr = new byte[12]; + + // + // enumerate all terms, and build an intermediate form of the un-inverted field. + // + // During this intermediate form, every document has a (potential) byte[] + // and the int[maxDoc()] array either contains the termNumber list directly + // or the *end* offset of the termNumber list in its byte array (for faster + // appending and faster creation of the final form). + // + // idea... if things are too large while building, we could do a range of docs + // at a time (but it would be a fair amount slower to build) + // could also do ranges in parallel to take advantage of multiple CPUs + + // OPTIONAL: remap the largest df terms to the lowest 128 (single byte) + // values. This requires going over the field first to find the most + // frequent terms ahead of time. 
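(Editor's note: a concrete illustration of the per-document encoding built below, not part of the patch. Term numbers are delta-coded against the previous term number and shifted up by TNUM_OFFSET (2) so that a stored 0 can serve as the end-of-list marker; each shifted delta is then written as a vInt with the most significant 7-bit groups first, exactly as writeInt() does later in this file. A stand-alone encoder sketch, with encodeTermList as an assumed name, might look like:)

    static byte[] encodeTermList(int[] sortedTermNums) {
      final byte[] out = new byte[sortedTermNums.length * 5 + 1];
      int pos = 0;
      int last = 0;
      for (int termNum : sortedTermNums) {
        final int delta = termNum - last + 2;   // TNUM_OFFSET == 2
        last = termNum;
        // vInt: 7-bit groups, most significant first, high bit set on all but the last group
        for (int shift = 28; shift > 0; shift -= 7) {
          final int group = delta >>> shift;
          if (group != 0) {
            out[pos++] = (byte) ((group & 0x7f) | 0x80);
          }
        }
        out[pos++] = (byte) (delta & 0x7f);
      }
      out[pos++] = 0;                           // a delta of 0 terminates the list
      return java.util.Arrays.copyOf(out, pos);
    }

For example, sorted term numbers {3, 5, 9} become shifted deltas {5, 4, 6} and encode to the four bytes 5, 4, 6, 0, which is short enough to be stored directly in index[doc] rather than spilling into one of the tnums byte arrays.
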
+ + int termNum = 0; + DocsEnum docsEnum = null; + + // Loop begins with te positioned to first term (we call + // seek above): + for (;;) { + final BytesRef t = te.term(); + if (t == null || (termPrefix != null && !t.startsWith(termPrefix))) { + break; + } + //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum); + + if (!testedOrd) { + try { + ordBase = (int) te.ord(); + } catch (UnsupportedOperationException uoe) { + // Reader cannot provide ord support, so we wrap + // our own support by creating our own terms index: + indexedTerms = new ArrayList(); + indexedTermsBytes = new PagedBytes(15); + } + testedOrd = true; + } + + visitTerm(te, termNum); + + if (indexedTerms != null && (termNum & INDEX_INTERVAL_MASK) == 0) { + // Index this term + sizeOfIndexedStrings += t.length; + BytesRef indexedTerm = new BytesRef(); + indexedTermsBytes.copy(t, indexedTerm); + // TODO: really should 1) strip off useless suffix, + // and 2) use FST not array/PagedBytes + indexedTerms.add(indexedTerm); + } + + final int df = te.docFreq(); + if (df <= maxTermDocFreq) { + + docsEnum = te.docs(delDocs, docsEnum); + + final DocsEnum.BulkReadResult bulkResult = docsEnum.getBulkResult(); + + // dF, but takes deletions into account + int actualDF = 0; + + for (;;) { + int chunk = docsEnum.read(); + if (chunk <= 0) { + break; + } + //System.out.println(" chunk=" + chunk + " docs"); + + actualDF += chunk; + + for (int i=0; i>>=8; + } + // point at the end index in the byte[] + index[doc] = (endPos<<8) | 1; + bytes[doc] = tempArr; + tempArr = new byte[12]; + } + } + } + } + setActualDocFreq(termNum, actualDF); + } + + termNum++; + if (te.next() == null) { + break; + } + } + + numTermsInField = termNum; + + long midPoint = System.currentTimeMillis(); + + if (termInstances == 0) { + // we didn't invert anything + // lower memory consumption. + tnums = null; + } else { + + this.index = index; + + // + // transform intermediate form into the final form, building a single byte[] + // at a time, and releasing the intermediate byte[]s as we go to avoid + // increasing the memory footprint. + // + + for (int pass = 0; pass<256; pass++) { + byte[] target = tnums[pass]; + int pos=0; // end in target; + if (target != null) { + pos = target.length; + } else { + target = new byte[4096]; + } + + // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx + // where pp is the pass (which array we are building), and xx is all values. + // each pass shares the same byte[] for termNumber lists. + for (int docbase = pass<<16; docbase maxDoc) + break; + } + + if (indexedTerms != null) { + indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]); + } + } + + long endTime = System.currentTimeMillis(); + + total_time = (int)(endTime-startTime); + phase1_time = (int)(midPoint-startTime); + } + + /** Number of bytes to represent an unsigned int as a vint. 
*/ + private static int vIntSize(int x) { + if ((x & (0xffffffff << (7*1))) == 0 ) { + return 1; + } + if ((x & (0xffffffff << (7*2))) == 0 ) { + return 2; + } + if ((x & (0xffffffff << (7*3))) == 0 ) { + return 3; + } + if ((x & (0xffffffff << (7*4))) == 0 ) { + return 4; + } + return 5; + } + + // todo: if we know the size of the vInt already, we could do + // a single switch on the size + private static int writeInt(int x, byte[] arr, int pos) { + int a; + a = (x >>> (7*4)); + if (a != 0) { + arr[pos++] = (byte)(a | 0x80); + } + a = (x >>> (7*3)); + if (a != 0) { + arr[pos++] = (byte)(a | 0x80); + } + a = (x >>> (7*2)); + if (a != 0) { + arr[pos++] = (byte)(a | 0x80); + } + a = (x >>> (7*1)); + if (a != 0) { + arr[pos++] = (byte)(a | 0x80); + } + arr[pos++] = (byte)(x & 0x7f); + return pos; + } + + // nocommit -- test the 0 case (doc w/ no terms) + public class TermOrdsIterator { + private int tnum; + private int upto; + private byte[] arr; + + /** Buffer must be at least 5 ints long. Returns number + * of term ords placed into buffer; if this count is + * less than buffer.length then that is the end. */ + public int read(int[] buffer) { + int bufferUpto = 0; + if (arr == null) { + // code is inlined into upto + //System.out.println("inlined"); + int code = upto; + int delta = 0; + for (;;) { + delta = (delta << 7) | (code & 0x7f); + if ((code & 0x80)==0) { + if (delta==0) break; + tnum += delta - TNUM_OFFSET; + buffer[bufferUpto++] = ordBase+tnum; + //System.out.println(" tnum=" + tnum); + delta = 0; + } + code >>>= 8; + } + } else { + // code is a pointer + for(;;) { + int delta = 0; + for(;;) { + byte b = arr[upto++]; + delta = (delta << 7) | (b & 0x7f); + //System.out.println(" cycle: upto=" + upto + " delta=" + delta + " b=" + b); + if ((b & 0x80) == 0) break; + } + //System.out.println(" delta=" + delta); + if (delta == 0) break; + tnum += delta - TNUM_OFFSET; + //System.out.println(" tnum=" + tnum); + buffer[bufferUpto++] = ordBase+tnum; + if (bufferUpto == buffer.length) { + break; + } + } + } + + return bufferUpto; + } + + public TermOrdsIterator reset(int docID) { + //System.out.println(" reset docID=" + docID); + tnum = 0; + final int code = index[docID]; + if ((code & 0xff)==1) { + // a pointer + upto = code>>>8; + //System.out.println(" pointer! upto=" + upto); + int whichArray = (docID >>> 16) & 0xff; + arr = tnums[whichArray]; + } else { + //System.out.println(" inline!"); + arr = null; + upto = code; + } + return this; + } + } + + /** Returns an iterator to step through the term ords for + * this document. It's also possible to subclass this + * class and directly access members. */ + public TermOrdsIterator lookup(int doc, TermOrdsIterator reuse) { + final TermOrdsIterator ret; + if (reuse != null) { + ret = reuse; + } else { + ret = new TermOrdsIterator(); + } + return ret.reset(doc); + } + + /* Only used if original IndexReader doesn't implement + * ord; in this case we "wrap" our own terms index + * around it. 
*/ + private final class OrdWrappedTermsEnum extends TermsEnum { + private final IndexReader reader; + private final TermsEnum termsEnum; + private BytesRef term; + private long ord = -1; + + public OrdWrappedTermsEnum(IndexReader reader) throws IOException { + this.reader = reader; + assert indexedTermsArray != null; + termsEnum = MultiFields.getTerms(reader, field).iterator(); + } + + @Override + public Comparator getComparator() throws IOException { + return termsEnum.getComparator(); + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { + return termsEnum.docs(skipDocs, reuse); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { + return termsEnum.docsAndPositions(skipDocs, reuse); + } + + @Override + public BytesRef term() { + return term; + } + + @Override + public BytesRef next() throws IOException { + ord++; + if (termsEnum.next() == null) { + term = null; + return null; + } + return setTerm(); // this is extra work if we know we are in bounds... + } + + @Override + public int docFreq() throws IOException { + return termsEnum.docFreq(); + } + + @Override + public long totalTermFreq() throws IOException { + return termsEnum.totalTermFreq(); + } + + @Override + public long ord() throws IOException { + return ord; + } + + @Override + public SeekStatus seek(BytesRef target, boolean useCache) throws IOException { + + // already here + if (term != null && term.equals(target)) { + return SeekStatus.FOUND; + } + + int startIdx = Arrays.binarySearch(indexedTermsArray, target); + + if (startIdx >= 0) { + // we hit the term exactly... lucky us! + TermsEnum.SeekStatus seekStatus = termsEnum.seek(target); + assert seekStatus == TermsEnum.SeekStatus.FOUND; + ord = startIdx << INDEX_INTERVAL_BITS; + setTerm(); + assert term != null; + return SeekStatus.FOUND; + } + + // we didn't hit the term exactly + startIdx = -startIdx-1; + + if (startIdx == 0) { + // our target occurs *before* the first term + TermsEnum.SeekStatus seekStatus = termsEnum.seek(target); + assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND; + ord = 0; + setTerm(); + assert term != null; + return SeekStatus.NOT_FOUND; + } + + // back up to the start of the block + startIdx--; + + if ((ord >> INDEX_INTERVAL_BITS) == startIdx && term != null && term.compareTo(target) <= 0) { + // we are already in the right block and the current term is before the term we want, + // so we don't need to seek. 
+ } else { + // seek to the right block + TermsEnum.SeekStatus seekStatus = termsEnum.seek(indexedTermsArray[startIdx]); + assert seekStatus == TermsEnum.SeekStatus.FOUND; + ord = startIdx << INDEX_INTERVAL_BITS; + setTerm(); + assert term != null; // should be non-null since it's in the index + } + + while (term != null && term.compareTo(target) < 0) { + next(); + } + + if (term == null) { + return SeekStatus.END; + } else if (term.compareTo(target) == 0) { + return SeekStatus.FOUND; + } else { + return SeekStatus.NOT_FOUND; + } + } + + @Override + public SeekStatus seek(long targetOrd) throws IOException { + int delta = (int) (targetOrd - ord); + //System.out.println(" seek(ord) targetOrd=" + targetOrd + " delta=" + delta + " ord=" + ord); + if (delta < 0 || delta > INDEX_INTERVAL) { + final int idx = (int) (targetOrd >>> INDEX_INTERVAL_BITS); + final BytesRef base = indexedTermsArray[idx]; + //System.out.println(" do seek term=" + base.utf8ToString()); + ord = idx << INDEX_INTERVAL_BITS; + delta = (int) (targetOrd - ord); + final TermsEnum.SeekStatus seekStatus = termsEnum.seek(base, true); + assert seekStatus == TermsEnum.SeekStatus.FOUND; + } + + while (--delta >= 0) { + BytesRef br = termsEnum.next(); + if (br == null) { + term = null; + return null; + } + ord++; + } + + setTerm(); + assert term != null; + //System.out.println(" return term=" + term.utf8ToString()); + return SeekStatus.FOUND; + } + + private BytesRef setTerm() throws IOException { + term = termsEnum.term(); + if (prefix != null && !term.startsWith(prefix)) { + term = null; + } + return term; + } + } + + public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException { + TermsEnum.SeekStatus status = termsEnum.seek(ord); + assert status == TermsEnum.SeekStatus.FOUND; + return termsEnum.term(); + } +} Index: lucene/src/java/org/apache/lucene/index/IndexReader.java --- lucene/src/java/org/apache/lucene/index/IndexReader.java Tue Mar 29 18:45:54 2011 +0000 +++ lucene/src/java/org/apache/lucene/index/IndexReader.java Tue Mar 29 15:49:08 2011 -0400 @@ -919,6 +919,16 @@ } } + // nocommit jdoc + public static boolean indexExists(Directory directory, CodecProvider cp) throws IOException { + try { + new SegmentInfos().read(directory, cp); + return true; + } catch (IOException ioe) { + return false; + } + } + /** Returns the number of documents in this index. 
*/ public abstract int numDocs(); Index: lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java --- lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java Tue Mar 29 18:45:54 2011 +0000 +++ lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java Tue Mar 29 15:49:08 2011 -0400 @@ -181,7 +181,7 @@ System.out.println("RIW.getReader: open new reader"); } w.commit(); - return IndexReader.open(w.getDirectory(), new KeepOnlyLastCommitDeletionPolicy(), r.nextBoolean(), _TestUtil.nextInt(r, 1, 10)); + return IndexReader.open(w.getDirectory(), new KeepOnlyLastCommitDeletionPolicy(), r.nextBoolean(), _TestUtil.nextInt(r, 1, 10), w.getConfig().getCodecProvider()); } } Index: lucene/src/test-framework/org/apache/lucene/store/MockDirectoryWrapper.java --- lucene/src/test-framework/org/apache/lucene/store/MockDirectoryWrapper.java Tue Mar 29 18:45:54 2011 +0000 +++ lucene/src/test-framework/org/apache/lucene/store/MockDirectoryWrapper.java Tue Mar 29 15:49:08 2011 -0400 @@ -32,6 +32,7 @@ import java.util.Set; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -419,12 +420,27 @@ throw new RuntimeException("MockDirectoryWrapper: cannot close: there are still open files: " + openFiles, cause); } open = false; - if (checkIndexOnClose && IndexReader.indexExists(this)) { - _TestUtil.checkIndex(this); + if (checkIndexOnClose) { + if (codecProvider != null) { + if (IndexReader.indexExists(this, codecProvider)) { + _TestUtil.checkIndex(this, codecProvider); + } + } else { + if (IndexReader.indexExists(this)) { + _TestUtil.checkIndex(this); + } + } } delegate.close(); } + private CodecProvider codecProvider; + + // We pass this CodecProvider to checkIndex when dir is closed... + public void setCodecProvider(CodecProvider cp) { + codecProvider = cp; + } + boolean open = true; public synchronized boolean isOpen() { Index: lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java --- lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java Tue Mar 29 18:45:54 2011 +0000 +++ lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java Tue Mar 29 15:49:08 2011 -0400 @@ -157,6 +157,19 @@ return start + r.nextInt(end-start+1); } + public static String simpleRandomString(Random r) { + final int end = r.nextInt(10); + if (end == 0) { + // allow 0 length + return ""; + } + final char[] buffer = new char[end]; + for (int i = 0; i < end; i++) { + buffer[i] = (char) _TestUtil.nextInt(r, 97, 102); + } + return new String(buffer, 0, end); + } + /** Returns random string, including full unicode range. */ public static String randomUnicodeString(Random r) { return randomUnicodeString(r, 20); Index: lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java Tue Mar 29 15:49:08 2011 -0400 @@ -0,0 +1,358 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; +import org.apache.lucene.index.DocTermOrds.TermOrdsIterator; +import org.apache.lucene.index.codecs.BlockTermsReader; +import org.apache.lucene.index.codecs.BlockTermsWriter; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.CoreCodecProvider; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.FixedGapTermsIndexReader; +import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter; +import org.apache.lucene.index.codecs.PostingsReaderBase; +import org.apache.lucene.index.codecs.PostingsWriterBase; +import org.apache.lucene.index.codecs.TermsIndexReaderBase; +import org.apache.lucene.index.codecs.TermsIndexWriterBase; +import org.apache.lucene.index.codecs.standard.StandardPostingsReader; +import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.MockDirectoryWrapper; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +// TODO: +// - test w/ del docs +// - test prefix +// - test w/ cutoff +// - crank docs way up so we get some merging sometimes + +public class TestDocTermOrds extends LuceneTestCase { + + public void testSimple() throws Exception { + Directory dir = newDirectory(); + final RandomIndexWriter w = new RandomIndexWriter(random, dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(newInOrderLogMergePolicy())); + Document doc = new Document(); + Field field = newField("field", "", Field.Index.ANALYZED); + doc.add(field); + field.setValue("a b c"); + w.addDocument(doc); + + field.setValue("d e f"); + w.addDocument(doc); + + field.setValue("a f"); + w.addDocument(doc); + + final IndexReader r = w.getReader(); + w.close(); + + final DocTermOrds dto = new DocTermOrds(r, "field"); + + TermOrdsIterator iter = dto.lookup(0, null); + final int[] buffer = new int[5]; + assertEquals(3, iter.read(buffer)); + assertEquals(0, buffer[0]); + assertEquals(1, buffer[1]); + assertEquals(2, buffer[2]); + + iter = dto.lookup(1, iter); + assertEquals(3, iter.read(buffer)); + assertEquals(3, buffer[0]); + assertEquals(4, buffer[1]); + assertEquals(5, buffer[2]); + + iter = dto.lookup(2, iter); + assertEquals(2, iter.read(buffer)); + assertEquals(0, buffer[0]); + assertEquals(5, buffer[1]); + + r.close(); + dir.close(); + } + + private static class StandardCodecWithOrds extends Codec { + public StandardCodecWithOrds() { + name = "StandardOrds"; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + PostingsWriterBase docs = new StandardPostingsWriter(state); + + // TODO: 
should we make the terms index more easily + // pluggable? Ie so that this codec would record which + // index impl was used, and switch on loading? + // Or... you must make a new Codec for this? + TermsIndexWriterBase indexWriter; + boolean success = false; + try { + indexWriter = new FixedGapTermsIndexWriter(state); + success = true; + } finally { + if (!success) { + docs.close(); + } + } + + success = false; + try { + FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs); + success = true; + return ret; + } finally { + if (!success) { + try { + docs.close(); + } finally { + indexWriter.close(); + } + } + } + } + + public final static int TERMS_CACHE_SIZE = 1024; + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + PostingsReaderBase postings = new StandardPostingsReader(state.dir, state.segmentInfo, state.readBufferSize, state.codecId); + TermsIndexReaderBase indexReader; + + boolean success = false; + try { + indexReader = new FixedGapTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + BytesRef.getUTF8SortedAsUnicodeComparator(), + state.codecId); + success = true; + } finally { + if (!success) { + postings.close(); + } + } + + success = false; + try { + FieldsProducer ret = new BlockTermsReader(indexReader, + state.dir, + state.fieldInfos, + state.segmentInfo.name, + postings, + state.readBufferSize, + TERMS_CACHE_SIZE, + state.codecId); + success = true; + return ret; + } finally { + if (!success) { + try { + postings.close(); + } finally { + indexReader.close(); + } + } + } + } + + /** Extension of freq postings file */ + static final String FREQ_EXTENSION = "frq"; + + /** Extension of prox postings file */ + static final String PROX_EXTENSION = "prx"; + + @Override + public void files(Directory dir, SegmentInfo segmentInfo, String id, Set files) throws IOException { + StandardPostingsReader.files(dir, segmentInfo, id, files); + BlockTermsReader.files(dir, segmentInfo, id, files); + FixedGapTermsIndexReader.files(dir, segmentInfo, id, files); + } + + @Override + public void getExtensions(Set extensions) { + getStandardExtensions(extensions); + } + + public static void getStandardExtensions(Set extensions) { + extensions.add(FREQ_EXTENSION); + extensions.add(PROX_EXTENSION); + BlockTermsReader.getExtensions(extensions); + FixedGapTermsIndexReader.getIndexExtensions(extensions); + } + } + + public void testRandom() throws Exception { + MockDirectoryWrapper dir = newDirectory(); + + final int NUM_TERMS = 100 * RANDOM_MULTIPLIER; + final Set terms = new HashSet(); + while(terms.size() < NUM_TERMS) { + // nocommit + //final String s = _TestUtil.randomRealisticUnicodeString(random); + final String s = _TestUtil.simpleRandomString(random); + if (s.length() > 0) { + terms.add(new BytesRef(s)); + } + } + final BytesRef[] termsArray = terms.toArray(new BytesRef[terms.size()]); + Arrays.sort(termsArray); + + final int NUM_DOCS = 1000 * RANDOM_MULTIPLIER; + + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()); + + // Sometimes swap in codec that impls ord(): + if (random.nextInt(10) == 7) { + // Make sure terms index has ords: + CoreCodecProvider cp = new CoreCodecProvider(); + cp.register(new StandardCodecWithOrds()); + cp.setDefaultFieldCodec("StandardOrds"); + + // So checkIndex on close works + dir.setCodecProvider(cp); + conf.setCodecProvider(cp); + } + + final RandomIndexWriter w = new RandomIndexWriter(random, dir, conf); + + final int[][] 
idToOrds = new int[NUM_DOCS][]; + final Set ordsForDocSet = new HashSet(); + + for(int id=0;id (1<<24)*.9) { - SolrCore.log.warn("Approaching too many values for UnInvertedField faceting on field '"+field+"' : bucket size=" + target.length); - } - } - - tnums[pass] = target; - - if ((pass << 16) > maxDoc) - break; - } } - long endTime = System.currentTimeMillis(); - - total_time = (int)(endTime-startTime); - phase1_time = (int)(midPoint-startTime); - SolrCore.log.info("UnInverted multi-valued field " + toString()); + //System.out.println("CREATED: " + toString() + " ti.index=" + ti.index); } - - + public int getNumTerms() { + return numTermsInField; + } public NamedList getCounts(SolrIndexSearcher searcher, DocSet baseDocs, int offset, int limit, Integer mincount, boolean missing, String sort, String prefix) throws IOException { use.incrementAndGet(); @@ -468,6 +209,7 @@ int baseSize = docs.size(); int maxDoc = searcher.maxDoc(); + //System.out.println("GET COUNTS field=" + field + " baseSize=" + baseSize + " minCount=" + mincount + " maxDoc=" + maxDoc + " numTermsInField=" + numTermsInField); if (baseSize >= mincount) { final int[] index = this.index; @@ -481,14 +223,15 @@ int startTerm = 0; int endTerm = numTermsInField; // one past the end - NumberedTermsEnum te = ti.getEnumerator(searcher.getIndexReader()); + TermsEnum te = getOrdTermsEnum(searcher.getIndexReader()); + //System.out.println("GOT enum " + te); if (prefix != null && prefix.length() > 0) { BytesRef prefixBr = new BytesRef(prefix); - te.skipTo(prefixBr); - startTerm = te.getTermNumber(); + te.seek(prefixBr, true); + startTerm = (int) te.ord(); prefixBr.append(ByteUtils.bigTerm); - te.skipTo(prefixBr); - endTerm = te.getTermNumber(); + te.seek(prefixBr, true); + endTerm = (int) te.ord(); } /*********** @@ -514,13 +257,18 @@ docs = new BitDocSet(bs, maxDoc - baseSize); // simply negating will mean that we have deleted docs in the set. // that should be OK, as their entries in our table should be empty. 
+ //System.out.println(" NEG"); } // For the biggest terms, do straight set intersections for (TopTerm tt : bigTerms.values()) { + //System.out.println(" do big termNum=" + tt.termNum + " term=" + tt.term.utf8ToString()); // TODO: counts could be deferred if sorted==false if (tt.termNum >= startTerm && tt.termNum < endTerm) { - counts[tt.termNum] = searcher.numDocs(new TermQuery(new Term(ti.field, tt.term)), docs); + counts[tt.termNum] = searcher.numDocs(new TermQuery(new Term(field, tt.term)), docs); + //System.out.println(" count=" + counts[tt.termNum]); + } else { + //System.out.println("SKIP term=" + tt.termNum); } } @@ -540,6 +288,7 @@ int code = index[doc]; if ((code & 0xff)==1) { + //System.out.println(" ptr"); int pos = code>>>8; int whichArray = (doc >>> 16) & 0xff; byte[] arr = tnums[whichArray]; @@ -553,9 +302,11 @@ } if (delta == 0) break; tnum += delta - TNUM_OFFSET; + //System.out.println(" tnum=" + tnum); counts[tnum]++; } } else { + //System.out.println(" inlined"); int tnum = 0; int delta = 0; for (;;) { @@ -563,6 +314,7 @@ if ((code & 0x80)==0) { if (delta==0) break; tnum += delta - TNUM_OFFSET; + //System.out.println(" tnum=" + tnum); counts[tnum]++; delta = 0; } @@ -668,8 +420,6 @@ res.add(label, c); } } - - te.close(); } @@ -678,6 +428,8 @@ res.add(null, SimpleFacets.getFieldMissingCount(searcher, baseDocs, field)); } + //System.out.println(" res=" + res); + return res; } @@ -731,8 +483,7 @@ final int[] index = this.index; final int[] counts = new int[numTermsInField];//keep track of the number of times we see each word in the field for all the documents in the docset - NumberedTermsEnum te = ti.getEnumerator(searcher.getIndexReader()); - + TermsEnum te = getOrdTermsEnum(searcher.getIndexReader()); boolean doNegative = false; if (finfo.length == 0) { @@ -755,7 +506,7 @@ for (TopTerm tt : bigTerms.values()) { // TODO: counts could be deferred if sorted==false if (tt.termNum >= 0 && tt.termNum < numTermsInField) { - final Term t = new Term(ti.field, tt.term); + final Term t = new Term(field, tt.term); if (finfo.length == 0) { counts[tt.termNum] = searcher.numDocs(new TermQuery(t), docs); } else { @@ -836,7 +587,6 @@ f.accumulateTermNum(i, value); } } - te.close(); int c = missing.size(); allstats.addMissing(c); @@ -870,23 +620,26 @@ } /** may return a reused BytesRef */ - BytesRef getTermValue(NumberedTermsEnum te, int termNum) throws IOException { + BytesRef getTermValue(TermsEnum te, int termNum) throws IOException { + //System.out.println("getTermValue termNum=" + termNum + " this=" + this + " numTerms=" + numTermsInField); if (bigTerms.size() > 0) { // see if the term is one of our big terms. TopTerm tt = bigTerms.get(termNum); if (tt != null) { + //System.out.println(" return big " + tt.term); return tt.term; } } - return te.skipTo(termNum); + return lookupTerm(te, termNum); } @Override public String toString() { + final long indexSize = indexedTermsArray == null ? 0 : (8+8+8+8+(indexedTermsArray.length<<3)+sizeOfIndexedStrings); // assume 8 byte references? return "{field=" + field + ",memSize="+memSize() - + ",tindexSize="+ti.memSize() + + ",tindexSize="+indexSize + ",time="+total_time + ",phase1="+phase1_time + ",nTerms="+numTermsInField @@ -896,7 +649,6 @@ + "}"; } - ////////////////////////////////////////////////////////////////// //////////////////////////// caching ///////////////////////////// ////////////////////////////////////////////////////////////////// @@ -920,287 +672,3 @@ return uif; } } - - -// How to share TermDocs (int[] score[])??? 
-// Hot to share TermPositions? -/*** -class TermEnumListener { - void doTerm(Term t) { - } - void done() { - } -} -***/ - - -class NumberedTermsEnum extends TermsEnum { - protected final IndexReader reader; - protected final TermIndex tindex; - protected TermsEnum tenum; - protected int pos=-1; - protected BytesRef termText; - protected DocsEnum docsEnum; - protected Bits deletedDocs; - - - NumberedTermsEnum(IndexReader reader, TermIndex tindex) throws IOException { - this.reader = reader; - this.tindex = tindex; - } - - - NumberedTermsEnum(IndexReader reader, TermIndex tindex, BytesRef termValue, int pos) throws IOException { - this.reader = reader; - this.tindex = tindex; - this.pos = pos; - Terms terms = MultiFields.getTerms(reader, tindex.field); - deletedDocs = MultiFields.getDeletedDocs(reader); - if (terms != null) { - tenum = terms.iterator(); - tenum.seek(termValue); - setTerm(); - } - } - - @Override - public Comparator getComparator() throws IOException { - return tenum.getComparator(); - } - - public DocsEnum getDocsEnum() throws IOException { - docsEnum = tenum.docs(deletedDocs, docsEnum); - return docsEnum; - } - - protected BytesRef setTerm() throws IOException { - termText = tenum.term(); - if (tindex.prefix != null && !termText.startsWith(tindex.prefix)) { - termText = null; - } - return termText; - } - - @Override - public BytesRef next() throws IOException { - pos++; - if (tenum.next() == null) { - termText = null; - return null; - } - return setTerm(); // this is extra work if we know we are in bounds... - } - - @Override - public BytesRef term() { - return termText; - } - - @Override - public int docFreq() throws IOException { - return tenum.docFreq(); - } - - @Override - public long totalTermFreq() throws IOException { - return tenum.totalTermFreq(); - } - - public BytesRef skipTo(BytesRef target) throws IOException { - - // already here - if (termText != null && termText.equals(target)) return termText; - - if (tenum == null) { - return null; - } - - int startIdx = Arrays.binarySearch(tindex.index,target); - - if (startIdx >= 0) { - // we hit the term exactly... lucky us! - TermsEnum.SeekStatus seekStatus = tenum.seek(target); - assert seekStatus == TermsEnum.SeekStatus.FOUND; - pos = startIdx << tindex.intervalBits; - return setTerm(); - } - - // we didn't hit the term exactly - startIdx=-startIdx-1; - - if (startIdx == 0) { - // our target occurs *before* the first term - TermsEnum.SeekStatus seekStatus = tenum.seek(target); - assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND; - pos = 0; - return setTerm(); - } - - // back up to the start of the block - startIdx--; - - if ((pos >> tindex.intervalBits) == startIdx && termText != null && termText.compareTo(target)<=0) { - // we are already in the right block and the current term is before the term we want, - // so we don't need to seek. 
- } else { - // seek to the right block - TermsEnum.SeekStatus seekStatus = tenum.seek(tindex.index[startIdx]); - assert seekStatus == TermsEnum.SeekStatus.FOUND; - pos = startIdx << tindex.intervalBits; - setTerm(); // should be non-null since it's in the index - } - - while (termText != null && termText.compareTo(target) < 0) { - next(); - } - - return termText; - } - - public BytesRef skipTo(int termNumber) throws IOException { - int delta = termNumber - pos; - if (delta < 0 || delta > tindex.interval || tenum==null) { - int idx = termNumber >>> tindex.intervalBits; - BytesRef base = tindex.index[idx]; - pos = idx << tindex.intervalBits; - delta = termNumber - pos; - TermsEnum.SeekStatus seekStatus = tenum.seek(base); - assert seekStatus == TermsEnum.SeekStatus.FOUND; - } - while (--delta >= 0) { - BytesRef br = tenum.next(); - if (br == null) { - termText = null; - return null; - } - ++pos; - } - return setTerm(); - } - - protected void close() throws IOException { - // no-op, needed so the anon subclass that does indexing - // can build its index - } - - /** The current term number, starting at 0. - * Only valid if the previous call to next() or skipTo() returned true. - */ - public int getTermNumber() { - return pos; - } - - @Override - public long ord() { - throw new UnsupportedOperationException(); - } - - @Override - public SeekStatus seek(long ord) { - throw new UnsupportedOperationException(); - } - - @Override - public DocsEnum docs(Bits skipDocs, DocsEnum reuse) { - throw new UnsupportedOperationException(); - } - - @Override - public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) { - throw new UnsupportedOperationException(); - } - - @Override - public SeekStatus seek(BytesRef target, boolean useCache) { - throw new UnsupportedOperationException(); - } -} - - -/** - * Class to save memory by only storing every nth term (for random access), while - * numbering the terms, allowing them to be retrieved later by number. - * This is only valid when used with the IndexReader it was created with. - * The IndexReader is not actually stored to facilitate caching by using it as a key in - * a weak hash map. - */ -class TermIndex { - final static int intervalBits = 7; // decrease to a low number like 2 for testing - final static int intervalMask = 0xffffffff >>> (32-intervalBits); - final static int interval = 1 << intervalBits; - - final String field; - final BytesRef prefix; - BytesRef[] index; - int nTerms; - long sizeOfStrings; - - TermIndex(String field) { - this(field, null); - } - - TermIndex(String field, String prefix) { - this.field = field; - this.prefix = prefix == null ? null : new BytesRef(prefix); - } - - NumberedTermsEnum getEnumerator(IndexReader reader, int termNumber) throws IOException { - NumberedTermsEnum te = new NumberedTermsEnum(reader, this); - te.skipTo(termNumber); - return te; - } - - /* The first time an enumerator is requested, it should be used - with next() to fully traverse all of the terms so the index - will be built. 
- */ - NumberedTermsEnum getEnumerator(IndexReader reader) throws IOException { - if (index==null) return new NumberedTermsEnum(reader,this, prefix==null?new BytesRef():prefix, 0) { - ArrayList lst; - PagedBytes bytes; - - @Override - protected BytesRef setTerm() throws IOException { - BytesRef br = super.setTerm(); - if (br != null && (pos & intervalMask)==0) { - sizeOfStrings += br.length; - if (lst==null) { - lst = new ArrayList(); - bytes = new PagedBytes(15); - } - BytesRef out = new BytesRef(); - bytes.copy(br, out); - lst.add(out); - } - return br; - } - - @Override - public BytesRef skipTo(int termNumber) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public void close() throws IOException { - nTerms=pos; - super.close(); - index = lst!=null ? lst.toArray(new BytesRef[lst.size()]) : new BytesRef[0]; - } - }; - else return new NumberedTermsEnum(reader,this,new BytesRef(),0); - } - - - /** - * Returns the approximate amount of memory taken by this TermIndex. - * This is only an approximation and doesn't take into account java object overhead. - * - * @return - * the approximate memory consumption in bytes - */ - public long memSize() { - // assume 8 byte references? - return 8+8+8+8+(index.length<<3)+sizeOfStrings; - } -} - Index: solr/src/test/org/apache/solr/request/TestFaceting.java --- solr/src/test/org/apache/solr/request/TestFaceting.java Tue Mar 29 18:45:54 2011 +0000 +++ solr/src/test/org/apache/solr/request/TestFaceting.java Tue Mar 29 15:49:08 2011 -0400 @@ -17,14 +17,17 @@ package org.apache.solr.request; +import java.util.Locale; +import java.util.Random; + +import org.apache.lucene.index.DocTermOrds; import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; import org.apache.solr.SolrTestCaseJ4; import org.junit.After; import org.junit.BeforeClass; import org.junit.Test; -import java.util.Locale; -import java.util.Random; /** * @version $Id$ @@ -62,43 +65,51 @@ } void doTermEnum(int size) throws Exception { + //System.out.println("doTermEnum size=" + size); close(); createIndex(size); req = lrf.makeRequest("q","*:*"); - TermIndex ti = new TermIndex(proto.field()); - NumberedTermsEnum te = ti.getEnumerator(req.getSearcher().getIndexReader()); + UnInvertedField uif = new UnInvertedField(proto.field(), req.getSearcher()); - // iterate through first - while(te.term() != null) te.next(); - assertEquals(size, te.getTermNumber()); - te.close(); + assertEquals(size, uif.getNumTerms()); - te = ti.getEnumerator(req.getSearcher().getIndexReader()); + TermsEnum te = uif.getOrdTermsEnum(req.getSearcher().getIndexReader()); + assertEquals(size == 0, te == null); Random r = new Random(size); // test seeking by term string for (int i=0; i0) { + assertEquals(size>0, te.seek(new BytesRef("000"), true) != TermsEnum.SeekStatus.END); + assertEquals(0, te.ord()); assertEquals(t(0), te.term().utf8ToString()); - } else { - assertEquals(null, te.term()); } if (size>0) { @@ -106,9 +117,10 @@ for (int i=0; i
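(Editor's note: an end-to-end usage sketch, not part of the patch, pulling the new API together. It follows TestDocTermOrds.testSimple above; printDocTerms is an assumed, hypothetical name, and the caller is assumed to hold an open IndexReader. Needed imports: org.apache.lucene.index.*, org.apache.lucene.util.BytesRef, java.io.IOException.)

    // Un-invert a field once, then walk one document's term ords and map them back to terms.
    static void printDocTerms(IndexReader reader, String field, int docID) throws IOException {
      final DocTermOrds dto = new DocTermOrds(reader, field);       // builds the un-inverted structure
      final TermsEnum te = dto.getOrdTermsEnum(reader);             // ord-capable enum; null if the field has no terms
      if (te == null) {
        return;
      }
      DocTermOrds.TermOrdsIterator iter = dto.lookup(docID, null);  // pass a previous iterator here to reuse it
      final int[] buffer = new int[5];                              // read() requires a buffer of at least 5 ints
      int count;
      do {
        count = iter.read(buffer);
        for (int i = 0; i < count; i++) {
          final BytesRef term = dto.lookupTerm(te, buffer[i]);      // resolve ord -> term bytes
          System.out.println(buffer[i] + " -> " + term.utf8ToString());
        }
      } while (count == buffer.length);                             // a short read signals the end of the doc's list
    }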