Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java
--- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java	Thu Apr 14 15:39:03 2011 -0400
+++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java	Thu Apr 14 18:48:02 2011 -0400
@@ -44,7 +44,7 @@
   }

   @Override
-  public SeekStatus seek(BytesRef text, boolean useCache) {
+  public SeekStatus seek(BytesRef text, boolean useCache, boolean exactOnly /* ignored */) {
     final Term t = new Term(field, text);
     int loc = Arrays.binarySearch(terms, t, InstantiatedTerm.termComparator);
     if (loc < 0) {
Index: lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
--- lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java	Thu Apr 14 15:39:03 2011 -0400
+++ lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java	Thu Apr 14 18:48:02 2011 -0400
@@ -855,7 +855,7 @@
   }

   @Override
-  public SeekStatus seek(BytesRef text, boolean useCache) {
+  public SeekStatus seek(BytesRef text, boolean useCache, boolean exactOnly /* ignored */) {
     termUpto = Arrays.binarySearch(info.sortedTerms, text, termComparator);
     if (termUpto < 0) { // not found; choose successor
       termUpto = -termUpto -1;
Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
--- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java	Thu Apr 14 15:39:03 2011 -0400
+++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java	Thu Apr 14 18:48:02 2011 -0400
@@ -540,7 +540,7 @@
       // we have a non-empty index, check if the term exists
       currentTerm.copy(word);
       for (TermsEnum te : termsEnums) {
-        if (te.seek(currentTerm, false) == TermsEnum.SeekStatus.FOUND) {
+        if (te.seek(currentTerm, false, true) == TermsEnum.SeekStatus.FOUND) {
          continue terms;
        }
      }
Index: lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java
--- lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java	Thu Apr 14 15:39:03 2011 -0400
+++ lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java	Thu Apr 14 18:48:02 2011 -0400
@@ -373,7 +373,7 @@

       // System.out.println("  term=" + term);

-      if (termsEnum.seek(term.bytes(), false) == TermsEnum.SeekStatus.FOUND) {
+      if (termsEnum.seek(term.bytes(), false, true) == TermsEnum.SeekStatus.FOUND) {
        DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs);

        if (docsEnum != null) {
@@ -391,6 +391,8 @@
            delCount++;
          }
        }
+        //} else {
+        //System.out.println("BDS: seg=" + reader.getSegmentName() + " term=" + term + " NOT_FOUND");
      }
    }
Index: lucene/src/java/org/apache/lucene/index/CompoundFileReader.java
--- lucene/src/java/org/apache/lucene/index/CompoundFileReader.java	Thu Apr 14 15:39:03 2011 -0400
+++ lucene/src/java/org/apache/lucene/index/CompoundFileReader.java	Thu Apr 14 18:48:02 2011 -0400
@@ -160,7 +160,7 @@
     id = IndexFileNames.stripSegmentName(id);
     FileEntry entry = entries.get(id);
     if (entry == null)
-      throw new IOException("No sub-file with id " + id + " found (files: " + entries.keySet() + ")");
+      throw new IOException("No sub-file with id " + id + " found (fileName=" + fileName + " files: " + entries.keySet() + ")");
     return new CSIndexInput(stream, entry.offset, entry.length, readBufferSize);
   }
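A minimal caller-side sketch of the widened seek API (the names termsEnum, target and skipDocs are placeholders, not taken from the patch): callers that only need to know whether the exact term exists, like SpellChecker and BufferedDeletesStream above, pass exactOnly=true, which lets the enum answer NOT_FOUND cheaply without positioning itself on the ceiling term; callers that intend to keep iterating from the seek point pass exactOnly=false.

  // Hypothetical caller; termsEnum, target and skipDocs are assumed to exist already.
  TermsEnum.SeekStatus status = termsEnum.seek(target, false /* useCache */, true /* exactOnly */);
  if (status == TermsEnum.SeekStatus.FOUND) {
    DocsEnum docsEnum = termsEnum.docs(skipDocs, null);
    // ... consume the postings of the exact term ...
  }

Index: lucene/src/java/org/apache/lucene/index/DocTermOrds.java
--- lucene/src/java/org/apache/lucene/index/DocTermOrds.java	Thu Apr 14 15:39:03 2011 -0400
+++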
lucene/src/java/org/apache/lucene/index/DocTermOrds.java Thu Apr 14 18:48:02 2011 -0400 @@ -691,7 +691,7 @@ } @Override - public SeekStatus seek(BytesRef target, boolean useCache) throws IOException { + public SeekStatus seek(BytesRef target, boolean useCache, boolean exactOnly /* ignored */) throws IOException { // already here if (term != null && term.equals(target)) { @@ -761,7 +761,7 @@ //System.out.println(" do seek term=" + base.utf8ToString()); ord = idx << indexIntervalBits; delta = (int) (targetOrd - ord); - final TermsEnum.SeekStatus seekStatus = termsEnum.seek(base, true); + final TermsEnum.SeekStatus seekStatus = termsEnum.seek(base, true, false); assert seekStatus == TermsEnum.SeekStatus.FOUND; } else { //System.out.println("seek w/in block"); Index: lucene/src/java/org/apache/lucene/index/FilterIndexReader.java --- lucene/src/java/org/apache/lucene/index/FilterIndexReader.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/FilterIndexReader.java Thu Apr 14 18:48:02 2011 -0400 @@ -132,8 +132,8 @@ public FilterTermsEnum(TermsEnum in) { this.in = in; } @Override - public SeekStatus seek(BytesRef text, boolean useCache) throws IOException { - return in.seek(text, useCache); + public SeekStatus seek(BytesRef text, boolean useCache, boolean exactOnly) throws IOException { + return in.seek(text, useCache, exactOnly); } @Override Index: lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java --- lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java Thu Apr 14 18:48:02 2011 -0400 @@ -139,7 +139,7 @@ } @Override - public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + public SeekStatus seek(BytesRef term, boolean useCache, boolean exactOnly) throws IOException { queue.clear(); numTop = 0; @@ -147,8 +147,13 @@ if (lastSeek != null && termComp.compare(lastSeek, term) <= 0) { seekOpt = true; } - lastSeekScratch.copy(term); - lastSeek = lastSeekScratch; + + if (!exactOnly) { + lastSeekScratch.copy(term); + lastSeek = lastSeekScratch; + } else { + lastSeek = null; + } for(int i=0;i fields = new TreeMap(); + + // Caches the most recently looked-up field + terms: + private final DoubleBarrelLRUCache termsCache; + + private int indexDivisor; + + // keeps the dirStart offset + protected long dirOffset; + protected long indexDirOffset; + + // Used as key for the terms cache + private static class FieldAndTerm extends DoubleBarrelLRUCache.CloneableKey { + String field; + BytesRef term; + + public FieldAndTerm() { + } + + public FieldAndTerm(FieldAndTerm other) { + field = other.field; + term = new BytesRef(other.term); + } + + @Override + public boolean equals(Object _other) { + FieldAndTerm other = (FieldAndTerm) _other; + return other.field == field && term.bytesEquals(other.term); + } + + @Override + public Object clone() { + return new FieldAndTerm(this); + } + + @Override + public int hashCode() { + return field.hashCode() * 31 + term.hashCode(); + } + } + + // nocommit + private String segment; + + public BlockTreeTermsReader(Directory dir, FieldInfos fieldInfos, String segment, + BlockTreePostingsReaderBase postingsReader, int readBufferSize, + int termsCacheSize, String codecId, int indexDivisor) + throws IOException { + + this.postingsReader = postingsReader; + this.indexDivisor = indexDivisor; + termsCache = new DoubleBarrelLRUCache(termsCacheSize); + + this.segment = segment; + in = 
dir.openInput(IndexFileNames.segmentFileName(segment, codecId, BlockTreeTermsWriter.TERMS_EXTENSION), + readBufferSize); + + boolean success = false; + IOException ioe = null; + try { + readHeader(in); + final String f = IndexFileNames.segmentFileName(segment, codecId, BlockTreeTermsWriter.TERMS_INDEX_EXTENSION); + indexIn = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, BlockTreeTermsWriter.TERMS_INDEX_EXTENSION), + readBufferSize); + readIndexHeader(indexIn); + + // Have PostingsReader init itself + postingsReader.init(in); + + // Read per-field details + seekDir(in, dirOffset); + seekDir(indexIn, indexDirOffset); + + final int numFields = in.readVInt(); + + for(int i=0;i= 0; + final long termsStartPointer = in.readVLong(); + final long rootBlockFP = in.readVLong(); + final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong(); + final long indexStartFP = indexIn.readVLong(); + assert !fields.containsKey(fieldInfo.name); + fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, rootBlockFP, sumTotalTermFreq, indexStartFP)); + } + success = true; + } finally { + if (!success) { + close(); + } + } + if (indexDivisor > 0) { + indexIn.close(); + } + } + + protected void readHeader(IndexInput input) throws IOException { + CodecUtil.checkHeader(input, BlockTreeTermsWriter.CODEC_NAME, + BlockTreeTermsWriter.VERSION_START, + BlockTreeTermsWriter.VERSION_CURRENT); + dirOffset = input.readLong(); + } + + protected void readIndexHeader(IndexInput input) throws IOException { + CodecUtil.checkHeader(input, BlockTreeTermsWriter.CODEC_NAME, + BlockTreeTermsWriter.VERSION_START, + BlockTreeTermsWriter.VERSION_CURRENT); + indexDirOffset = input.readLong(); + } + + protected void seekDir(IndexInput input, long dirOffset) + throws IOException { + input.seek(dirOffset); + } + + @Override + public void loadTermsIndex(int indexDivisor) throws IOException { + if (this.indexDivisor < 0) { + if (indexDivisor < 0) { + this.indexDivisor = -indexDivisor; + } else { + this.indexDivisor = indexDivisor; + } + for(FieldReader field : fields.values()) { + field.loadIndex(); + } + indexIn.close(); + } + } + + @Override + public void close() throws IOException { + try { + try { + if (indexIn != null && indexDivisor < 0) { + indexIn.close(); + } + } finally { + if (in != null) { + in.close(); + } + } + } finally { + try { + if (postingsReader != null) { + postingsReader.close(); + } + } finally { + for(FieldReader field : fields.values()) { + field.close(); + } + // Clear so refs to terms index is GCable even if + // app hangs onto us: + fields.clear(); + } + } + } + + public static void files(Directory dir, SegmentInfo segmentInfo, String id, Collection files) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, id, BlockTreeTermsWriter.TERMS_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, id, BlockTreeTermsWriter.TERMS_INDEX_EXTENSION)); + } + + public static void getExtensions(Collection extensions) { + extensions.add(BlockTreeTermsWriter.TERMS_EXTENSION); + extensions.add(BlockTreeTermsWriter.TERMS_INDEX_EXTENSION); + } + + @Override + public FieldsEnum iterator() { + return new TermFieldsEnum(); + } + + @Override + public Terms terms(String field) throws IOException { + return fields.get(field); + } + + // Iterates through all fields + private class TermFieldsEnum extends FieldsEnum { + final Iterator it; + FieldReader current; + + TermFieldsEnum() { + it 
= fields.values().iterator(); + } + + @Override + public String next() { + if (it.hasNext()) { + current = it.next(); + return current.fieldInfo.name; + } else { + current = null; + return null; + } + } + + @Override + public TermsEnum terms() throws IOException { + return current.iterator(); + } + } + + // nocommit -- in block header we could record that block + // has no sub-blocks (is a "leaf" block) and specialize + // decodes, eg don't flag each entry + + private class FieldReader extends Terms implements Closeable { + final long numTerms; + final FieldInfo fieldInfo; + final long termsStartPointer; + final long sumTotalTermFreq; + final long indexStartFP; + final long rootBlockFP; + private FST index; + + //private boolean DEBUG; + + FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long rootBlockFP, long sumTotalTermFreq, long indexStartFP) throws IOException { + assert numTerms > 0; + this.fieldInfo = fieldInfo; + //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); + this.numTerms = numTerms; + this.termsStartPointer = termsStartPointer; + this.sumTotalTermFreq = sumTotalTermFreq; + this.indexStartFP = indexStartFP; + this.rootBlockFP = rootBlockFP; + if (DEBUG) { + System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootBlockFP + " divisor=" + indexDivisor); + } + + if (indexDivisor > 0) { + loadIndex(); + } + } + + void loadIndex() throws IOException { + if (index == null) { + IndexInput clone = (IndexInput) indexIn.clone(); + clone.seek(indexStartFP); + index = new FST(clone, PositiveIntOutputs.getSingleton(true)); + + if (false) { + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + Util.toDot(index, w, false, false); + System.out.println("FST INDEX: SAVED to " + dotFileName); + w.close(); + } + + // nocommit -- impl divisor (how!?) + } + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public void close() { + super.close(); + } + + @Override + public TermsEnum iterator() throws IOException { + return new SegmentTermsEnum(); + } + + @Override + public long getUniqueTermCount() { + return numTerms; + } + + @Override + public long getSumTotalTermFreq() { + return sumTotalTermFreq; + } + + // Iterates through terms in this field + private final class SegmentTermsEnum extends TermsEnum { + private final IndexInput in; + private final FieldAndTerm fieldTerm = new FieldAndTerm(); + + private Frame[] stack; + private final Frame staticFrame; + private Frame currentFrame; + private boolean termExists; + + // What prefix of the current term was present in the index: + private int currentIndexDepth; + + // nocommit + private boolean eof; + + final BytesRef term = new BytesRef(); + + @SuppressWarnings("unchecked") private FST.Arc[] arcs = new FST.Arc[5]; + + /* This is true if indexEnum is "still" seek'd to the index term + for the current term. We set it to true on seeking, and then it + remains valid until next() is called enough times to load another + terms block: */ + //private boolean indexIsCurrent; + + /* True if we've already called .next() on the indexEnum, to "bracket" + the current block of terms: */ + //private boolean didIndexNext; + + /* Next index term, bracketing the current block of terms; this is + only valid if didIndexNext is true: */ + //private BytesRef nextIndexTerm; + + /* True after seek(TermState), do defer seeking. 
If the app then + calls next() (which is not "typical"), then we'll do the real seek */ + //private boolean seekPending; + + /* How many blocks we've read since last seek. Once this + is >= indexEnum.getDivisor() we set indexIsCurrent to false (since + the index can no long bracket seek-within-block). */ + //private int blocksSinceSeek; + + public SegmentTermsEnum() throws IOException { + if (DEBUG) System.out.println("BTTR.init seg=" + segment); + in = (IndexInput) BlockTreeTermsReader.this.in.clone(); + in.seek(termsStartPointer); + fieldTerm.field = fieldInfo.name; + stack = new Frame[5]; + for(int stackOrd=0;stackOrd(); + } + + // Init w/ root block; don't use index since it may + // not (and need not) have been loaded + //final FST.Arc arc = index.getFirstArc(getArc(0)); + + // Empty string prefix must have an output in the index! + //assert arc.isFinal(); + + // nocommit -- can we avoid this? eg if is going to + // call seek... + currentFrame = staticFrame; + final FST.Arc arc; + if (index != null) { + arc = index.getFirstArc(getArc(0)); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + pushFrame(arc, rootBlockFP); + currentFrame.loadBlock(); + currentIndexDepth = 0; + if (DEBUG) { + System.out.println("init frame state " + currentFrame.ord); + printSeekState(); + } + } + + private Frame getFrame(int ord) throws IOException { + if (ord >= stack.length) { + final Frame[] next = new Frame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, next, 0, stack.length); + for(int stackOrd=stack.length;stackOrd getArc(int ord) { + if (ord >= arcs.length) { + @SuppressWarnings("unchecked") final FST.Arc[] next = new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, next, 0, arcs.length); + for(int arcOrd=arcs.length;arcOrd(); + } + arcs = next; + } + return arcs[ord]; + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + private void pushFrame(FST.Arc arc, long code) throws IOException { + final Frame f = currentFrame = getFrame(1+currentFrame.ord); + f.nextEnt = -1; + f.scanned = true; + f.depth = term.length; + f.state.termBlockOrd = 0; + f.fp = code >>> 1; + f.hasTerms = (code & 1) != 0; + f.arc = arc; + f.lastSubFP = -1; + if (DEBUG) System.out.println(" push frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + f.hasTerms + " pref=" + term.utf8ToString()); + } + + @Override + public SeekStatus seek(final BytesRef target, final boolean useCache, final boolean exactOnly) throws IOException { + + //System.out.println("DEBUG IS " + DEBUG); + eof = false; + + if (index == null) { + throw new IllegalStateException("terms index was not loaded"); + } + + if (DEBUG) { + System.out.println("\nBTTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term.utf8ToString() + " (exists?=" + termExists + ") useCache=" + useCache + " exactOnly=" + exactOnly + " this=" + this + " currentIndexDepth=" + currentIndexDepth); + printSeekState(); + } + + //System.out.println("BTTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this); + + /* + if 
(exactOnly == false) { + new Throwable().printStackTrace(System.out); + } + */ + + // Check cache + if (useCache) { + fieldTerm.term = target; + // TODO: should we differentiate "frozen" + // TermState (ie one that was cloned and + // cached/returned by termState()) from the + // malleable (primary) one? + final TermState cachedState = termsCache.get(fieldTerm); + if (cachedState != null) { + if (DEBUG) { + System.out.println(" cached!"); + } + seek(target, cachedState); + //System.out.println(" term=" + term.utf8ToString()); + return SeekStatus.FOUND; + } + // nocommit -- we never enroll state into the termsCache! + } + + // nocommit -- on exactOnly seek we must set + // currentFrame to staticFrame, even on not_found? + // or... can't we re-use its state even if it + // returned not_found!? + + FST.Arc arc; + int targetUpto; + long output; + + // nocommit + // if (currentFrame != staticFrame && currentIndexDepth > 0) { + if (currentFrame != staticFrame) { + // We are already positioned; find the common + // prefix of new seek term vs current term and + // re-use the seek state + + if (DEBUG) { + System.out.println(" re-use current state currentIndexDepth=" + currentIndexDepth); + } + + arc = getArc(0); + assert arc.isFinal(); + output = arc.output; + targetUpto = 0; + + final Frame startCurrentFrame = currentFrame; + + currentFrame = getFrame(0); + + final int targetLimit = target.length < term.length ? target.length : term.length; + + while (true) { + final int cmp; + final boolean stop; + + if (targetUpto < targetLimit) { + cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + targetUpto++; + stop = false; + if (DEBUG) { + System.out.println(" cycle targetUpto=" + targetUpto + " stop=false targetLabel=" + (char) (target.bytes[target.offset + targetUpto - 1]) + " vs termLabel=" + (char) (term.bytes[targetUpto-1]) + " cmp=" + cmp); + } + } else { + cmp = term.length - target.length; + if (DEBUG) { + System.out.println(" cycle targetUpto=" + targetUpto + " stop=true cmp=" + cmp); + } + stop = true; + } + + if (cmp == 0) { + if (targetUpto-1 == currentIndexDepth) { + // Compare suffix now: we must determine + // whether target is before current, and + // rewind the current frame if so: + final int sav = targetUpto-1; + if (!stop) { + while(true) { + final int cmp2; + final boolean stop2; + if (targetUpto < targetLimit) { + cmp2 = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + targetUpto++; + stop2 = false; + } else { + cmp2 = term.length - target.length; + stop2 = true; + } + if (cmp2 > 0) { + // Target is before current -- must + // rewind: + if (DEBUG) { + System.out.println(" rewind currentFrame"); + } + currentFrame.rewind(); + break; + } else if (cmp2 < 0) { + // Target is after current -- no need + // to rewind + break; + } else if (stop2) { + // nocommit -- we should never return + // FOUND here...? + // nocommit right? + if (termExists) { + if (DEBUG) { + System.out.println(" same term!; return FOUND"); + } + currentFrame = startCurrentFrame; + return SeekStatus.FOUND; + } else { + if (DEBUG) { + System.out.println(" same term!; break"); + } + break; + }// : SeekStatus.NOT_FOUND; + //break; + } + } + } + targetUpto = sav; + if (DEBUG) { + System.out.println(" cycle: break @ currentIndexDepth=" + currentIndexDepth + " clear scanned for frame ord=" + currentFrame.ord); + } + // nocommit -- too powerful? 
+ currentFrame.scanned = false; + break; + } + if (stop) { + // Exactly equal + assert currentIndexDepth == targetUpto; + if (termExists) { + if (DEBUG) { + System.out.println(" same term! return FOUND"); + } + return SeekStatus.FOUND; + } else { + if (DEBUG) { + System.out.println(" same term! break"); + } + break; + } + //return termExists ? SeekStatus.FOUND : SeekStatus.NOT_FOUND; + } + arc = getArc(targetUpto); + assert arc.output != null; + assert arc.label == (term.bytes[targetUpto-1] & 0xFF): "arc.label=" + (char) arc.label + " vs " + (char) (term.bytes[targetUpto-1] & 0xFF); + output += arc.output; + if (arc.isFinal()) { + currentFrame = getFrame(1+currentFrame.ord); + if (DEBUG) { + System.out.println(" re-use frame ord=" + currentFrame.ord + " fp=" + currentFrame.fp + " hasTerms?=" + currentFrame.hasTerms + " nextEnt=" + currentFrame.nextEnt); + } + } + } else { + if (!stop) { + targetUpto--; + arc = getArc(targetUpto); + } + // nocommit -- may have to rollback frame too? + if (cmp < 0) { + // Common case: target term is after current + // term, ie, app is seeking multiple terms + // in sorted order + if (DEBUG) { + System.out.println(" target is after current"); + } + currentFrame.scanned = false; + } else { + // We shared some prefix, but the target term + // is before current term; this means we can + // keep the currentFrame but we must reset it + // (so we scan from the start) + if (DEBUG) { + System.out.println(" target is before current"); + } + currentFrame.rewind(); + } + break; + } + } + //currentIndexDepth = targetUpto; + + } else { + arc = index.getFirstArc(getArc(0)); + + // Empty string prefix must have an output (block) in the index! + assert arc.isFinal(); + assert arc.output != null; + if (DEBUG) { + System.out.println(" push root frame"); + } + + output = arc.output; + + currentFrame = staticFrame; + + // Append new empty frame to stack; we lazily + // load this block only when needed: + term.length = 0; + targetUpto = 0; + //currentIndexDepth = 0; + pushFrame(arc, output + arc.nextFinalOutput.longValue()); + } + + if (DEBUG) { + System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord=" + currentFrame.ord); + } + + // Walk the index, matching the prefix of the target term: + while(true) { + + final FST.Arc nextArc; + + if (targetUpto == target.length) { + if (DEBUG) { + System.out.println(" index: target exhausted"); + } + nextArc = null; + } else { + final int targetLabel = target.bytes[target.offset + targetUpto] & 0xff; + if (term.bytes.length <= targetUpto) { + term.bytes = ArrayUtil.grow(term.bytes, 1+targetUpto); + } + term.bytes[targetUpto] = (byte) targetLabel; + term.length = 1+targetUpto; + + nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto)); + if (DEBUG && nextArc == null) { + System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " 0x" + Integer.toHexString(targetLabel)); + } + } + + if (nextArc == null) { + // We've exhausted the index + + //currentIndexDepth = targetUpto; + currentIndexDepth = currentFrame.depth; + + if (exactOnly && targetUpto < target.length && !currentFrame.hasTerms) { + term.length--; + if (DEBUG) { + System.out.println("FAST NOT_FOUND term=" + term.utf8ToString()); + } + termExists = false; + // nocommit -- only have to copy suffix: + term.copy(target); + return SeekStatus.NOT_FOUND; + } + + // nocommit -- once we mark whether a block + // has any terms (vs just sub-blocks)... 
must + // handle the case, here, where none of the + // frames we passed has any terms (ie, + // currentFrame is null) + + // Exact match for a prefix of the target, in the + // index; now scan that block. + currentFrame.loadBlock(); + SeekStatus result = currentFrame.scanToTerm(target, exactOnly); + + if (result == SeekStatus.END) { + if (!exactOnly) { + // Must position to next block + if (next() == null) { + termExists = false; + term.length = 0; + currentFrame.rewind(); + return SeekStatus.END; + } else { + termExists = true; + } + } else { + // nocommit -- do our own copy: we only need + // to copy the "new" suffix + termExists = false; + term.copy(target); + } + + if (DEBUG) { + System.out.println(" block end; return NOT_FOUND term=" + term.utf8ToString()); + } + + return SeekStatus.NOT_FOUND; + } else if (result == SeekStatus.NOT_FOUND) { + if (DEBUG) { + System.out.println(" return NOT_FOUND term=" + term.utf8ToString() + " " + term); + } + termExists = currentFrame.lastWasTerm; + return SeekStatus.NOT_FOUND; + } else { + if (DEBUG) { + System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); + } + termExists = currentFrame.lastWasTerm; + return SeekStatus.FOUND; + } + + } else { + // Follow this arc + arc = nextArc; + if (DEBUG) { + System.out.println(" targetLabel=" + (char) (target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); + } + targetUpto++; + //currentIndexDepth++; + + // Aggregate output as we go: + assert arc.output != null; + output += arc.output; + + if (DEBUG) { + System.out.println(" index: cycle output=" + output); + } + + if (arc.isFinal()) { + if (DEBUG) System.out.println(" arc is final!"); + // Append new empty frame to stack; we lazily + // load this block only when needed: + pushFrame(arc, output + arc.nextFinalOutput.longValue()); + } + } + } + + // nocommit -- add back asserts that verify we don't + // scan too many blocks... + } + + private void printSeekState() throws IOException { + if (currentFrame == staticFrame) { + System.out.println(" no prior seek"); + } else { + System.out.println(" prior seek state:"); + int ord = 0; + while(true) { + Frame f = getFrame(ord); + final BytesRef prefix = new BytesRef(term.bytes, 0, f.depth); + System.out.println(" frame ord=" + ord + " fp=" + f.fp + " depth=" + f.depth + " prefix=" + prefix.utf8ToString() + " nextEnt=" + f.nextEnt + " (of " + f.entCount + ") scanned=" + f.scanned + " hasTerms=" + f.hasTerms + " code=" + (f.fp<<1 + (f.hasTerms ? 1 : 0)) + " lastSubFP=" + f.lastSubFP); + if (f == currentFrame) { + break; + } + if (index != null) { + assert f.arc != null; + if (f.depth > 0 && f.arc.label != (term.bytes[f.depth-1]&0xFF)) { + System.out.println(" broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.bytes[f.depth-1]&0xFF)); + throw new RuntimeException("seek state is broken"); + } + Long output = Util.get(index, prefix); + if (output == null) { + System.out.println(" broken seek state: prefix is not final in index"); + throw new RuntimeException("seek state is broken"); + } else { + final long code = (f.fp<<1) | (f.hasTerms ? 1:0); + if (output != code) { + System.out.println(" broken seek state: output code=" + output + " doesn't match frame code=" + code); + throw new RuntimeException("seek state is broken"); + } + } + } + if (f.depth == currentIndexDepth) { + break; + } + ord++; + } + } + } + + /* Decodes only the term bytes of the next term. 
If caller then asks for + metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily) + decode all metadata up to the current term. */ + @Override + public BytesRef next() throws IOException { + assert !eof; + if (DEBUG) { + System.out.println("BTTR.next seg=" + segment + " field=" + fieldInfo.name + " this=" + this + " termBlockOrd=" + currentFrame.state.termBlockOrd); + printSeekState(); + } + + if (currentFrame == staticFrame) { + // If seek was previously called and the term was + // cached, or seek(TermState) was called, usually + // caller is just going to pull a D/&PEnum or get + // docFreq, etc. But, if they then call next(), + // this method catches up all internal state so next() + // works properly: + if (DEBUG) System.out.println(" re-seek to pending term=" + term.utf8ToString() + " " + term); + final SeekStatus result = seek(term); + assert result == SeekStatus.FOUND; + } + + // Pop finished blocks + while (currentFrame.nextEnt == currentFrame.entCount) { + if (DEBUG) System.out.println(" pop frame"); + if (currentFrame.ord == 0) { + if (DEBUG) System.out.println(" return null"); + eof = true; + term.length = 0; + currentFrame.rewind(); + termExists = false; + return null; + } + final long lastFP = currentFrame.fp; + final boolean lastHasTerms = currentFrame.hasTerms; + currentFrame = stack[currentFrame.ord-1]; + + if (currentFrame.nextEnt == -1) { + // We popped into a frame that's not loaded yet + // -- init it + currentFrame.loadBlock(); + currentFrame.scanToSubBlock(lastFP, lastHasTerms); + } else if (!currentFrame.scanned) { + currentFrame.scanToSubBlock(lastFP, lastHasTerms); + } + + // Note that the seek state (last seek) has been + // invalidated beyond this depth + currentIndexDepth = Math.min(currentIndexDepth, currentFrame.depth); + if (DEBUG) { + System.out.println(" reset currentIndexDepth=" + currentIndexDepth); + } + } + + while(true) { + if (currentFrame.next()) { + // Push to new block: + if (DEBUG) System.out.println(" push frame"); + pushFrame(null, currentFrame.subBlockFP); + currentFrame.loadBlock(); + } else { + if (DEBUG) System.out.println(" return term=" + term.utf8ToString() + " " + term); + // nocommit + termExists = true; + //termExists = false; + return term; + } + } + } + + @Override + public BytesRef term() { + assert !eof; + return term; + } + + @Override + public int docFreq() throws IOException { + assert !eof; + //System.out.println("BTR.docFreq"); + currentFrame.decodeMetaData(); + //System.out.println(" return " + state.docFreq); + return currentFrame.state.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + assert !eof; + currentFrame.decodeMetaData(); + return currentFrame.state.totalTermFreq; + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { + assert !eof; + if (DEBUG) { + System.out.println("BTTR.docs seg=" + segment + " state=" + currentFrame.state + " this=" + this); + } + currentFrame.decodeMetaData(); + final DocsEnum docsEnum = postingsReader.docs(fieldInfo, currentFrame.state, skipDocs, reuse); + + /* + if (fieldInfo.name.equals("id")) { + docsEnum.termID = Integer.parseInt(term().utf8ToString()); + } else { + docsEnum.termID = -1; + } + */ + assert docsEnum != null; + return docsEnum; + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { + assert !eof; + //System.out.println("BTR.d&p this=" + this); + if (fieldInfo.omitTermFreqAndPositions) { + return null; + } else { + 
currentFrame.decodeMetaData(); + DocsAndPositionsEnum dpe = postingsReader.docsAndPositions(fieldInfo, currentFrame.state, skipDocs, reuse); + //System.out.println(" return d&pe=" + dpe); + return dpe; + } + } + + @Override + public void seek(BytesRef target, TermState otherState) throws IOException { + if (DEBUG) { + System.out.println("BTTR.seek termState seg=" + segment + " target=" + target.utf8ToString() + " " + target + " state=" + otherState + " this=" + this); + } + eof = false; + // nocommit -- how to fix BlockTreeTermState to + // carry over seek state...? + if (target.compareTo(term) != 0 || !termExists) { + assert otherState != null && otherState instanceof BlockTreeTermState; + currentFrame = staticFrame; + // nocommit -- how to disallow next() after this? + currentFrame.state.copyFrom(otherState); + term.copy(target); + currentFrame.metaDataUpto = currentFrame.state.termBlockOrd; + assert currentFrame.metaDataUpto > 0; + } else { + if (DEBUG) { + System.out.println(" skip seek: already on target state=" + currentFrame.state); + } + } + } + + @Override + public TermState termState() throws IOException { + assert !eof; + currentFrame.decodeMetaData(); + TermState ts = (TermState) currentFrame.state.clone(); + if (DEBUG) System.out.println("BTTR.termState seg=" + segment + " state=" + ts); + return ts; + } + + @Override + public SeekStatus seek(long ord) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() { + throw new UnsupportedOperationException(); + } + + // Not static -- references term, postingsReader, + // fieldInfo, in + private final class Frame { + // Our index in stack[] + final int ord; + boolean hasTerms; + boolean lastWasTerm; + int depth; + + FST.Arc arc; + + // File pointer where this block was loaded from + long fp; + byte[] suffixBytes = new byte[128]; + final ByteArrayDataInput suffixesReader = new ByteArrayDataInput(null); + + byte[] statBytes = new byte[64]; + final ByteArrayDataInput statsReader = new ByteArrayDataInput(null); + + // Length of prefix shared by all terms in this block + int prefix; + + // Number of entries (term or sub-block) in this block + int entCount; + + // Which term we will next read + int nextEnt; + + boolean scanned; + + long lastSubFP; + + // Next term to decode metaData; we decode metaData + // lazily so that scanning to find the matching term is + // fast and only if you find a match and app wants the + // stats or docs/positions enums, will we decode the + // metaData + int metaDataUpto; + + final BlockTreeTermState state; + + long subBlockFP; + //boolean subBlockHasTerms; + + public Frame(int ord) throws IOException { + this.ord = ord; + state = postingsReader.newTermState(); + state.totalTermFreq = -1; + } + + /* Does initial decode of next block of terms; this + doesn't actually decode the docFreq, totalTermFreq, + postings details (frq/prx offset, etc.) metadata; + it just loads them as byte[] blobs which are then + decoded on-demand if the metadata is ever requested + for any term in this block. This enables terms-only + intensive consumes (eg certain MTQs, respelling) to + not pay the price of decoding metadata they won't + use. 
*/ + void loadBlock() throws IOException { + + if (nextEnt != -1) { + // Already loaded + return; + } + + in.seek(fp); + entCount = in.readVInt(); + prefix = in.readVInt(); + if (DEBUG) System.out.println(" [loadBlock fp=" + fp + " entCount=" + entCount + "]"); + + // TODO: if suffixes were stored in random-access + // array structure, then we could do binary search + // instead of linear scan to find target term; eg + // we could have simple array of offsets + + // term suffixes: + int numBytes = in.readVInt(); + if (suffixBytes.length < numBytes) { + suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + //if (DEBUG) System.out.println(" termSuffixes len=" + numBytes); + in.readBytes(suffixBytes, 0, numBytes); + suffixesReader.reset(suffixBytes, 0, numBytes); + + // stats + // nocommit: we could store stats for sub-blocks? + // is that at all useful...? + // nocommit: only if hasTerms? + numBytes = in.readVInt(); + if (statBytes.length < numBytes) { + statBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + //if (DEBUG) System.out.println(" stat bytes len=" + numBytes); + in.readBytes(statBytes, 0, numBytes); + statsReader.reset(statBytes, 0, numBytes); + metaDataUpto = 0; + + state.termBlockOrd = 0; + nextEnt = 0; + //scanned = false; + lastSubFP = -1; + + // nocommit: only if hasTerms? + postingsReader.readTermsBlock(in, fieldInfo, state); + + //blocksSinceSeek++; + //indexIsCurrent &= (blocksSinceSeek < indexReader.getDivisor()); + //System.out.println(" indexIsCurrent=" + indexIsCurrent); + } + + // nocommit -- maybe don't bother w/ this? just + // reload the block? it's gotta be rare + void rewind() throws IOException { + // Keeps the block loaded, but rewinds its state: + if (nextEnt > 0) { + if (DEBUG) { + System.out.println("BTTR: rewind frame ord=" + ord + " fp=" + fp + " hasTerms?=" + hasTerms + " nextEnt=" + nextEnt); + } + assert suffixBytes != null; + suffixesReader.rewind(); + assert statBytes != null; + statsReader.rewind(); + metaDataUpto = 0; + state.termBlockOrd = 0; + nextEnt = 0; + // nocommit: only if hasTerms? 
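+        // Also let the postings reader rewind the per-block state it buffered for this block (eg the pulsing reader rewinds its inlined-postings bytes) instead of re-reading the block from disk: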
+ postingsReader.resetTermsBlock(fieldInfo, state); + scanned = false; + lastSubFP = -1; + } + } + + // Decodes next entry; returns true if it's a sub-block + public boolean next() { + assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); + nextEnt++; + final int code = suffixesReader.readVInt(); + final int suffix = code >> 1; + term.length = prefix + suffix; + if (term.bytes.length < term.length) { + term.grow(term.length); + } + suffixesReader.readBytes(term.bytes, prefix, suffix); + if ((code & 1) == 0) { + // A normal term + state.termBlockOrd++; + return false; + } else { + // A sub-block + final long subCode = suffixesReader.readVLong(); + if (DEBUG) { + System.out.println(" subCode=" + subCode); + } + lastSubFP = fp - (subCode >>> 1); + subBlockFP = (lastSubFP<<1) | (subCode & 0x1); + //subBlockHasTerms = (code & 1) != 0; + return true; + } + } + + public void decodeMetaData() throws IOException { + + if (DEBUG) System.out.println("BTTR.decodeMetadata mdUpto=" + metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd + " this=" + SegmentTermsEnum.this); + + assert state.termBlockOrd > 0; + + // lazily catch up on metadata decode: + final int limit = state.termBlockOrd; + + // We must set/incr state.termCount because + // postings impl can look at this + state.termBlockOrd = metaDataUpto; + + // TODO: better API would be "jump straight to term=N"??? + while (metaDataUpto < limit) { + + // TODO: we could make "tiers" of metadata, ie, + // decode docFreq/totalTF but don't decode postings + // metadata; this way caller could get + // docFreq/totalTF w/o paying decode cost for + // postings + + // TODO: if docFreq were bulk decoded we could + // just skipN here: + state.docFreq = statsReader.readVInt(); + if (DEBUG) System.out.println(" dF=" + state.docFreq); + if (!fieldInfo.omitTermFreqAndPositions) { + state.totalTermFreq = state.docFreq + statsReader.readVLong(); + if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); + } + + postingsReader.nextTerm(fieldInfo, state); + metaDataUpto++; + state.termBlockOrd++; + } + } + + // Used only by assert + private boolean prefixMatches(BytesRef target) { + for(int bytePos=0;bytePos fields = new ArrayList(); + private final String segment; + + // nocommit should take min block size? 
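The minTermsInBlock parameter taken by the constructor that follows drives how the FindBlocks helper further down fragments the term space: terms are fed to a throw-away FST Builder, and a prefix is frozen into its own on-disk block once enough entries have accumulated under it (counts carried in node.inputCount), with stragglers carried up to the parent block. A simplified, self-contained sketch of that grouping policy, using single-character prefixes only; the real code recurses over every prefix length:

  import java.util.*;

  public class BlockSketch {
    public static void main(String[] args) {
      final int minTermsInBlock = 3;  // hypothetical value; PulsingTreeCodec later in this patch passes 24
      final List<String> terms = Arrays.asList("abc", "abd", "abe", "abf", "ac", "b");  // sorted terms
      final Map<String, Integer> countByPrefix = new TreeMap<String, Integer>();
      for (String t : terms) {
        final String prefix = t.substring(0, 1);
        final Integer c = countByPrefix.get(prefix);
        countByPrefix.put(prefix, c == null ? 1 : c + 1);
      }
      for (Map.Entry<String, Integer> e : countByPrefix.entrySet()) {
        // A prefix with >= minTermsInBlock entries would get its own block (and the terms
        // index FST would map that prefix to the block's file pointer); smaller groups
        // stay in the parent (here: root) block.
        System.out.println("prefix=" + e.getKey() + " count=" + e.getValue()
            + (e.getValue() >= minTermsInBlock ? " -> own block" : " -> stays in parent block"));
      }
    }
  }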
+ public BlockTreeTermsWriter( + SegmentWriteState state, + BlockTreePostingsWriterBase postingsWriter, + int minTermsInBlock) + throws IOException + { + final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_EXTENSION); + out = state.directory.createOutput(termsFileName); + fieldInfos = state.fieldInfos; + this.minTermsInBlock = minTermsInBlock; + writeHeader(out); + + /* + if (state.segmentName.equals("_32")) { + DEBUG = true; + } else { + DEBUG = false; + } + */ + + final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_INDEX_EXTENSION); + indexOut = state.directory.createOutput(termsIndexFileName); + writeIndexHeader(indexOut); + + currentField = null; + this.postingsWriter = postingsWriter; + segment = state.segmentName; + + // System.out.println("BTW.init seg=" + state.segmentName); + + postingsWriter.start(out); // have consumer write its format/header + } + + protected void writeHeader(IndexOutput out) throws IOException { + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + out.writeLong(0); // leave space for end index pointer + } + + protected void writeIndexHeader(IndexOutput out) throws IOException { + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + out.writeLong(0); // leave space for end index pointer + } + + protected void writeTrailer(long dirStart) throws IOException { + out.seek(CodecUtil.headerLength(CODEC_NAME)); + out.writeLong(dirStart); + } + + protected void writeIndexTrailer(long dirStart) throws IOException { + indexOut.seek(CodecUtil.headerLength(CODEC_NAME)); + indexOut.writeLong(dirStart); + } + + @Override + public TermsConsumer addField(FieldInfo field) throws IOException { + //DEBUG = field.name.equals("id"); + if (DEBUG2 || DEBUG) System.out.println("\nBTTW.addField seg=" + segment + " field=" + field.name); + assert currentField == null || currentField.name.compareTo(field.name) < 0; + currentField = field; + final TermsWriter terms = new TermsWriter(field); + fields.add(terms); + return terms; + } + + private static class PendingTerm { + public final BytesRef term; + public final TermStats stats; + + public PendingTerm(BytesRef term, TermStats stats) { + this.term = term; + this.stats = stats; + } + + @Override + public String toString() { + return term.utf8ToString(); + } + } + + private static class PendingBlock { + public final BytesRef prefix; + public final long fp; + public final FST index; + public final boolean hasTerms; + + public PendingBlock(BytesRef prefix, long fp, FST index, boolean hasTerms) { + this.prefix = prefix; + this.fp = fp; + this.index = index; + this.hasTerms = hasTerms; + } + + @Override + public String toString() { + return "BLOCK: " + prefix.utf8ToString(); + } + } + + class TermsWriter extends TermsConsumer { + private final FieldInfo fieldInfo; + private final long termsStartPointer; + private long numTerms; + long sumTotalTermFreq; + long indexStartFP; + long rootBlockFP; + + // Used only to partition terms into the block tree; we + // don't pull an FST from this builder: + private final NoOutputs noOutputs; + private final Builder blockBuilder; + + // PendingTerm or PendingBlock: + private final List pending = new ArrayList(); + + // This class assigns terms to blocks "naturally", ie, + // according to the number of terms under a given prefix + // that we encounter: + private class FindBlocks extends Builder.FreezeTail { + + @Override + public void freeze(final Builder.UnCompiledNode[] frontier, int 
prefixLenPlus1, final IntsRef lastInput) throws IOException { + + if (DEBUG) System.out.println(" freeze prefixLenPlus1=" + prefixLenPlus1); + + for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) { + final Builder.UnCompiledNode node = frontier[idx]; + final Builder.UnCompiledNode parent = idx == 0 ? null : frontier[idx-1]; + + long totCount = 0; + + if (node.isFinal) { + totCount++; + } + + //System.out.println("VISIT node=" + node + " + //arcs=" + node.numArcs); + for(int arcIdx=0;arcIdx target = (Builder.UnCompiledNode) node.arcs[arcIdx].target; + totCount += target.inputCount; + target.clear(); + node.arcs[arcIdx].target = null; + } + node.numArcs = 0; + + boolean hasTerms = false; + + // nocommit fixup + + /* + if (idx <= 3) { + // TODO: instead, we should accum termCount & + // blockCount into UnCompiledNode? + for(int pendingIdx=0;pendingIdx= minTermsInBlock || idx == 0 || hasTerms) { + if (DEBUG2 || DEBUG) { + if (totCount < minTermsInBlock && idx != 0) { + System.out.println(" force block has terms"); + } + } + writeBlock(lastInput, idx, (int) totCount); + node.inputCount = 1; + } else { + // stragglers! carry count upwards + node.inputCount = totCount; + } + frontier[idx] = new Builder.UnCompiledNode(blockBuilder, idx); + } + } + } + + // Write the top count entries on the pending stack as a + // new block + void writeBlock(IntsRef prevTerm, int prefixLength, int count) throws IOException { + + final List slice = pending.subList(pending.size() - count, pending.size()); + boolean hasTerms = false; + for(Object obj : slice) { + if (obj instanceof PendingTerm) { + hasTerms = true; + break; + } + } + + final long startFP = out.getFilePointer(); + + final PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton(true); + final Builder indexBuilder = new Builder(FST.INPUT_TYPE.BYTE1, + 0, 0, true, fstOutputs); + indexBuilder.DEBUG = false; + out.writeVInt(count); + out.writeVInt(prefixLength); + + final BytesRef prefix = new BytesRef(prefixLength); + for(int m=0;m subIndex = new BytesRefFSTEnum(block.index); + BytesRefFSTEnum.InputOutput indexEnt; + while((indexEnt = subIndex.next()) != null) { + indexBuilder.add(indexEnt.input, indexEnt.output); + } + } + } + + // Write suffix byte[] blob + out.writeVInt((int) bytesWriter.getFilePointer()); + bytesWriter.writeTo(out); + bytesWriter.reset(); + + // 2nd pass: write the TermStats as byte[] blob + for(Object ent : slice) { + if (ent instanceof PendingTerm) { + PendingTerm term = (PendingTerm) ent; + assert term.term.startsWith(prefix); + bytesWriter.writeVInt(term.stats.docFreq); + if (!fieldInfo.omitTermFreqAndPositions) { + assert term.stats.totalTermFreq >= term.stats.docFreq; + bytesWriter.writeVLong(term.stats.totalTermFreq - term.stats.docFreq); + } + if (DEBUG) System.out.println(" write dF=" + term.stats.docFreq + " totTF=" + term.stats.totalTermFreq); + } + } + + out.writeVInt((int) bytesWriter.getFilePointer()); + bytesWriter.writeTo(out); + bytesWriter.reset(); + + // 3rd pass: have postings writer write block + postingsWriter.flushTermsBlock(termCount); + + // Replace slice w/ block: + slice.clear(); + final FST index = indexBuilder.finish(); + + /* + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); + Util.toDot(index, w, false, false); + System.out.println("SAVED to out.dot"); + w.close(); + */ + + pending.add(new PendingBlock(prefix, startFP, index, hasTerms)); + } + + TermsWriter(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + + noOutputs = NoOutputs.getSingleton(); + + // This 
Builder is just used transiently to fragment + // terms into "good" blocks; we don't save the + // resulting FST: + blockBuilder = new Builder(FST.INPUT_TYPE.BYTE1, + 0, 0, true, + noOutputs, + new FindBlocks()); + + termsStartPointer = out.getFilePointer(); + postingsWriter.setField(fieldInfo); + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public PostingsConsumer startTerm(BytesRef text) throws IOException { + //System.out.println("BTW.startTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment); + postingsWriter.startTerm(); + /* + if (fieldInfo.name.equals("id")) { + postingsWriter.termID = Integer.parseInt(text.utf8ToString()); + } else { + postingsWriter.termID = -1; + } + */ + return postingsWriter; + } + + @Override + public void finishTerm(BytesRef text, TermStats stats) throws IOException { + + assert stats.docFreq > 0; + if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment + " df=" + stats.docFreq); + + blockBuilder.add(text, noOutputs.getNoOutput()); + pending.add(new PendingTerm(new BytesRef(text), stats)); + postingsWriter.finishTerm(stats); + numTerms++; + } + + // Finishes all terms in this field + @Override + public void finish(long sumTotalTermFreq) throws IOException { + if (numTerms > 0) { + blockBuilder.finish(); + + // We better have one final "root" block: + assert pending.size() == 1 && pending.get(0) instanceof PendingBlock: "pending.size()=" + pending.size() + " pending=" + pending; + final PendingBlock root = (PendingBlock) pending.get(0); + + // nocommit: no more??? + // EOF marker: + // out.writeVInt(0); + + this.sumTotalTermFreq = sumTotalTermFreq; + + // Write FST to index + indexStartFP = indexOut.getFilePointer(); + root.index.save(indexOut); + assert root.index.getEmptyOutput() != null; + rootBlockFP = root.index.getEmptyOutput(); + + if (SAVE_DOT_FILES || DEBUG2 || DEBUG) { + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + Util.toDot(root.index, w, false, false); + System.out.println("SAVED to " + dotFileName); + w.close(); + } + } + } + + private final RAMOutputStream bytesWriter = new RAMOutputStream(); + } + + @Override + public void close() throws IOException { + + IOException ioe = null; + try { + + int nonZeroCount = 0; + for(TermsWriter field : fields) { + if (field.numTerms > 0) { + nonZeroCount++; + } + } + + final long dirStart = out.getFilePointer(); + final long indexDirStart = indexOut.getFilePointer(); + + out.writeVInt(nonZeroCount); + + for(TermsWriter field : fields) { + if (field.numTerms > 0) { + //System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms"); + out.writeVInt(field.fieldInfo.number); + out.writeVLong(field.numTerms); + // nocommit: we may not need termsStartPointer? 
+ out.writeVLong(field.termsStartPointer); + out.writeVLong(field.rootBlockFP); + if (!field.fieldInfo.omitTermFreqAndPositions) { + out.writeVLong(field.sumTotalTermFreq); + } + indexOut.writeVLong(field.indexStartFP); + } + } + writeTrailer(dirStart); + writeIndexTrailer(indexDirStart); + } catch (IOException ioe2) { + ioe = ioe2; + } finally { + IOUtils.closeSafely(ioe, out, indexOut, postingsWriter); + } + } +} Index: lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java --- lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java Thu Apr 14 18:48:02 2011 -0400 @@ -43,7 +43,7 @@ private final Set knownExtensions = new HashSet(); - public final static String[] CORE_CODECS = new String[] {"Standard", "Pulsing", "PreFlex", "SimpleText"}; + public final static String[] CORE_CODECS = new String[] {"Standard", "StandardTree", "Pulsing", "PulsingTree", "PreFlex", "SimpleText"}; public synchronized void register(Codec codec) { if (codec.name == null) { @@ -79,7 +79,7 @@ public synchronized Codec lookup(String name) { final Codec codec = codecs.get(name); if (codec == null) - throw new IllegalArgumentException("required codec '" + name + "' not found"); + throw new IllegalArgumentException("required codec '" + name + "' not found; known codecs: " + codecs.keySet()); return codec; } Index: lucene/src/java/org/apache/lucene/index/codecs/CoreCodecProvider.java --- lucene/src/java/org/apache/lucene/index/codecs/CoreCodecProvider.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/codecs/CoreCodecProvider.java Thu Apr 14 18:48:02 2011 -0400 @@ -19,8 +19,10 @@ import org.apache.lucene.index.codecs.preflex.PreFlexCodec; import org.apache.lucene.index.codecs.pulsing.PulsingCodec; +import org.apache.lucene.index.codecs.pulsingtree.PulsingTreeCodec; import org.apache.lucene.index.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.index.codecs.standard.StandardCodec; +import org.apache.lucene.index.codecs.standardtree.StandardTreeCodec; /** * A CodecProvider that registers all core codecs that ship @@ -42,8 +44,10 @@ public class CoreCodecProvider extends CodecProvider { public CoreCodecProvider() { register(new StandardCodec()); + register(new StandardTreeCodec(10)); register(new PreFlexCodec()); register(new PulsingCodec(1)); + register(new PulsingTreeCodec(1)); register(new SimpleTextCodec()); } } Index: lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java --- lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java Thu Apr 14 18:48:02 2011 -0400 @@ -33,6 +33,8 @@ public abstract void startTerm() throws IOException; + /** Flush lastN terms (which may not be all buffered + * terms!) 
as a block */ public abstract void flushTermsBlock() throws IOException; /** Finishes the current term */ Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java --- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Thu Apr 14 18:48:02 2011 -0400 @@ -755,7 +755,7 @@ } @Override - public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + public SeekStatus seek(BytesRef term, boolean useCache, /* ignored */ boolean exactOnly) throws IOException { if (DEBUG_SURROGATES) { System.out.println("TE.seek target=" + UnicodeUtil.toHexString(term.utf8ToString())); } Index: lucene/src/java/org/apache/lucene/index/codecs/pulsingtree/PulsingTreeCodec.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/index/codecs/pulsingtree/PulsingTreeCodec.java Thu Apr 14 18:48:02 2011 -0400 @@ -0,0 +1,123 @@ +package org.apache.lucene.index.codecs.pulsingtree; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Set; + +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.BlockTreePostingsWriterBase; +import org.apache.lucene.index.codecs.standardtree.StandardTreePostingsWriter; +import org.apache.lucene.index.codecs.BlockTreePostingsReaderBase; +import org.apache.lucene.index.codecs.standardtree.StandardTreePostingsReader; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.BlockTreeTermsReader; +import org.apache.lucene.index.codecs.BlockTreeTermsWriter; +import org.apache.lucene.index.codecs.standardtree.StandardTreeCodec; +import org.apache.lucene.store.Directory; + +/** This codec "inlines" the postings for terms that have + * low docFreq. It wraps another codec, which is used for + * writing the non-inlined terms. + * + * Currently in only inlines docFreq=1 terms, and + * otherwise uses the normal "standard" codec. + * @lucene.experimental */ + +public class PulsingTreeCodec extends Codec { + + private final int freqCutoff; + + /** Terms with freq <= freqCutoff are inlined into terms + * dict. 
*/ + public PulsingTreeCodec(int freqCutoff) { + name = "PulsingTree"; + this.freqCutoff = freqCutoff; + } + + @Override + public String toString() { + return name + "(freqCutoff=" + freqCutoff + ")"; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + // We wrap StandardTreePostingsWriter, but any BlockTreePostingsWriterBase + // will work: + + BlockTreePostingsWriterBase docsWriter = new StandardTreePostingsWriter(state); + + // Terms that have <= freqCutoff number of docs are + // "pulsed" (inlined): + BlockTreePostingsWriterBase pulsingWriter = new PulsingTreePostingsWriter(freqCutoff, docsWriter); + + // Terms dict + boolean success = false; + try { + // nocommit make this 24 configurable + FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter, 24); + success = true; + return ret; + } finally { + if (!success) { + pulsingWriter.close(); + } + } + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + + // We wrap StandardTreePostingsReader, but any BlockTreeStandardPostingsReader + // will work: + BlockTreePostingsReaderBase docsReader = new StandardTreePostingsReader(state.dir, state.segmentInfo, state.readBufferSize, state.codecId); + BlockTreePostingsReaderBase pulsingReader = new PulsingTreePostingsReader(docsReader); + + boolean success = false; + try { + FieldsProducer ret = new BlockTreeTermsReader( + state.dir, state.fieldInfos, state.segmentInfo.name, + pulsingReader, + state.readBufferSize, + StandardTreeCodec.TERMS_CACHE_SIZE, + state.codecId, + state.termsIndexDivisor); + success = true; + return ret; + } finally { + if (!success) { + pulsingReader.close(); + } + } + } + + @Override + public void files(Directory dir, SegmentInfo segmentInfo, String id, Set files) throws IOException { + StandardTreePostingsReader.files(dir, segmentInfo, id, files); + BlockTreeTermsReader.files(dir, segmentInfo, id, files); + } + + @Override + public void getExtensions(Set extensions) { + StandardTreeCodec.getStandardExtensions(extensions); + } +} Index: lucene/src/java/org/apache/lucene/index/codecs/pulsingtree/PulsingTreePostingsReader.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/index/codecs/pulsingtree/PulsingTreePostingsReader.java Thu Apr 14 18:48:02 2011 -0400 @@ -0,0 +1,496 @@ +package org.apache.lucene.index.codecs.pulsingtree; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.codecs.BlockTreePostingsReaderBase; +import org.apache.lucene.index.codecs.BlockTreeTermState; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; + +/** Concrete class that reads the current doc/freq/skip + * postings format + * @lucene.experimental */ + +// TODO: -- should we switch "hasProx" higher up? and +// create two separate docs readers, one that also reads +// prox and one that doesn't? + +public class PulsingTreePostingsReader extends BlockTreePostingsReaderBase { + + // Fallback reader for non-pulsed terms: + final BlockTreePostingsReaderBase wrappedPostingsReader; + int maxPositions; + + public PulsingTreePostingsReader(BlockTreePostingsReaderBase wrappedPostingsReader) throws IOException { + this.wrappedPostingsReader = wrappedPostingsReader; + } + + @Override + public void init(IndexInput termsIn) throws IOException { + CodecUtil.checkHeader(termsIn, PulsingTreePostingsWriter.CODEC, + PulsingTreePostingsWriter.VERSION_START, PulsingTreePostingsWriter.VERSION_START); + maxPositions = termsIn.readVInt(); + wrappedPostingsReader.init(termsIn); + } + + private static class PulsingTermState extends BlockTreeTermState { + private byte[] postings; + private int postingsSize; // -1 if this term was not inlined + private BlockTreeTermState wrappedTermState; + + ByteArrayDataInput inlinedBytesReader; + private byte[] inlinedBytes; + + @Override + public Object clone() { + PulsingTermState clone; + clone = (PulsingTermState) super.clone(); + if (postingsSize != -1) { + clone.postings = new byte[postingsSize]; + System.arraycopy(postings, 0, clone.postings, 0, postingsSize); + } else { + assert wrappedTermState != null; + clone.wrappedTermState = (BlockTreeTermState) wrappedTermState.clone(); + } + return clone; + } + + @Override + public void copyFrom(TermState _other) { + super.copyFrom(_other); + PulsingTermState other = (PulsingTermState) _other; + postingsSize = other.postingsSize; + if (other.postingsSize != -1) { + if (postings == null || postings.length < other.postingsSize) { + postings = new byte[ArrayUtil.oversize(other.postingsSize, 1)]; + } + System.arraycopy(other.postings, 0, postings, 0, other.postingsSize); + } else { + wrappedTermState.copyFrom(other.wrappedTermState); + } + + // NOTE: we do not copy the + // inlinedBytes/inlinedBytesReader; these are only + // stored on the "primary" TermState. They are + // "transient" to cloned term states. 
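      // (clone()/copyFrom above do copy the per-term postings blob itself when the
      //  term was inlined, so a captured TermState remains usable on its own; only
      //  this shared per-block buffer is skipped.)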
+ } + + @Override + public String toString() { + if (postingsSize == -1) { + return "PulsingTermState: not inlined: wrapped=" + wrappedTermState; + } else { + return "PulsingTermState: inlined size=" + postingsSize + " " + super.toString(); + } + } + } + + @Override + public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTreeTermState _termState) throws IOException { + //System.out.println("PR.readTermsBlock"); + final PulsingTermState termState = (PulsingTermState) _termState; + if (termState.inlinedBytes == null) { + termState.inlinedBytes = new byte[128]; + termState.inlinedBytesReader = new ByteArrayDataInput(null); + } + int len = termsIn.readVInt(); + //System.out.println(" len=" + len + " fp=" + termsIn.getFilePointer()); + if (termState.inlinedBytes.length < len) { + termState.inlinedBytes = new byte[ArrayUtil.oversize(len, 1)]; + } + termsIn.readBytes(termState.inlinedBytes, 0, len); + termState.inlinedBytesReader.reset(termState.inlinedBytes); + termState.wrappedTermState.termBlockOrd = 0; + wrappedPostingsReader.readTermsBlock(termsIn, fieldInfo, termState.wrappedTermState); + } + + @Override + public void resetTermsBlock(FieldInfo fieldInfo, BlockTreeTermState _termState) throws IOException { + final PulsingTermState termState = (PulsingTermState) _termState; + if (termState.inlinedBytes != null) { + termState.inlinedBytesReader.rewind(); + } + termState.wrappedTermState.termBlockOrd = 0; + wrappedPostingsReader.resetTermsBlock(fieldInfo, termState.wrappedTermState); + } + + @Override + public BlockTreeTermState newTermState() throws IOException { + PulsingTermState state = new PulsingTermState(); + state.wrappedTermState = wrappedPostingsReader.newTermState(); + return state; + } + + @Override + public void nextTerm(FieldInfo fieldInfo, BlockTreeTermState _termState) throws IOException { + //System.out.println("PR nextTerm"); + PulsingTermState termState = (PulsingTermState) _termState; + + // total TF, but in the omitTFAP case its computed based on docFreq. + final long count = fieldInfo.omitTermFreqAndPositions ? termState.docFreq : termState.totalTermFreq; + //System.out.println(" count=" + count + " threshold=" + maxPositions); + + if (count <= maxPositions) { + + // Inlined into terms dict -- just read the byte[] blob in, + // but don't decode it now (we only decode when a DocsEnum + // or D&PEnum is pulled): + termState.postingsSize = termState.inlinedBytesReader.readVInt(); + if (termState.postings == null || termState.postings.length < termState.postingsSize) { + termState.postings = new byte[ArrayUtil.oversize(termState.postingsSize, 1)]; + } + // TODO: sort of silly to copy from one big byte[] + // (the blob holding all inlined terms' blobs for + // current term block) into another byte[] (just the + // blob for this term)... + termState.inlinedBytesReader.readBytes(termState.postings, 0, termState.postingsSize); + //System.out.println(" inlined bytes=" + termState.postingsSize); + } else { + //System.out.println(" not inlined"); + termState.postingsSize = -1; + // TODO: should we do full copyFrom? much heavier...? 
+ termState.wrappedTermState.docFreq = termState.docFreq; + termState.wrappedTermState.totalTermFreq = termState.totalTermFreq; + wrappedPostingsReader.nextTerm(fieldInfo, termState.wrappedTermState); + termState.wrappedTermState.termBlockOrd++; + } + } + + // TODO: we could actually reuse, by having TL that + // holds the last wrapped reuse, and vice-versa + @Override + public DocsEnum docs(FieldInfo field, BlockTreeTermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException { + PulsingTermState termState = (PulsingTermState) _termState; + if (termState.postingsSize != -1) { + PulsingDocsEnum postings; + if (reuse instanceof PulsingDocsEnum) { + postings = (PulsingDocsEnum) reuse; + if (!postings.canReuse(field)) { + postings = new PulsingDocsEnum(field); + } + } else { + postings = new PulsingDocsEnum(field); + } + return postings.reset(skipDocs, termState); + } else { + // TODO: not great that we lose reuse of PulsingDocsEnum in this case: + if (reuse instanceof PulsingDocsEnum) { + return wrappedPostingsReader.docs(field, termState.wrappedTermState, skipDocs, null); + } else { + return wrappedPostingsReader.docs(field, termState.wrappedTermState, skipDocs, reuse); + } + } + } + + // TODO: -- not great that we can't always reuse + @Override + public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTreeTermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { + if (field.omitTermFreqAndPositions) { + return null; + } + //System.out.println("D&P: field=" + field.name); + + final PulsingTermState termState = (PulsingTermState) _termState; + + if (termState.postingsSize != -1) { + PulsingDocsAndPositionsEnum postings; + if (reuse instanceof PulsingDocsAndPositionsEnum) { + postings = (PulsingDocsAndPositionsEnum) reuse; + if (!postings.canReuse(field)) { + postings = new PulsingDocsAndPositionsEnum(field); + } + } else { + postings = new PulsingDocsAndPositionsEnum(field); + } + + return postings.reset(skipDocs, termState); + } else { + if (reuse instanceof PulsingDocsAndPositionsEnum) { + return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, skipDocs, null); + } else { + return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, skipDocs, reuse); + } + } + } + + private static class PulsingDocsEnum extends DocsEnum { + private final ByteArrayDataInput postings = new ByteArrayDataInput(null); + private final boolean omitTF; + private final boolean storePayloads; + private Bits skipDocs; + private int docID; + private int freq; + + public PulsingDocsEnum(FieldInfo fieldInfo) { + omitTF = fieldInfo.omitTermFreqAndPositions; + storePayloads = fieldInfo.storePayloads; + } + + public PulsingDocsEnum reset(Bits skipDocs, PulsingTermState termState) { + //System.out.println("PR docsEnum termState=" + termState + " docFreq=" + termState.docFreq); + assert termState.postingsSize != -1; + // nocommit -- reuse the last byte[] if we can? or + // can we directly ref termState's bytes...? dangerous? 
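      // One way the nocommit above could go (a sketch only, not part of this patch;
      // it assumes a byte[] scratch field on the enum, initialized to an empty array):
      //
      //   if (scratch.length < termState.postingsSize) {
      //     scratch = ArrayUtil.grow(scratch, termState.postingsSize);
      //   }
      //   System.arraycopy(termState.postings, 0, scratch, 0, termState.postingsSize);
      //   postings.reset(scratch, 0, termState.postingsSize);
      //
      // Referencing termState.postings directly would avoid the copy altogether, but
      // is only safe if the term state is never recycled while this enum is live.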
+ final byte[] bytes = new byte[termState.postingsSize]; + System.arraycopy(termState.postings, 0, bytes, 0, termState.postingsSize); + postings.reset(bytes); + docID = 0; + freq = 1; + this.skipDocs = skipDocs; + return this; + } + + boolean canReuse(FieldInfo fieldInfo) { + return omitTF == fieldInfo.omitTermFreqAndPositions && storePayloads == fieldInfo.storePayloads; + } + + @Override + public int nextDoc() throws IOException { + //System.out.println("PR nextDoc this= "+ this); + while(true) { + if (postings.eof()) { + //System.out.println("PR END"); + return docID = NO_MORE_DOCS; + } + + final int code = postings.readVInt(); + //System.out.println(" read code=" + code); + if (omitTF) { + docID += code; + } else { + docID += code >>> 1; // shift off low bit + if ((code & 1) != 0) { // if low bit is set + freq = 1; // freq is one + } else { + freq = postings.readVInt(); // else read freq + } + + // Skip positions + if (storePayloads) { + int payloadLength = -1; + for(int pos=0;pos= target) + return doc; + } + return docID = NO_MORE_DOCS; + } + } + + private static class PulsingDocsAndPositionsEnum extends DocsAndPositionsEnum { + private final ByteArrayDataInput postings = new ByteArrayDataInput(null); + private final boolean storePayloads; + + private Bits skipDocs; + private int docID; + private int freq; + private int posPending; + private int position; + private int payloadLength; + private BytesRef payload; + + private boolean payloadRetrieved; + + public PulsingDocsAndPositionsEnum(FieldInfo fieldInfo) { + storePayloads = fieldInfo.storePayloads; + } + + boolean canReuse(FieldInfo fieldInfo) { + return storePayloads == fieldInfo.storePayloads; + } + + public PulsingDocsAndPositionsEnum reset(Bits skipDocs, PulsingTermState termState) { + assert termState.postingsSize != -1; + final byte[] bytes = new byte[termState.postingsSize]; + System.arraycopy(termState.postings, 0, bytes, 0, termState.postingsSize); + postings.reset(bytes); + this.skipDocs = skipDocs; + payloadLength = 0; + docID = 0; + //System.out.println("PR d&p reset storesPayloads=" + storePayloads + " bytes=" + bytes.length + " this=" + this); + return this; + } + + @Override + public int nextDoc() throws IOException { + //System.out.println("PR d&p nextDoc this=" + this); + + while(true) { + //System.out.println(" cycle skip posPending=" + posPending); + + skipPositions(); + + if (postings.eof()) { + //System.out.println("PR END"); + return docID = NO_MORE_DOCS; + } + + final int code = postings.readVInt(); + docID += code >>> 1; // shift off low bit + if ((code & 1) != 0) { // if low bit is set + freq = 1; // freq is one + } else { + freq = postings.readVInt(); // else read freq + } + posPending = freq; + + if (skipDocs == null || !skipDocs.get(docID)) { + //System.out.println(" return docID=" + docID + " freq=" + freq); + position = 0; + return docID; + } + } + } + + @Override + public int freq() { + return freq; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int advance(int target) throws IOException { + int doc; + while((doc=nextDoc()) != NO_MORE_DOCS) { + if (doc >= target) { + return doc; + } + } + return docID = NO_MORE_DOCS; + } + + @Override + public int nextPosition() throws IOException { + //System.out.println("PR d&p nextPosition posPending=" + posPending + " vs freq=" + freq); + + assert posPending > 0; + posPending--; + + if (storePayloads) { + if (!payloadRetrieved) { + //System.out.println("PR skip payload=" + payloadLength); + postings.skipBytes(payloadLength); + } 
+ final int code = postings.readVInt(); + //System.out.println("PR code=" + code); + if ((code & 1) != 0) { + payloadLength = postings.readVInt(); + //System.out.println("PR new payload len=" + payloadLength); + } + position += code >> 1; + payloadRetrieved = false; + } else { + position += postings.readVInt(); + } + + //System.out.println("PR d&p nextPos return pos=" + position + " this=" + this); + return position; + } + + private void skipPositions() throws IOException { + while(posPending != 0) { + nextPosition(); + } + if (storePayloads && !payloadRetrieved) { + //System.out.println(" skip payload len=" + payloadLength); + postings.skipBytes(payloadLength); + payloadRetrieved = true; + } + } + + @Override + public boolean hasPayload() { + return storePayloads && !payloadRetrieved && payloadLength > 0; + } + + @Override + public BytesRef getPayload() throws IOException { + //System.out.println("PR getPayload payloadLength=" + payloadLength + " this=" + this); + if (payloadRetrieved) { + throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once."); + } + payloadRetrieved = true; + if (payloadLength > 0) { + if (payload == null) { + payload = new BytesRef(payloadLength); + } else { + payload.grow(payloadLength); + } + postings.readBytes(payload.bytes, 0, payloadLength); + payload.length = payloadLength; + return payload; + } else { + return null; + } + } + } + + @Override + public void close() throws IOException { + wrappedPostingsReader.close(); + } +} Index: lucene/src/java/org/apache/lucene/index/codecs/pulsingtree/PulsingTreePostingsWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/index/codecs/pulsingtree/PulsingTreePostingsWriter.java Thu Apr 14 18:48:02 2011 -0400 @@ -0,0 +1,366 @@ +package org.apache.lucene.index.codecs.pulsingtree; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.codecs.BlockTreePostingsWriterBase; +import org.apache.lucene.index.codecs.BlockTreeTermsWriter; +import org.apache.lucene.index.codecs.TermStats; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMOutputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; + +// TODO: we now inline based on total TF of the term, +// but it might be better to inline by "net bytes used" +// so that a term that has only 1 posting but a huge +// payload would not be inlined. Though this is +// presumably rare in practice... 
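For reference, the rule the TODO above would replace is the one PulsingTreePostingsReader.nextTerm already applies per term: a term is pulsed (inlined into the terms dict) when its total occurrence count is at or below the configured cutoff. A minimal sketch of that rule, using a hypothetical helper name that is not part of this patch:

    // Pulse a term when its total number of occurrences (or its docFreq, when
    // freqs/positions are omitted) does not exceed the cutoff:
    static boolean shouldInline(long totalTermFreq, int docFreq, boolean omitTF, int maxPositions) {
      final long count = omitTF ? docFreq : totalTermFreq;
      return count <= maxPositions;
    }

The "net bytes used" variant would instead estimate the encoded size of the buffered postings (including payloads) and compare that against a byte budget, so that a term with a single posting but a huge payload would no longer be inlined.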
+ +/** @lucene.experimental */ +public final class PulsingTreePostingsWriter extends BlockTreePostingsWriterBase { + + final static String CODEC = "PulsedPostingsTree"; + + // To add a new version, increment from the last one, and + // change VERSION_CURRENT to point to your new version: + final static int VERSION_START = 0; + + final static int VERSION_CURRENT = VERSION_START; + + private IndexOutput termsOut; + + private boolean omitTF; + private boolean storePayloads; + + private static class PendingTerm { + private final byte[] bytes; + public PendingTerm(byte[] bytes) { + this.bytes = bytes; + } + } + + private final List pendingTerms = new ArrayList(); + + // one entry per position + private final Position[] pending; + private int pendingCount = 0; // -1 once we've hit too many positions + private Position currentDoc; // first Position entry of current doc + + private static final class Position { + BytesRef payload; + int termFreq; // only incremented on first position for a given doc + int pos; + int docID; + } + + // TODO: -- lazy init this? ie, if every single term + // was inlined (eg for a "primary key" field) then we + // never need to use this fallback? Fallback writer for + // non-inlined terms: + final BlockTreePostingsWriterBase wrappedPostingsWriter; + + /** If the total number of positions (summed across all docs + * for this term) is <= maxPositions, then the postings are + * inlined into terms dict */ + public PulsingTreePostingsWriter(int maxPositions, BlockTreePostingsWriterBase wrappedPostingsWriter) throws IOException { + super(); + + pending = new Position[maxPositions]; + for(int i=0;i= the cutoff: + this.wrappedPostingsWriter = wrappedPostingsWriter; + } + + @Override + public void start(IndexOutput termsOut) throws IOException { + this.termsOut = termsOut; + CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT); + termsOut.writeVInt(pending.length); // encode maxPositions in header + wrappedPostingsWriter.start(termsOut); + } + + @Override + public void startTerm() { + //System.out.println("PW startTerm"); + assert pendingCount == 0; + } + + // TODO: -- should we NOT reuse across fields? 
would + // be cleaner + + // Currently, this instance is re-used across fields, so + // our parent calls setField whenever the field changes + @Override + public void setField(FieldInfo fieldInfo) { + omitTF = fieldInfo.omitTermFreqAndPositions; + //System.out.println("PW field=" + fieldInfo.name + " omitTF=" + omitTF); + storePayloads = fieldInfo.storePayloads; + wrappedPostingsWriter.setField(fieldInfo); + if (BlockTreeTermsWriter.DEBUG && fieldInfo.name.equals("id")) { + DEBUG = true; + } else { + DEBUG = false; + } + } + + private boolean DEBUG; + + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + assert docID >= 0: "got docID=" + docID; + + /* + if (termID != -1) { + if (docID == 0) { + baseDocID = termID; + } else if (baseDocID + docID != termID) { + throw new RuntimeException("WRITE: baseDocID=" + baseDocID + " docID=" + docID + " termID=" + termID); + } + } + */ + + //System.out.println("PW doc=" + docID); + if (DEBUG) { + System.out.println("PW docID=" + docID); + } + + if (pendingCount == pending.length) { + push(); + //System.out.println("PW: wrapped.finishDoc"); + wrappedPostingsWriter.finishDoc(); + } + + if (pendingCount != -1) { + assert pendingCount < pending.length; + currentDoc = pending[pendingCount]; + currentDoc.docID = docID; + if (omitTF) { + pendingCount++; + } else { + currentDoc.termFreq = termDocFreq; + } + } else { + // We've already seen too many docs for this term -- + // just forward to our fallback writer + wrappedPostingsWriter.startDoc(docID, termDocFreq); + } + } + + @Override + public void addPosition(int position, BytesRef payload) throws IOException { + + //System.out.println("PW pos=" + position + " payload=" + (payload == null ? "null" : payload.length + " bytes")); + if (pendingCount == pending.length) { + push(); + } + + if (pendingCount == -1) { + // We've already seen too many docs for this term -- + // just forward to our fallback writer + wrappedPostingsWriter.addPosition(position, payload); + } else { + // buffer up + final Position pos = pending[pendingCount++]; + pos.pos = position; + pos.docID = currentDoc.docID; + if (payload != null && payload.length > 0) { + if (pos.payload == null) { + pos.payload = new BytesRef(payload); + } else { + pos.payload.copy(payload); + } + } else if (pos.payload != null) { + pos.payload.length = 0; + } + } + } + + @Override + public void finishDoc() throws IOException { + //System.out.println("PW finishDoc"); + if (pendingCount == -1) { + wrappedPostingsWriter.finishDoc(); + } + } + + private final RAMOutputStream buffer = new RAMOutputStream(); + + private int baseDocID; + + /** Called when we are done adding docs to this term */ + @Override + public void finishTerm(TermStats stats) throws IOException { + //System.out.println("PW finishTerm docCount=" + stats.docFreq); + + assert pendingCount > 0 || pendingCount == -1; + + if (pendingCount == -1) { + wrappedPostingsWriter.finishTerm(stats); + // Must add null entry to record terms that our + // wrapped postings impl added + pendingTerms.add(null); + } else { + + // There were few enough total occurrences for this + // term, so we fully inline our postings data into + // terms dict, now: + + // TODO: it'd be better to share this encoding logic + // in some inner codec that knows how to write a + // single doc / single position, etc. 
This way if a + // given codec wants to store other interesting + // stuff, it could use this pulsing codec to do so + + if (!omitTF) { + int lastDocID = 0; + int pendingIDX = 0; + while(pendingIDX < pendingCount) { + final Position doc = pending[pendingIDX]; + + final int delta = doc.docID - lastDocID; + lastDocID = doc.docID; + + //System.out.println(" write doc=" + doc.docID + " freq=" + doc.termFreq); + + if (doc.termFreq == 1) { + buffer.writeVInt((delta<<1)|1); + } else { + buffer.writeVInt(delta<<1); + buffer.writeVInt(doc.termFreq); + } + + int lastPos = 0; + int lastPayloadLength = -1; + for(int posIDX=0;posIDX files) throws IOException { + StandardTreePostingsReader.files(dir, segmentInfo, id, files); + BlockTreeTermsReader.files(dir, segmentInfo, id, files); + } + + @Override + public void getExtensions(Set extensions) { + getStandardExtensions(extensions); + } + + public static void getStandardExtensions(Set extensions) { + extensions.add(FREQ_EXTENSION); + extensions.add(PROX_EXTENSION); + BlockTreeTermsReader.getExtensions(extensions); + } +} Index: lucene/src/java/org/apache/lucene/index/codecs/standardtree/StandardTreePostingsReader.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/index/codecs/standardtree/StandardTreePostingsReader.java Thu Apr 14 18:48:02 2011 -0400 @@ -0,0 +1,877 @@ +package org.apache.lucene.index.codecs.standardtree; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.codecs.BlockTreeTermState; +import org.apache.lucene.index.codecs.BlockTreePostingsReaderBase; +import org.apache.lucene.index.codecs.standard.DefaultSkipListReader; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.index.codecs.BlockTreeTermsWriter; + +/** Concrete class that reads the current doc/freq/skip + * postings format. 
+ * @lucene.experimental */ + +public class StandardTreePostingsReader extends BlockTreePostingsReaderBase { + + private final IndexInput freqIn; + private final IndexInput proxIn; + public static boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + int skipInterval; + int maxSkipLevels; + int skipMinimum; + + //private String segment; + + public StandardTreePostingsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize, String codecId) throws IOException { + freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecId, StandardTreeCodec.FREQ_EXTENSION), + readBufferSize); + //this.segment = segmentInfo.name; + if (segmentInfo.getHasProx()) { + boolean success = false; + try { + proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecId, StandardTreeCodec.PROX_EXTENSION), + readBufferSize); + success = true; + } finally { + if (!success) { + freqIn.close(); + } + } + } else { + proxIn = null; + } + } + + public static void files(Directory dir, SegmentInfo segmentInfo, String id, Collection files) throws IOException { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, id, StandardTreeCodec.FREQ_EXTENSION)); + if (segmentInfo.getHasProx()) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, id, StandardTreeCodec.PROX_EXTENSION)); + } + } + + @Override + public void init(IndexInput termsIn) throws IOException { + + // Make sure we are talking to the matching past writer + CodecUtil.checkHeader(termsIn, StandardTreePostingsWriter.CODEC, + StandardTreePostingsWriter.VERSION_START, StandardTreePostingsWriter.VERSION_START); + + skipInterval = termsIn.readInt(); + maxSkipLevels = termsIn.readInt(); + skipMinimum = termsIn.readInt(); + } + + // Must keep final because we do non-standard clone + private final static class StandardTermState extends BlockTreeTermState { + long freqOffset; + long proxOffset; + int skipOffset; + + // Only used by the "primary" TermState -- clones don't + // copy this (basically they are "transient"): + ByteArrayDataInput bytesReader; + byte[] bytes; + + @Override + public Object clone() { + StandardTermState other = new StandardTermState(); + other.copyFrom(this); + return other; + } + + @Override + public void copyFrom(TermState _other) { + super.copyFrom(_other); + StandardTermState other = (StandardTermState) _other; + freqOffset = other.freqOffset; + proxOffset = other.proxOffset; + skipOffset = other.skipOffset; + + // Do not copy bytes, bytesReader (else TermState is + // very heavy, ie drags around the entire block's + // byte[]). On seek back, if next() is in fact used + // (rare!), they will be re-read from disk. 
+ } + + @Override + public String toString() { + return super.toString() + " freqFP=" + freqOffset + " proxFP=" + proxOffset + " skipOffset=" + skipOffset; + } + } + + @Override + public BlockTreeTermState newTermState() { + return new StandardTermState(); + } + + @Override + public void close() throws IOException { + try { + if (freqIn != null) { + freqIn.close(); + } + } finally { + if (proxIn != null) { + proxIn.close(); + } + } + } + + /* Reads but does not decode the byte[] blob holding + metadata for the current terms block */ + @Override + public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTreeTermState _termState) throws IOException { + final StandardTermState termState = (StandardTermState) _termState; + + final int len = termsIn.readVInt(); + + // nocommit: we should be able to do this?: + //if (len == 0) { + //return; + //} + + //if (DEBUG) System.out.println(" SPR.readTermsBlock bytes=" + len + " ts=" + _termState); + if (termState.bytes == null) { + termState.bytes = new byte[ArrayUtil.oversize(len, 1)]; + termState.bytesReader = new ByteArrayDataInput(null); + } else if (termState.bytes.length < len) { + termState.bytes = new byte[ArrayUtil.oversize(len, 1)]; + } + + termsIn.readBytes(termState.bytes, 0, len); + termState.bytesReader.reset(termState.bytes, 0, len); + } + + @Override + public void resetTermsBlock(FieldInfo fieldInfo, BlockTreeTermState _termState) throws IOException { + if (DEBUG) System.out.println(" SPR.resetTermsBlock ts=" + _termState); + final StandardTermState termState = (StandardTermState) _termState; + assert termState.bytes != null; + termState.bytesReader.rewind(); + } + + @Override + public void nextTerm(FieldInfo fieldInfo, BlockTreeTermState _termState) + throws IOException { + final StandardTermState termState = (StandardTermState) _termState; + //System.out.println("StandardR.nextTerm seg=" + segment); + final boolean isFirstTerm = termState.termBlockOrd == 0; + + if (isFirstTerm) { + termState.freqOffset = termState.bytesReader.readVLong(); + } else { + termState.freqOffset += termState.bytesReader.readVLong(); + } + //System.out.println(" dF=" + termState.docFreq); + //System.out.println(" freqFP=" + termState.freqOffset); + assert termState.freqOffset < freqIn.length(); + + if (termState.docFreq >= skipMinimum) { + termState.skipOffset = termState.bytesReader.readVInt(); + //System.out.println(" skipOffset=" + termState.skipOffset + " vs freqIn.length=" + freqIn.length()); + assert termState.freqOffset + termState.skipOffset < freqIn.length(); + } else { + // undefined + } + + if (!fieldInfo.omitTermFreqAndPositions) { + if (isFirstTerm) { + termState.proxOffset = termState.bytesReader.readVLong(); + } else { + termState.proxOffset += termState.bytesReader.readVLong(); + } + //System.out.println(" proxFP=" + termState.proxOffset); + } + } + + @Override + public DocsEnum docs(FieldInfo fieldInfo, BlockTreeTermState termState, Bits skipDocs, DocsEnum reuse) throws IOException { + SegmentDocsEnum docsEnum; + if (reuse == null || !(reuse instanceof SegmentDocsEnum)) { + docsEnum = new SegmentDocsEnum(freqIn); + } else { + docsEnum = (SegmentDocsEnum) reuse; + if (docsEnum.startFreqIn != freqIn) { + // If you are using ParellelReader, and pass in a + // reused DocsEnum, it could have come from another + // reader also using standard codec + docsEnum = new SegmentDocsEnum(freqIn); + } + } + return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs); + } + + @Override + public DocsAndPositionsEnum 
docsAndPositions(FieldInfo fieldInfo, BlockTreeTermState termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { + if (fieldInfo.omitTermFreqAndPositions) { + return null; + } + + // TODO: refactor + if (fieldInfo.storePayloads) { + SegmentDocsAndPositionsAndPayloadsEnum docsEnum; + if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsAndPayloadsEnum)) { + docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn); + } else { + docsEnum = (SegmentDocsAndPositionsAndPayloadsEnum) reuse; + if (docsEnum.startFreqIn != freqIn) { + // If you are using ParellelReader, and pass in a + // reused DocsEnum, it could have come from another + // reader also using standard codec + docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn); + } + } + return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs); + } else { + SegmentDocsAndPositionsEnum docsEnum; + if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsEnum)) { + docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn); + } else { + docsEnum = (SegmentDocsAndPositionsEnum) reuse; + if (docsEnum.startFreqIn != freqIn) { + // If you are using ParellelReader, and pass in a + // reused DocsEnum, it could have come from another + // reader also using standard codec + docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn); + } + } + return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs); + } + } + + // Decodes only docs + private class SegmentDocsEnum extends DocsEnum { + final IndexInput freqIn; + final IndexInput startFreqIn; + + boolean omitTF; // does current field omit term freq? + boolean storePayloads; // does current field store payloads? + + int limit; // number of docs in this posting + int ord; // how many docs we've read + int doc; // doc we last read + int freq; // freq we last read + + Bits skipDocs; + + long freqOffset; + int skipOffset; + + boolean skipped; + DefaultSkipListReader skipper; + + public SegmentDocsEnum(IndexInput freqIn) throws IOException { + startFreqIn = freqIn; + this.freqIn = (IndexInput) freqIn.clone(); + } + + public SegmentDocsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException { + omitTF = fieldInfo.omitTermFreqAndPositions; + if (omitTF) { + freq = 1; + } + storePayloads = fieldInfo.storePayloads; + this.skipDocs = skipDocs; + freqOffset = termState.freqOffset; + skipOffset = termState.skipOffset; + + // TODO: for full enum case (eg segment merging) this + // seek is unnecessary; maybe we can avoid in such + // cases + freqIn.seek(termState.freqOffset); + limit = termState.docFreq; + assert limit > 0; + ord = 0; + doc = 0; + //System.out.println(" sde limit=" + limit + " freqFP=" + freqOffset); + + skipped = false; + + return this; + } + + @Override + public int nextDoc() throws IOException { + while(true) { + if (ord == limit) { + return doc = NO_MORE_DOCS; + } + + ord++; + + // Decode next doc/freq pair + final int code = freqIn.readVInt(); + if (omitTF) { + doc += code; + } else { + doc += code >>> 1; // shift off low bit + if ((code & 1) != 0) { // if low bit is set + freq = 1; // freq is one + } else { + freq = freqIn.readVInt(); // else read freq + } + } + + if (skipDocs == null || !skipDocs.get(doc)) { + break; + } + } + + return doc; + } + + @Override + public int read() throws IOException { + + final int[] docs = bulkResult.docs.ints; + final int[] freqs = bulkResult.freqs.ints; + int i = 0; + final int length = docs.length; + while (i < length && ord < 
limit) { + ord++; + // manually inlined call to next() for speed + final int code = freqIn.readVInt(); + if (omitTF) { + doc += code; + } else { + doc += code >>> 1; // shift off low bit + if ((code & 1) != 0) { // if low bit is set + freq = 1; // freq is one + } else { + freq = freqIn.readVInt(); // else read freq + } + } + + if (skipDocs == null || !skipDocs.get(doc)) { + docs[i] = doc; + freqs[i] = freq; + ++i; + } + } + + return i; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int freq() { + return freq; + } + + @Override + public int advance(int target) throws IOException { + + if ((target - skipInterval) >= doc && limit >= skipMinimum) { + + // There are enough docs in the posting to have + // skip data, and it isn't too close. + + if (skipper == null) { + // This is the first time this enum has ever been used for skipping -- do lazy init + skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval); + } + + if (!skipped) { + + // This is the first time this posting has + // skipped since reset() was called, so now we + // load the skip data for this posting + + skipper.init(freqOffset + skipOffset, + freqOffset, 0, + limit, storePayloads); + + skipped = true; + } + + final int newOrd = skipper.skipTo(target); + + if (newOrd > ord) { + // Skipper moved + + ord = newOrd; + doc = skipper.getDoc(); + freqIn.seek(skipper.getFreqPointer()); + } + } + + // scan for the rest: + do { + nextDoc(); + } while (target > doc); + + return doc; + } + } + + // Decodes docs & positions. payloads are not present. + private class SegmentDocsAndPositionsEnum extends DocsAndPositionsEnum { + final IndexInput startFreqIn; + private final IndexInput freqIn; + private final IndexInput proxIn; + + int limit; // number of docs in this posting + int ord; // how many docs we've read + int doc; // doc we last read + int freq; // freq we last read + int position; + + Bits skipDocs; + + long freqOffset; + int skipOffset; + long proxOffset; + + int posPendingCount; + + boolean skipped; + DefaultSkipListReader skipper; + private long lazyProxPointer; + + public SegmentDocsAndPositionsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException { + startFreqIn = freqIn; + this.freqIn = (IndexInput) freqIn.clone(); + this.proxIn = (IndexInput) proxIn.clone(); + } + + public SegmentDocsAndPositionsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException { + assert !fieldInfo.omitTermFreqAndPositions; + assert !fieldInfo.storePayloads; + + this.skipDocs = skipDocs; + + // TODO: for full enum case (eg segment merging) this + // seek is unnecessary; maybe we can avoid in such + // cases + freqIn.seek(termState.freqOffset); + lazyProxPointer = termState.proxOffset; + + limit = termState.docFreq; + assert limit > 0; + + ord = 0; + doc = 0; + position = 0; + + skipped = false; + posPendingCount = 0; + + freqOffset = termState.freqOffset; + proxOffset = termState.proxOffset; + skipOffset = termState.skipOffset; + //System.out.println("StandardR.D&PE reset seg=" + segment + " limit=" + limit + " freqFP=" + freqOffset + " proxFP=" + proxOffset); + + return this; + } + + @Override + public int nextDoc() throws IOException { + while(true) { + if (ord == limit) { + //System.out.println("StandardR.D&PE seg=" + segment + " nextDoc return doc=END"); + return doc = NO_MORE_DOCS; + } + + ord++; + + // Decode next doc/freq pair + final int code = freqIn.readVInt(); + + doc += code >>> 1; // shift off low bit + if ((code & 1) != 
0) { // if low bit is set + freq = 1; // freq is one + } else { + freq = freqIn.readVInt(); // else read freq + } + posPendingCount += freq; + + if (skipDocs == null || !skipDocs.get(doc)) { + break; + } + } + + position = 0; + + //System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc); + return doc; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int freq() { + return freq; + } + + @Override + public int advance(int target) throws IOException { + + //System.out.println("StandardR.D&PE advance target=" + target); + + if ((target - skipInterval) >= doc && limit >= skipMinimum) { + + // There are enough docs in the posting to have + // skip data, and it isn't too close + + if (skipper == null) { + // This is the first time this enum has ever been used for skipping -- do lazy init + skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval); + } + + if (!skipped) { + + // This is the first time this posting has + // skipped, since reset() was called, so now we + // load the skip data for this posting + + skipper.init(freqOffset+skipOffset, + freqOffset, proxOffset, + limit, false); + + skipped = true; + } + + final int newOrd = skipper.skipTo(target); + + if (newOrd > ord) { + // Skipper moved + ord = newOrd; + doc = skipper.getDoc(); + freqIn.seek(skipper.getFreqPointer()); + lazyProxPointer = skipper.getProxPointer(); + posPendingCount = 0; + position = 0; + } + } + + // Now, linear scan for the rest: + do { + nextDoc(); + } while (target > doc); + + return doc; + } + + @Override + public int nextPosition() throws IOException { + + if (lazyProxPointer != -1) { + proxIn.seek(lazyProxPointer); + lazyProxPointer = -1; + } + + // scan over any docs that were iterated without their positions + if (posPendingCount > freq) { + position = 0; + while(posPendingCount != freq) { + if ((proxIn.readByte() & 0x80) == 0) { + posPendingCount--; + } + } + } + + position += proxIn.readVInt(); + + posPendingCount--; + + assert posPendingCount >= 0: "nextPosition() was called too many times (more than freq() times) posPendingCount=" + posPendingCount; + + return position; + } + + /** Returns the payload at this position, or null if no + * payload was indexed. 
*/ + @Override + public BytesRef getPayload() throws IOException { + throw new IOException("No payloads exist for this field!"); + } + + @Override + public boolean hasPayload() { + return false; + } + } + + // Decodes docs & positions & payloads + private class SegmentDocsAndPositionsAndPayloadsEnum extends DocsAndPositionsEnum { + final IndexInput startFreqIn; + private final IndexInput freqIn; + private final IndexInput proxIn; + + int limit; // number of docs in this posting + int ord; // how many docs we've read + int doc; // doc we last read + int freq; // freq we last read + int position; + + Bits skipDocs; + + long freqOffset; + int skipOffset; + long proxOffset; + + int posPendingCount; + int payloadLength; + boolean payloadPending; + + boolean skipped; + DefaultSkipListReader skipper; + private BytesRef payload; + private long lazyProxPointer; + + public SegmentDocsAndPositionsAndPayloadsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException { + startFreqIn = freqIn; + this.freqIn = (IndexInput) freqIn.clone(); + this.proxIn = (IndexInput) proxIn.clone(); + } + + public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException { + assert !fieldInfo.omitTermFreqAndPositions; + assert fieldInfo.storePayloads; + if (payload == null) { + payload = new BytesRef(); + payload.bytes = new byte[1]; + } + + this.skipDocs = skipDocs; + + // TODO: for full enum case (eg segment merging) this + // seek is unnecessary; maybe we can avoid in such + // cases + freqIn.seek(termState.freqOffset); + lazyProxPointer = termState.proxOffset; + + limit = termState.docFreq; + ord = 0; + doc = 0; + position = 0; + + skipped = false; + posPendingCount = 0; + payloadPending = false; + + freqOffset = termState.freqOffset; + proxOffset = termState.proxOffset; + skipOffset = termState.skipOffset; + //System.out.println("StandardR.D&PE reset seg=" + segment + " limit=" + limit + " freqFP=" + freqOffset + " proxFP=" + proxOffset + " this=" + this); + + return this; + } + + @Override + public int nextDoc() throws IOException { + while(true) { + if (ord == limit) { + //System.out.println("StandardR.D&PE seg=" + segment + " nextDoc return doc=END"); + return doc = NO_MORE_DOCS; + } + + ord++; + + // Decode next doc/freq pair + final int code = freqIn.readVInt(); + + doc += code >>> 1; // shift off low bit + if ((code & 1) != 0) { // if low bit is set + freq = 1; // freq is one + } else { + freq = freqIn.readVInt(); // else read freq + } + posPendingCount += freq; + + if (skipDocs == null || !skipDocs.get(doc)) { + break; + } + } + + position = 0; + + //System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc); + return doc; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int freq() { + return freq; + } + + @Override + public int advance(int target) throws IOException { + + //System.out.println("StandardR.D&PE advance seg=" + segment + " target=" + target + " this=" + this); + + if ((target - skipInterval) >= doc && limit >= skipMinimum) { + + // There are enough docs in the posting to have + // skip data, and it isn't too close + + if (skipper == null) { + // This is the first time this enum has ever been used for skipping -- do lazy init + skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval); + } + + if (!skipped) { + + // This is the first time this posting has + // skipped, since reset() was called, so now we + // load the skip data for 
this posting + //System.out.println(" init skipper freqOffset=" + freqOffset + " skipOffset=" + skipOffset + " vs len=" + freqIn.length()); + skipper.init(freqOffset+skipOffset, + freqOffset, proxOffset, + limit, true); + + skipped = true; + } + + final int newOrd = skipper.skipTo(target); + + if (newOrd > ord) { + // Skipper moved + ord = newOrd; + doc = skipper.getDoc(); + freqIn.seek(skipper.getFreqPointer()); + lazyProxPointer = skipper.getProxPointer(); + posPendingCount = 0; + position = 0; + payloadPending = false; + payloadLength = skipper.getPayloadLength(); + } + } + + // Now, linear scan for the rest: + do { + nextDoc(); + } while (target > doc); + + return doc; + } + + @Override + public int nextPosition() throws IOException { + + if (lazyProxPointer != -1) { + proxIn.seek(lazyProxPointer); + lazyProxPointer = -1; + } + + if (payloadPending && payloadLength > 0) { + // payload of last position as never retrieved -- skip it + proxIn.seek(proxIn.getFilePointer() + payloadLength); + payloadPending = false; + } + + // scan over any docs that were iterated without their positions + while(posPendingCount > freq) { + + final int code = proxIn.readVInt(); + + if ((code & 1) != 0) { + // new payload length + payloadLength = proxIn.readVInt(); + assert payloadLength >= 0; + } + + assert payloadLength != -1; + proxIn.seek(proxIn.getFilePointer() + payloadLength); + + posPendingCount--; + position = 0; + payloadPending = false; + //System.out.println("StandardR.D&PE skipPos"); + } + + // read next position + if (payloadPending && payloadLength > 0) { + // payload wasn't retrieved for last position + proxIn.seek(proxIn.getFilePointer()+payloadLength); + } + + final int code = proxIn.readVInt(); + if ((code & 1) != 0) { + // new payload length + payloadLength = proxIn.readVInt(); + assert payloadLength >= 0; + } + assert payloadLength != -1; + + payloadPending = true; + position += code >>> 1; + + posPendingCount--; + + assert posPendingCount >= 0: "nextPosition() was called too many times (more than freq() times) posPendingCount=" + posPendingCount; + + //System.out.println("StandardR.D&PE nextPos return pos=" + position); + return position; + } + + /** Returns the payload at this position, or null if no + * payload was indexed. */ + @Override + public BytesRef getPayload() throws IOException { + assert lazyProxPointer == -1; + assert posPendingCount < freq; + if (!payloadPending) { + throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once."); + } + if (payloadLength > payload.bytes.length) { + payload.grow(payloadLength); + } + + proxIn.readBytes(payload.bytes, 0, payloadLength); + payload.length = payloadLength; + payloadPending = false; + + return payload; + } + + @Override + public boolean hasPayload() { + return payloadPending && payloadLength > 0; + } + } +} Index: lucene/src/java/org/apache/lucene/index/codecs/standardtree/StandardTreePostingsWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/index/codecs/standardtree/StandardTreePostingsWriter.java Thu Apr 14 18:48:02 2011 -0400 @@ -0,0 +1,314 @@ +package org.apache.lucene.index.codecs.standardtree; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Consumes doc & freq, writing them using the current + * index file format */ + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; + +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.codecs.BlockTreePostingsWriterBase; +import org.apache.lucene.index.codecs.TermStats; +import org.apache.lucene.index.codecs.standard.DefaultSkipListWriter; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMOutputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.index.codecs.BlockTreeTermsWriter; + +/** @lucene.experimental */ +public final class StandardTreePostingsWriter extends BlockTreePostingsWriterBase { + final static String CODEC = "StandardTreePostingsWriterImpl"; + + public static boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + // Increment version to change it: + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + + final IndexOutput freqOut; + final IndexOutput proxOut; + final DefaultSkipListWriter skipListWriter; + /** Expert: The fraction of TermDocs entries stored in skip tables, + * used to accelerate {@link DocsEnum#advance(int)}. Larger values result in + * smaller indexes, greater acceleration, but fewer accelerable cases, while + * smaller values result in bigger indexes, less acceleration and more + * accelerable cases. More detailed experiments would be useful here. */ + final int skipInterval = 16; + + /** + * Expert: minimum docFreq to write any skip data at all + */ + final int skipMinimum = skipInterval; + + /** Expert: The maximum number of skip levels. Smaller values result in + * slightly smaller indexes, but slower skipping in big posting lists. 
+ */ + final int maxSkipLevels = 10; + final int totalNumDocs; + IndexOutput termsOut; + + boolean omitTermFreqAndPositions; + boolean storePayloads; + // Starts a new term + long freqStart; + long proxStart; + FieldInfo fieldInfo; + int lastPayloadLength; + int lastPosition; + + //private String segment; + + public StandardTreePostingsWriter(SegmentWriteState state) throws IOException { + super(); + //this.segment = state.segmentName; + String fileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, StandardTreeCodec.FREQ_EXTENSION); + freqOut = state.directory.createOutput(fileName); + if (state.fieldInfos.hasProx()) { + // At least one field does not omit TF, so create the + // prox file + fileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, StandardTreeCodec.PROX_EXTENSION); + proxOut = state.directory.createOutput(fileName); + } else { + // Every field omits TF so we will write no prox file + proxOut = null; + } + + totalNumDocs = state.numDocs; + + skipListWriter = new DefaultSkipListWriter(skipInterval, + maxSkipLevels, + state.numDocs, + freqOut, + proxOut); + } + + @Override + public void start(IndexOutput termsOut) throws IOException { + this.termsOut = termsOut; + CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT); + termsOut.writeInt(skipInterval); // write skipInterval + termsOut.writeInt(maxSkipLevels); // write maxSkipLevels + termsOut.writeInt(skipMinimum); // write skipMinimum + } + + @Override + public void startTerm() { + freqStart = freqOut.getFilePointer(); + if (proxOut != null) { + proxStart = proxOut.getFilePointer(); + // force first payload to write its length + lastPayloadLength = -1; + } + skipListWriter.resetSkip(); + } + + // Currently, this instance is re-used across fields, so + // our parent calls setField whenever the field changes + @Override + public void setField(FieldInfo fieldInfo) { + //System.out.println("SPW: setField"); + if (BlockTreeTermsWriter.DEBUG && fieldInfo.name.equals("id")) { + DEBUG = true; + } else { + DEBUG = false; + } + this.fieldInfo = fieldInfo; + omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; + storePayloads = fieldInfo.storePayloads; + //System.out.println(" set init blockFreqStart=" + freqStart); + //System.out.println(" set init blockProxStart=" + proxStart); + } + + int lastDocID; + int df; + + /** Adds a new doc in this term. If this returns null + * then we just skip consuming positions/payloads. 
*/ + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + //System.out.println("StandardW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq); + if (DEBUG) System.out.println("SPW.startDoc docID=" + docID); + + final int delta = docID - lastDocID; + + if (docID < 0 || (df > 0 && delta <= 0)) { + throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); + } + + if ((++df % skipInterval) == 0) { + skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength); + skipListWriter.bufferSkip(df); + } + + assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs; + + lastDocID = docID; + if (omitTermFreqAndPositions) { + freqOut.writeVInt(delta); + } else if (1 == termDocFreq) { + freqOut.writeVInt((delta<<1) | 1); + } else { + freqOut.writeVInt(delta<<1); + freqOut.writeVInt(termDocFreq); + } + + lastPosition = 0; + } + + /** Add a new position & payload */ + @Override + public void addPosition(int position, BytesRef payload) throws IOException { + //System.out.println("StandardW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer()); + assert !omitTermFreqAndPositions: "omitTermFreqAndPositions is true"; + assert proxOut != null; + + final int delta = position - lastPosition; + + assert delta > 0 || position == 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it) + + lastPosition = position; + + if (storePayloads) { + final int payloadLength = payload == null ? 0 : payload.length; + + if (payloadLength != lastPayloadLength) { + lastPayloadLength = payloadLength; + proxOut.writeVInt((delta<<1)|1); + proxOut.writeVInt(payloadLength); + } else { + proxOut.writeVInt(delta << 1); + } + + if (payloadLength > 0) { + proxOut.writeBytes(payload.bytes, payload.offset, payloadLength); + } + } else { + proxOut.writeVInt(delta); + } + } + + @Override + public void finishDoc() { + } + + private static class PendingTerm { + public final long freqStart; + public final long proxStart; + public final int skipOffset; + + public PendingTerm(long freqStart, long proxStart, int skipOffset) { + this.freqStart = freqStart; + this.proxStart = proxStart; + this.skipOffset = skipOffset; + } + } + + private final List pendingTerms = new ArrayList(); + + /** Called when we are done adding docs to this term */ + @Override + public void finishTerm(TermStats stats) throws IOException { + + //System.out.println("StandardW.finishTerm seg=" + segment); + assert stats.docFreq > 0; + + // TODO: wasteful we are counting this (counting # docs + // for this term) in two places? 
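    // (stats.docFreq comes from the terms consumer; df is the count this writer
    //  increments in startDoc -- the assert below only cross-checks that the two
    //  bookkeeping paths agree.)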
+ assert stats.docFreq == df; + + final int skipOffset; + if (df >= skipMinimum) { + skipOffset = (int) (skipListWriter.writeSkip(freqOut)-freqStart); + } else { + skipOffset = -1; + } + + pendingTerms.add(new PendingTerm(freqStart, proxStart, skipOffset)); + + lastDocID = 0; + df = 0; + } + + private final RAMOutputStream bytesWriter = new RAMOutputStream(); + + @Override + public void flushTermsBlock(int lastN) throws IOException { + if (DEBUG) System.out.println("SPW.flushTermsBlock lastN=" + lastN + " left=" + (pendingTerms.size()-lastN)); + + if (lastN == 0) { + termsOut.writeByte((byte) 0); + termsOut.writeByte((byte) 0); + return; + } + + final int limit = pendingTerms.size(); + final PendingTerm firstTerm = pendingTerms.get(limit - lastN); + // First term in block is abs coded: + bytesWriter.writeVLong(firstTerm.freqStart); + + if (firstTerm.skipOffset != -1) { + assert firstTerm.skipOffset > 0; + bytesWriter.writeVInt(firstTerm.skipOffset); + } + if (!omitTermFreqAndPositions) { + bytesWriter.writeVLong(firstTerm.proxStart); + } + long lastFreqStart = firstTerm.freqStart; + long lastProxStart = firstTerm.proxStart; + for(int idx=limit-lastN+1; idx 0; + bytesWriter.writeVInt(term.skipOffset); + } + if (!omitTermFreqAndPositions) { + bytesWriter.writeVLong(term.proxStart - lastProxStart); + lastProxStart = term.proxStart; + } + } + + termsOut.writeVInt((int) bytesWriter.getFilePointer()); + bytesWriter.writeTo(termsOut); + bytesWriter.reset(); + + // Remove the terms we just wrote: + pendingTerms.subList(limit-lastN, limit).clear(); + } + + @Override + public void close() throws IOException { + try { + freqOut.close(); + } finally { + if (proxOut != null) { + proxOut.close(); + } + } + } +} Index: lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java --- lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java Thu Apr 14 18:48:02 2011 -0400 @@ -135,7 +135,7 @@ * @throws UnsupportedOperationException */ @Override - public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + public SeekStatus seek(BytesRef term, boolean useCache, boolean exactOnly) throws IOException { throw new UnsupportedOperationException(getClass().getName()+" does not support seeking"); } @@ -189,7 +189,7 @@ final BytesRef t = nextSeekTerm(actualTerm); // Make sure we always seek forward: assert actualTerm == null || t == null || getComparator().compare(t, actualTerm) > 0: "curTerm=" + actualTerm + " seekTerm=" + t; - if (t == null || tenum.seek(t, false) == SeekStatus.END) { + if (t == null || tenum.seek(t, false, false) == SeekStatus.END) { // no more terms to seek to or enum exhausted return null; } Index: lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java --- lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java Thu Apr 14 18:48:02 2011 -0400 @@ -283,8 +283,9 @@ } @Override - public SeekStatus seek(BytesRef text, boolean useCache) throws IOException { - return actualEnum.seek(text, useCache); + public SeekStatus seek(BytesRef text, boolean useCache, boolean exactOnly) throws IOException { + // nocommit -- ok to just pass through exactOnly? 
+ return actualEnum.seek(text, useCache, exactOnly); } @Override Index: lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java --- lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java Thu Apr 14 18:48:02 2011 -0400 @@ -241,7 +241,7 @@ } @Override - public SeekStatus seek(BytesRef text, boolean useCache) throws IOException { + public SeekStatus seek(BytesRef text, boolean useCache, boolean exactOnly /* ignored */) throws IOException { int low = 1; int high = numOrd-1; Index: lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java --- lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java Thu Apr 14 18:48:02 2011 -0400 @@ -34,6 +34,10 @@ reset(bytes, 0, bytes.length); } + public void rewind() { + pos = 0; + } + public int getPosition() { return pos; } @@ -104,7 +108,7 @@ // NOTE: AIOOBE not EOF if you read too much @Override public void readBytes(byte[] b, int offset, int len) { - assert pos + len <= limit; + assert pos + len <= limit: "pos=" + pos + " len=" + len + " limit=" + limit; System.arraycopy(bytes, pos, b, offset, len); pos += len; } Index: lucene/src/java/org/apache/lucene/store/RAMOutputStream.java --- lucene/src/java/org/apache/lucene/store/RAMOutputStream.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/store/RAMOutputStream.java Thu Apr 14 18:48:02 2011 -0400 @@ -67,6 +67,30 @@ } } + // returns single byte[] copy holding all bytes written so far + public byte[] toByteArray() throws IOException { + flush(); + if (file.length > Integer.MAX_VALUE) { + throw new IllegalStateException("file is too large (" + file.length + " bytes) to fit in single byte[]"); + } + final int size = (int) file.length; + final byte[] bytes = new byte[size]; + + int pos = 0; + int buffer = 0; + while (pos < size) { + int length = BUFFER_SIZE; + int nextPos = pos + length; + if (nextPos > size) { // at the last buffer + length = size - pos; + } + System.arraycopy(file.getBuffer(buffer++), 0, bytes, pos, length); + pos = nextPos; + } + + return bytes; + } + /** Resets this to an empty file. */ public void reset() { currentBuffer = null; Index: lucene/src/java/org/apache/lucene/util/BytesRef.java --- lucene/src/java/org/apache/lucene/util/BytesRef.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/BytesRef.java Thu Apr 14 18:48:02 2011 -0400 @@ -68,6 +68,18 @@ this.bytes = new byte[capacity]; } + /** Incoming IntsRef values must be Byte.MIN_VALUE - + * Byte.MAX_VALUE. */ + public BytesRef(IntsRef intsRef) { + bytes = new byte[intsRef.length]; + for(int idx=0;idx<intsRef.length;idx++) { + final int v = intsRef.ints[intsRef.offset + idx]; + assert v >= Byte.MIN_VALUE && v <= Byte.MAX_VALUE; + bytes[idx] = (byte) v; + } + length = intsRef.length; + } + /** * @param text Initialize the byte[] from the UTF8 bytes * for the provided String.
This must be well-formed Index: lucene/src/java/org/apache/lucene/util/PerReaderTermState.java --- lucene/src/java/org/apache/lucene/util/PerReaderTermState.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/PerReaderTermState.java Thu Apr 14 18:48:02 2011 -0400 @@ -26,6 +26,7 @@ import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.codecs.BlockTreeTermsWriter; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.TermsEnum.SeekStatus; @@ -44,6 +45,8 @@ private final TermState[] states; private int docFreq; + public static boolean DEBUG = BlockTreeTermsWriter.DEBUG; + /** * Creates an empty {@link PerReaderTermState} from a {@link ReaderContext} */ @@ -84,14 +87,17 @@ final BytesRef bytes = term.bytes(); final PerReaderTermState perReaderTermState = new PerReaderTermState(context); final AtomicReaderContext[] leaves = ReaderUtil.leaves(context); + if (DEBUG) System.out.println("prts.build term=" + term); for (int i = 0; i < leaves.length; i++) { + if (DEBUG) System.out.println(" r=" + leaves[i].reader); final Fields fields = leaves[i].reader.fields(); if (fields != null) { final Terms terms = fields.terms(field); if (terms != null) { final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share! - if (SeekStatus.FOUND == termsEnum.seek(bytes, cache)) { + if (SeekStatus.FOUND == termsEnum.seek(bytes, cache, true)) { final TermState termState = termsEnum.termState(); + if (DEBUG) System.out.println(" found"); perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq()); } } Index: lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java --- lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java Thu Apr 14 18:48:02 2011 -0400 @@ -22,7 +22,12 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; +// nocommit +import org.apache.lucene.index.codecs.BlockTreeTermsWriter; + import java.io.IOException; +import java.util.List; +import java.util.ArrayList; /** * Builds a compact FST (maps an IntsRef term to an arbitrary @@ -52,6 +57,8 @@ private final FST fst; private final T NO_OUTPUT; + public boolean DEBUG = BlockTreeTermsWriter.DEBUG; + // simplistic pruning: we prune node (and all following // nodes) if less than this number of terms go through it: private final int minSuffixCount1; @@ -69,9 +76,22 @@ // current "frontier" private UnCompiledNode[] frontier; + // Expert: you pass an instance of this if you want to do + // something "custom" as suffixes are "frozen": + public static abstract class FreezeTail { + public abstract void freeze(final UnCompiledNode[] frontier, int prefixLenPlus1, IntsRef prevInput) throws IOException; + } + + private final FreezeTail freezeTail; + public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doMinSuffix, Outputs outputs) { + this(inputType, minSuffixCount1, minSuffixCount2, doMinSuffix, outputs, null); + } + + public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doMinSuffix, Outputs outputs, FreezeTail freezeTail) { this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount2 = minSuffixCount2; + this.freezeTail = freezeTail; fst = new FST(inputType, outputs); if (doMinSuffix) { 
dedupHash = new NodeHash(fst); @@ -120,94 +140,101 @@ return fn; } - private void compilePrevTail(int prefixLenPlus1) throws IOException { - assert prefixLenPlus1 >= 1; - //System.out.println(" compileTail " + prefixLenPlus1); - for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) { - boolean doPrune = false; - boolean doCompile = false; + private void freezeTail(int prefixLenPlus1) throws IOException { + if (freezeTail != null) { + // Custom plugin: + freezeTail.freeze(frontier, prefixLenPlus1, lastInput); + } else { + for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) { + final UnCompiledNode node = frontier[idx]; + final UnCompiledNode parent = idx == 0 ? null : frontier[idx-1]; - final UnCompiledNode node = frontier[idx]; - final UnCompiledNode parent = frontier[idx-1]; + if (parent == null) { + return; + } - if (node.inputCount < minSuffixCount1) { - doPrune = true; - doCompile = true; - } else if (idx > prefixLenPlus1) { - // prune if parent's inputCount is less than suffixMinCount2 - if (parent.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && parent.inputCount == 1) { - // my parent, about to be compiled, doesn't make the cut, so - // I'm definitely pruned + boolean doPrune = false; + boolean doCompile = false; - // if pruneCount2 is 1, we keep only up - // until the 'distinguished edge', ie we keep only the - // 'divergent' part of the FST. if my parent, about to be - // compiled, has inputCount 1 then we are already past the - // distinguished edge. NOTE: this only works if - // the FST outputs are not "compressible" (simple - // ords ARE compressible). + if (node.inputCount < minSuffixCount1) { doPrune = true; + doCompile = true; + } else if (idx > prefixLenPlus1) { + // prune if parent's inputCount is less than suffixMinCount2 + if (parent.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && parent.inputCount == 1 && idx > 1)) { + // my parent, about to be compiled, doesn't make the cut, so + // I'm definitely pruned + + // if pruneCount2 is 1, we keep only up + // until the 'distinguished edge', ie we keep only the + // 'divergent' part of the FST. if my parent, about to be + // compiled, has inputCount 1 then we are already past the + // distinguished edge. NOTE: this only works if + // the FST outputs are not "compressible" (simple + // ords ARE compressible). 
+ doPrune = true; + } else { + // my parent, about to be compiled, does make the cut, so + // I'm definitely not pruned + doPrune = false; + } + doCompile = true; + } else { - // my parent, about to be compiled, does make the cut, so - // I'm definitely not pruned - doPrune = false; + // if pruning is disabled (count is 0) we can always + // compile current node + doCompile = minSuffixCount2 == 0; } - doCompile = true; - } else { - // if pruning is disabled (count is 0) we can always - // compile current node - doCompile = minSuffixCount2 == 0; - } - //System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx=" + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune=" + doPrune); + //System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx=" + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune=" + doPrune); - if (node.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && node.inputCount == 1) { - // drop all arcs - for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) { - final UnCompiledNode<T> target = (UnCompiledNode<T>) node.arcs[arcIdx].target; - target.clear(); + if (node.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && node.inputCount == 1 && idx > 1)) { + // drop all arcs + for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) { + final UnCompiledNode<T> target = (UnCompiledNode<T>) node.arcs[arcIdx].target; + target.clear(); + } + node.numArcs = 0; + } - node.numArcs = 0; - } - if (doPrune) { - // this node doesn't make it -- deref it - node.clear(); - parent.deleteLast(lastInput.ints[lastInput.offset+idx-1], node); - } else { + if (doPrune) { + // this node doesn't make it -- deref it + node.clear(); + parent.deleteLast(lastInput.ints[lastInput.offset+idx-1], node); + } else { - if (minSuffixCount2 != 0) { - compileAllTargets(node); - } - final T nextFinalOutput = node.output; + if (minSuffixCount2 != 0) { + compileAllTargets(node); + } + final T nextFinalOutput = node.output; - // We "fake" the node as being final if it has no - // outgoing arcs; in theory we could leave it - // as non-final (the FST can represent this), but - // FSTEnum, Util, etc., have trouble w/ non-final - // dead-end states: - final boolean isFinal = node.isFinal || node.numArcs == 0; + // We "fake" the node as being final if it has no + // outgoing arcs; in theory we could leave it + // as non-final (the FST can represent this), but + // FSTEnum, Util, etc., have trouble w/ non-final + // dead-end states: + final boolean isFinal = node.isFinal || node.numArcs == 0; - if (doCompile) { - // this node makes it and we now compile it. first, - // compile any targets that were previously - // undecided: - parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], - compileNode(node), - nextFinalOutput, - isFinal); - } else { - // replaceLast just to install - // nextFinalOutput/isFinal onto the arc - parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], - node, - nextFinalOutput, - isFinal); - // this node will stay in play for now, since we are - // undecided on whether to prune it. later, it - // will be either compiled or pruned, so we must - // allocate a new node: - frontier[idx] = new UnCompiledNode(this, idx); + if (doCompile) { + // this node makes it and we now compile it.
first, + // compile any targets that were previously + // undecided: + parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], + compileNode(node), + nextFinalOutput, + isFinal); + } else { + // replaceLast just to install + // nextFinalOutput/isFinal onto the arc + parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], + node, + nextFinalOutput, + isFinal); + // this node will stay in play for now, since we are + // undecided on whether to prune it. later, it + // will be either compiled or pruned, so we must + // allocate a new node: + frontier[idx] = new UnCompiledNode(this, idx); + } } } } @@ -262,7 +289,23 @@ } public void add(IntsRef input, T output) throws IOException { - //System.out.println("\nFST ADD: input=" + input + " output=" + fst.outputs.outputToString(output)); + // nocommit + { + BytesRef b = new BytesRef(input.length); + for(int x=0;x implements Node { + public static final class UnCompiledNode implements Node { final Builder owner; - int numArcs; - Arc[] arcs; - T output; - boolean isFinal; - long inputCount; + public int numArcs; + public Arc[] arcs; + // TODO: instead of recording isFinal/output on the + // node, maybe we should use -1 arc to mean "end" (like + // we do when reading the FST). Would simplify much + // code here... + public T output; + public boolean isFinal; + public long inputCount; /** This node's depth, starting from the automaton root. */ - final int depth; + public final int depth; /** * @param depth Index: lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java --- lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java Thu Apr 14 18:48:02 2011 -0400 @@ -26,7 +26,7 @@ import org.apache.lucene.util.automaton.fst.Builder.UnCompiledNode; // NOTE: while the FST is able to represent a non-final -// dead-end state (NON_FINAL_END_NODE=0), the layres above +// dead-end state (NON_FINAL_END_NODE=0), the layers above // (FSTEnum, Util) have problems with this!! /** Represents an FST using a compact byte[] format. 
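For reference, the Builder/FST API touched above is the one exercised by the new tests further down (testFinalOutputOnEndState, testInternalFinalState), and can be driven as in the following sketch. The plain-text rendering of this patch drops generic type parameters, so the Long parameters and the import paths here are assumptions, not shown verbatim in the diff:

import java.io.StringWriter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.fst.Builder;
import org.apache.lucene.util.automaton.fst.FST;
import org.apache.lucene.util.automaton.fst.PositiveIntOutputs;
import org.apache.lucene.util.automaton.fst.Util;

public class TinyFSTSketch {
  public static void main(String[] args) throws Exception {
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
    // 5-arg constructor as used by the tests below: no pruning, suffix sharing on
    final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
    builder.add(new BytesRef("stat"), outputs.get(17));     // inputs must arrive in sorted order
    builder.add(new BytesRef("station"), outputs.get(10));
    final FST<Long> fst = builder.finish();

    final StringWriter w = new StringWriter();
    Util.toDot(fst, w, false, false);   // dump the FST in GraphViz DOT format
    w.close();
    System.out.println(w.toString());
  }
}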
@@ -107,10 +107,10 @@ public int label; public T output; - int target; + public int target; byte flags; - T nextFinalOutput; + public T nextFinalOutput; int nextArc; // This is non-zero if current arcs are fixed array: @@ -146,7 +146,7 @@ return flag(BIT_LAST_ARC); } - boolean isFinal() { + public boolean isFinal() { return flag(BIT_FINAL_ARC); } }; @@ -235,6 +235,7 @@ throw new IllegalStateException("empty output is already set: " + outputs.outputToString(emptyOutput) + " vs " + outputs.outputToString(v)); } emptyOutput = v; + //System.out.println(" fst.setEmptyOutput=" + v); // TODO: this is messy -- replace with sillyBytesWriter; maybe make // bytes private @@ -255,6 +256,10 @@ writer.posWrite = posSave; } + public T getEmptyOutput() { + return emptyOutput; + } + public void save(DataOutput out) throws IOException { if (startNode == -1) { throw new IllegalStateException("call finish first"); @@ -514,7 +519,7 @@ // non-array: linear scan arc.bytesPerArc = 0; //System.out.println(" scan"); - while(!arc.isLast()) { + while((arc.flags & BIT_LAST_ARC) == 0) { // skip this arc: readLabel(in); if (arc.flag(BIT_ARC_HAS_OUTPUT)) { @@ -565,8 +570,7 @@ } } - // Not private because NodeHash needs access: - Arc readFirstRealArc(int address, Arc arc) throws IOException { + public Arc readFirstRealArc(int address, Arc arc) throws IOException { final BytesReader in = getBytesReader(address); Index: lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java --- lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java Thu Apr 14 18:48:02 2011 -0400 @@ -211,6 +211,7 @@ // Shape for states. final String stateShape = "circle"; + final String finalStateShape = "doublecircle"; // Emit DOT prologue. out.write("digraph FST {\n"); @@ -221,12 +222,33 @@ } emitDotState(out, "initial", "point", "white", ""); - emitDotState(out, Integer.toString(startArc.target), stateShape, - fst.isExpandedTarget(startArc) ? expandedNodeColor : null, - ""); + + final T NO_OUTPUT = fst.outputs.getNoOutput(); + + final FST.Arc scratchArc = new FST.Arc(); + + { + final String stateColor; + if (fst.isExpandedTarget(startArc)) { + stateColor = expandedNodeColor; + } else { + stateColor = null; + } + + final boolean isFinal; + final T finalOutput; + if (startArc.isFinal()) { + isFinal = true; + finalOutput = startArc.nextFinalOutput == NO_OUTPUT ? null : startArc.nextFinalOutput; + } else { + isFinal = false; + finalOutput = null; + } + + emitDotState(out, Integer.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput)); + } out.write(" initial -> " + startArc.target + "\n"); - final T NO_OUTPUT = fst.outputs.getNoOutput(); int level = 0; while (!nextLevelQueue.isEmpty()) { @@ -238,19 +260,47 @@ out.write("\n // Transitions and states at level: " + level + "\n"); while (!thisLevelQueue.isEmpty()) { final FST.Arc arc = thisLevelQueue.remove(thisLevelQueue.size() - 1); - if (fst.targetHasArcs(arc)) { // scan all arcs final int node = arc.target; fst.readFirstTargetArc(arc, arc); - + + if (arc.label == FST.END_LABEL) { + // Skip it -- prior recursion took this into account already + assert !arc.isLast(); + fst.readNextArc(arc); + } + while (true) { + // Emit the unseen state and add it to the queue for the next level. 
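The toDot changes here rely on the usual arc-iteration idiom (readFirstTargetArc, then readNextArc until isLast), which this patch opens up a little by making Arc.target and isFinal() public. A hedged sketch of that idiom follows; getFirstArc and the public visibility of isLast() outside the fst package are assumed, since neither is shown in this hunk:

import java.io.IOException;
import org.apache.lucene.util.automaton.fst.FST;

public class ArcWalkSketch {
  // Prints the label/target of every arc leaving the root node of an FST.
  static <T> void printRootArcs(FST<T> fst) throws IOException {
    final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
    if (!fst.targetHasArcs(arc)) {
      return;   // empty FST: the root has no outgoing arcs
    }
    fst.readFirstTargetArc(arc, arc);
    while (true) {
      System.out.println("label=" + arc.label + " target=" + arc.target);
      if (arc.isLast()) {
        break;
      }
      fst.readNextArc(arc);
    }
  }
}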
if (arc.target >= 0 && !seen.get(arc.target)) { - final boolean isExpanded = fst.isExpandedTarget(arc); - emitDotState(out, Integer.toString(arc.target), stateShape, - isExpanded ? expandedNodeColor : null, - labelStates ? Integer.toString(arc.target) : ""); + /* + boolean isFinal = false; + T finalOutput = null; + fst.readFirstTargetArc(arc, scratchArc); + if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) { + // target is final + isFinal = true; + finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output; + System.out.println("dot hit final label=" + (char) scratchArc.label); + } + */ + final String stateColor; + if (fst.isExpandedTarget(arc)) { + stateColor = expandedNodeColor; + } else { + stateColor = null; + } + + final String finalOutput; + if (arc.nextFinalOutput != null && arc.nextFinalOutput != NO_OUTPUT) { + finalOutput = fst.outputs.outputToString(arc.nextFinalOutput); + } else { + finalOutput = ""; + } + + emitDotState(out, Integer.toString(arc.target), arc.isFinal() ? finalStateShape : stateShape, stateColor, finalOutput); seen.set(arc.target); nextLevelQueue.add(new FST.Arc().copyFrom(arc)); sameLevelStates.add(arc.target); @@ -263,15 +313,20 @@ outs = ""; } - final String cl; - if (arc.label == FST.END_LABEL) { - cl = "~"; - } else { - cl = printableLabel(arc.label); + if (!fst.targetHasArcs(arc) && arc.isFinal() && arc.nextFinalOutput != NO_OUTPUT) { + // nocommit -- this is broken again? + // Tricky special case: sometimes, due to + // pruning, the builder can [sillily] produce + // an FST with an arc into the final end state + // (-1) but also with a next final output; in + // this case we pull that output up onto this + // arc + outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]"; } - out.write(" " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]\n"); - + assert arc.label != FST.END_LABEL; + out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"]\n"); + // Break the loop if we're on the last arc of this state. if (arc.isLast()) { break; @@ -293,7 +348,7 @@ } // Emit terminating state (always there anyway). 
- out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n"); + out.write(" -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n"); out.write(" {rank=sink; -1 }\n"); out.write("}\n"); Index: lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java --- lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java Thu Apr 14 18:48:02 2011 -0400 @@ -29,6 +29,8 @@ import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.codecs.BlockTreeTermsReader; +import org.apache.lucene.index.codecs.BlockTreeTermsWriter; import org.apache.lucene.index.codecs.BlockTermsReader; import org.apache.lucene.index.codecs.BlockTermsWriter; import org.apache.lucene.index.codecs.Codec; @@ -38,6 +40,8 @@ import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter; import org.apache.lucene.index.codecs.PostingsReaderBase; import org.apache.lucene.index.codecs.PostingsWriterBase; +import org.apache.lucene.index.codecs.BlockTreePostingsReaderBase; +import org.apache.lucene.index.codecs.BlockTreePostingsWriterBase; import org.apache.lucene.index.codecs.TermStats; import org.apache.lucene.index.codecs.TermsIndexReaderBase; import org.apache.lucene.index.codecs.TermsIndexWriterBase; @@ -48,6 +52,8 @@ import org.apache.lucene.index.codecs.mocksep.MockSingleIntFactory; import org.apache.lucene.index.codecs.pulsing.PulsingPostingsReaderImpl; import org.apache.lucene.index.codecs.pulsing.PulsingPostingsWriterImpl; +import org.apache.lucene.index.codecs.pulsingtree.PulsingTreePostingsReader; +import org.apache.lucene.index.codecs.pulsingtree.PulsingTreePostingsWriter; import org.apache.lucene.index.codecs.sep.IntIndexInput; import org.apache.lucene.index.codecs.sep.IntIndexOutput; import org.apache.lucene.index.codecs.sep.IntStreamFactory; @@ -55,6 +61,8 @@ import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; import org.apache.lucene.index.codecs.standard.StandardPostingsReader; import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; +import org.apache.lucene.index.codecs.standardtree.StandardTreePostingsReader; +import org.apache.lucene.index.codecs.standardtree.StandardTreePostingsWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -131,7 +139,7 @@ final long seed = seedRandom.nextLong(); if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: writing to seg=" + state.segmentName + " seed=" + seed); + System.out.println("MockRandomCodec: writing to seg=" + state.segmentName + " codecID=" + state.codecId + " seed=" + seed); } final String seedFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, SEED_EXT); @@ -140,87 +148,122 @@ out.close(); final Random random = new Random(seed); - PostingsWriterBase postingsWriter; if (random.nextBoolean()) { - postingsWriter = new SepPostingsWriterImpl(state, new MockIntStreamFactory(random), skipInterval); + // Use BlockTree terms dict + + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: writing BlockTree terms dict"); + } + BlockTreePostingsWriterBase postingsWriter = new StandardTreePostingsWriter(state); + + if (random.nextBoolean()) { + final int totTFCutoff = _TestUtil.nextInt(random, 1, 
20); + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: pulsing postings with totTFCutoff=" + totTFCutoff); + } + postingsWriter = new PulsingTreePostingsWriter(totTFCutoff, postingsWriter); + } + + final int minTermsInBlock = _TestUtil.nextInt(random, 4, 100); + + boolean success = false; + try { + FieldsConsumer ret = new BlockTreeTermsWriter(state, postingsWriter, minTermsInBlock); + success = true; + return ret; + } finally { + if (!success) { + postingsWriter.close(); + } + } } else { + if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: writing Standard postings"); + System.out.println("MockRandomCodec: writing Block terms dict"); } - postingsWriter = new StandardPostingsWriter(state, skipInterval); - } - if (random.nextBoolean()) { - final int totTFCutoff = _TestUtil.nextInt(random, 1, 20); - if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: pulsing postings with totTFCutoff=" + totTFCutoff); + PostingsWriterBase postingsWriter; + if (random.nextBoolean()) { + postingsWriter = new SepPostingsWriterImpl(state, new MockIntStreamFactory(random)); + } else { + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: writing Standard postings"); + } + postingsWriter = new StandardPostingsWriter(state); } - postingsWriter = new PulsingPostingsWriterImpl(totTFCutoff, postingsWriter); - } - final TermsIndexWriterBase indexWriter; - boolean success = false; + if (random.nextBoolean()) { + final int totTFCutoff = _TestUtil.nextInt(random, 1, 20); + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: writing pulsing postings with totTFCutoff=" + totTFCutoff); + } + postingsWriter = new PulsingPostingsWriterImpl(totTFCutoff, postingsWriter); + } - try { - if (random.nextBoolean()) { - state.termIndexInterval = _TestUtil.nextInt(random, 1, 100); - if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: fixed-gap terms index (tii=" + state.termIndexInterval + ")"); + boolean success = false; + + final TermsIndexWriterBase indexWriter; + try { + if (random.nextBoolean()) { + state.termIndexInterval = _TestUtil.nextInt(random, 1, 100); + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: fixed-gap terms index (tii=" + state.termIndexInterval + ")"); + } + indexWriter = new FixedGapTermsIndexWriter(state); + } else { + final VariableGapTermsIndexWriter.IndexTermSelector selector; + final int n2 = random.nextInt(3); + if (n2 == 0) { + final int tii = _TestUtil.nextInt(random, 1, 100); + selector = new VariableGapTermsIndexWriter.EveryNTermSelector(tii); + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: variable-gap terms index (tii=" + tii + ")"); + } + } else if (n2 == 1) { + final int docFreqThresh = _TestUtil.nextInt(random, 2, 100); + final int tii = _TestUtil.nextInt(random, 1, 100); + selector = new VariableGapTermsIndexWriter.EveryNOrDocFreqTermSelector(docFreqThresh, tii); + } else { + final long seed2 = random.nextLong(); + final int gap = _TestUtil.nextInt(random, 2, 40); + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: random-gap terms index (max gap=" + gap + ")"); + } + selector = new VariableGapTermsIndexWriter.IndexTermSelector() { + final Random rand = new Random(seed2); + + @Override + public boolean isIndexTerm(BytesRef term, TermStats stats) { + return rand.nextInt(gap) == 17; + } + + @Override + public void newField(FieldInfo fieldInfo) { + } + }; + } + indexWriter = new VariableGapTermsIndexWriter(state, selector); } - 
indexWriter = new FixedGapTermsIndexWriter(state); - } else { - final VariableGapTermsIndexWriter.IndexTermSelector selector; - final int n2 = random.nextInt(3); - if (n2 == 0) { - final int tii = _TestUtil.nextInt(random, 1, 100); - selector = new VariableGapTermsIndexWriter.EveryNTermSelector(tii); - if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: variable-gap terms index (tii=" + tii + ")"); + success = true; + } finally { + if (!success) { + postingsWriter.close(); + } + } + + success = false; + try { + FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter); + success = true; + return ret; + } finally { + if (!success) { + try { + postingsWriter.close(); + } finally { + indexWriter.close(); } - } else if (n2 == 1) { - final int docFreqThresh = _TestUtil.nextInt(random, 2, 100); - final int tii = _TestUtil.nextInt(random, 1, 100); - selector = new VariableGapTermsIndexWriter.EveryNOrDocFreqTermSelector(docFreqThresh, tii); - } else { - final long seed2 = random.nextLong(); - final int gap = _TestUtil.nextInt(random, 2, 40); - if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: random-gap terms index (max gap=" + gap + ")"); - } - selector = new VariableGapTermsIndexWriter.IndexTermSelector() { - final Random rand = new Random(seed2); - - @Override - public boolean isIndexTerm(BytesRef term, TermStats stats) { - return rand.nextInt(gap) == 17; - } - - @Override - public void newField(FieldInfo fieldInfo) { - } - }; - } - indexWriter = new VariableGapTermsIndexWriter(state, selector); - } - success = true; - } finally { - if (!success) { - postingsWriter.close(); - } - } - - success = false; - try { - FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter); - success = true; - return ret; - } finally { - if (!success) { - try { - postingsWriter.close(); - } finally { - indexWriter.close(); } } } @@ -233,96 +276,137 @@ final IndexInput in = state.dir.openInput(seedFileName); final long seed = in.readLong(); if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: reading from seg=" + state.segmentInfo.name + " seed=" + seed); + System.out.println("MockRandomCodec: reading from seg=" + state.segmentInfo.name + " codecID=" + state.codecId + " seed=" + seed); } in.close(); final Random random = new Random(seed); - PostingsReaderBase postingsReader; if (random.nextBoolean()) { - postingsReader = new SepPostingsReaderImpl(state.dir, state.segmentInfo, - state.readBufferSize, new MockIntStreamFactory(random), state.codecId); + // Use BlockTree terms dict + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: reading BlockTree terms dict"); + } + BlockTreePostingsReaderBase postingsReader = new StandardTreePostingsReader(state.dir, state.segmentInfo, state.readBufferSize, state.codecId); + + if (random.nextBoolean()) { + final int totTFCutoff = _TestUtil.nextInt(random, 1, 20); + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: reading pulsing postings with totTFCutoff=" + totTFCutoff); + } + postingsReader = new PulsingTreePostingsReader(postingsReader); + } + + // randomness diverges from writer, here: + if (state.termsIndexDivisor != -1) { + state.termsIndexDivisor = _TestUtil.nextInt(random, 1, 10); + } + + final int termsCacheSize = _TestUtil.nextInt(random, 1, 1024); + + boolean success = false; + try { + FieldsProducer ret = new BlockTreeTermsReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + postingsReader, + state.readBufferSize, + 
termsCacheSize, + state.codecId, + state.termsIndexDivisor); + success = true; + return ret; + } finally { + if (!success) { + postingsReader.close(); + } + } + } else { - if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: reading Standard postings"); + + PostingsReaderBase postingsReader; + + if (random.nextBoolean()) { + postingsReader = new SepPostingsReaderImpl(state.dir, state.segmentInfo, + state.readBufferSize, new MockIntStreamFactory(random), state.codecId); + } else { + postingsReader = new StandardPostingsReader(state.dir, state.segmentInfo, state.readBufferSize, state.codecId); } - postingsReader = new StandardPostingsReader(state.dir, state.segmentInfo, state.readBufferSize, state.codecId); - } - if (random.nextBoolean()) { - final int totTFCutoff = _TestUtil.nextInt(random, 1, 20); - if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: reading pulsing postings with totTFCutoff=" + totTFCutoff); + if (random.nextBoolean()) { + final int totTFCutoff = _TestUtil.nextInt(random, 1, 20); + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: reading pulsing postings with totTFCutoff=" + totTFCutoff); + } + postingsReader = new PulsingPostingsReaderImpl(postingsReader); } - postingsReader = new PulsingPostingsReaderImpl(postingsReader); - } - final TermsIndexReaderBase indexReader; - boolean success = false; + final TermsIndexReaderBase indexReader; + boolean success = false; - try { - if (random.nextBoolean()) { - // if termsIndexDivisor is set to -1, we should not touch it. It means a - // test explicitly instructed not to load the terms index. + try { + final boolean doFixedGap = random.nextBoolean(); + + // randomness diverges from writer, here: if (state.termsIndexDivisor != -1) { state.termsIndexDivisor = _TestUtil.nextInt(random, 1, 10); } - if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: fixed-gap terms index (divisor=" + state.termsIndexDivisor + ")"); + if (doFixedGap) { + // if termsIndexDivisor is set to -1, we should not touch it. It means a + // test explicitly instructed not to load the terms index. 
+ if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: fixed-gap terms index (divisor=" + state.termsIndexDivisor + ")"); + } + indexReader = new FixedGapTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + BytesRef.getUTF8SortedAsUnicodeComparator(), + state.codecId); + } else { + final int n2 = random.nextInt(3); + if (n2 == 1) { + random.nextInt(); + } else if (n2 == 2) { + random.nextLong(); + } + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: variable-gap terms index (divisor=" + state.termsIndexDivisor + ")"); + } + indexReader = new VariableGapTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + state.codecId); } - indexReader = new FixedGapTermsIndexReader(state.dir, - state.fieldInfos, - state.segmentInfo.name, - state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUnicodeComparator(), - state.codecId); - } else { - final int n2 = random.nextInt(3); - if (n2 == 1) { - random.nextInt(); - } else if (n2 == 2) { - random.nextLong(); + success = true; + } finally { + if (!success) { + postingsReader.close(); } - if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: variable-gap terms index (divisor=" + state.termsIndexDivisor + ")"); - } - if (state.termsIndexDivisor != -1) { - state.termsIndexDivisor = _TestUtil.nextInt(random, 1, 10); - } - indexReader = new VariableGapTermsIndexReader(state.dir, - state.fieldInfos, - state.segmentInfo.name, - state.termsIndexDivisor, - state.codecId); } - success = true; - } finally { - if (!success) { - postingsReader.close(); - } - } - final int termsCacheSize = _TestUtil.nextInt(random, 1, 1024); + final int termsCacheSize = _TestUtil.nextInt(random, 1, 1024); - success = false; - try { - FieldsProducer ret = new BlockTermsReader(indexReader, - state.dir, - state.fieldInfos, - state.segmentInfo.name, - postingsReader, - state.readBufferSize, - termsCacheSize, - state.codecId); - success = true; - return ret; - } finally { - if (!success) { - try { - postingsReader.close(); - } finally { - indexReader.close(); + success = false; + try { + FieldsProducer ret = new BlockTermsReader(indexReader, + state.dir, + state.fieldInfos, + state.segmentInfo.name, + postingsReader, + state.readBufferSize, + termsCacheSize, + state.codecId); + success = true; + return ret; + } finally { + if (!success) { + try { + postingsReader.close(); + } finally { + indexReader.close(); + } } } } @@ -335,6 +419,7 @@ SepPostingsReaderImpl.files(segmentInfo, codecId, files); StandardPostingsReader.files(dir, segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files); + BlockTreeTermsReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); VariableGapTermsIndexReader.files(dir, segmentInfo, codecId, files); @@ -353,6 +438,7 @@ public void getExtensions(Set extensions) { SepPostingsWriterImpl.getExtensions(extensions); BlockTermsReader.getExtensions(extensions); + BlockTreeTermsReader.getExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions); VariableGapTermsIndexReader.getIndexExtensions(extensions); extensions.add(SEED_EXT); Index: lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java --- lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java Thu Apr 14 18:48:02 2011 -0400 @@ -397,7 +397,7 
@@ System.err.println("NOTE: test params are: codec=" + codecDescription + ", locale=" + locale + ", timezone=" + (timeZone == null ? "(null)" : timeZone.getID())); - if (testsFailed) { + if (VERBOSE || testsFailed) { System.err.println("NOTE: all tests run in this JVM:"); System.err.println(Arrays.toString(testClassesRun.toArray())); System.err.println("NOTE: " + System.getProperty("os.name") + " " Index: lucene/src/test/org/apache/lucene/TestExternalCodecs.java --- lucene/src/test/org/apache/lucene/TestExternalCodecs.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/test/org/apache/lucene/TestExternalCodecs.java Thu Apr 14 18:48:02 2011 -0400 @@ -305,7 +305,7 @@ } @Override - public SeekStatus seek(BytesRef term, boolean useCache) { + public SeekStatus seek(BytesRef term, boolean useCache, boolean exactOnly /* ignored */) { current = term.utf8ToString(); it = null; if (ramField.termToDocs.containsKey(current)) { Index: lucene/src/test/org/apache/lucene/index/TestCodecs.java --- lucene/src/test/org/apache/lucene/index/TestCodecs.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/test/org/apache/lucene/index/TestCodecs.java Thu Apr 14 18:48:02 2011 -0400 @@ -65,7 +65,8 @@ private static String[] fieldNames = new String[] {"one", "two", "three", "four"}; private final static int NUM_TEST_ITER = 20 * RANDOM_MULTIPLIER; - private final static int NUM_TEST_THREADS = 3; + // nocommit + private final static int NUM_TEST_THREADS = 1; private final static int NUM_FIELDS = 4; private final static int NUM_TERMS_RAND = 50; // must be > 16 to test skipping private final static int DOC_FREQ_RAND = 500; // must be > 16 to test skipping @@ -492,6 +493,9 @@ } // Test seek to non-existent terms: + if (VERBOSE) { + System.out.println("TEST: seek non-exist terms"); + } for(int i=0;i<100;i++) { final String text2 = _TestUtil.randomUnicodeString(random) + "."; status = termsEnum.seek(new BytesRef(text2)); @@ -500,6 +504,9 @@ } // Seek to each term, backwards: + if (VERBOSE) { + System.out.println("TEST: seek terms backwards"); + } for(int i=field.terms.length-1;i>=0;i--) { assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(new BytesRef(field.terms[i].text2))); assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); Index: lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java --- lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java Thu Apr 14 18:48:02 2011 -0400 @@ -445,7 +445,7 @@ */ if (VERBOSE) { - System.out.println("TEST: verify prefix=" + prefixRef.utf8ToString()); + System.out.println("TEST: verify prefix=" + (prefixRef==null ? 
"null" : prefixRef.utf8ToString())); System.out.println("TEST: all TERMS:"); TermsEnum allTE = MultiFields.getTerms(r, "field").iterator(); int ord = 0; @@ -463,7 +463,7 @@ Terms terms = r.fields().terms("field"); if (terms != null) { TermsEnum termsEnum = terms.iterator(); - TermsEnum.SeekStatus result = termsEnum.seek(prefixRef, false); + TermsEnum.SeekStatus result = termsEnum.seek(prefixRef, false, false); if (result != TermsEnum.SeekStatus.END) { assertFalse("term=" + termsEnum.term().utf8ToString() + " matches prefix=" + prefixRef.utf8ToString(), termsEnum.term().startsWith(prefixRef)); } else { Index: lucene/src/test/org/apache/lucene/index/TestIndexWriter.java --- lucene/src/test/org/apache/lucene/index/TestIndexWriter.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/test/org/apache/lucene/index/TestIndexWriter.java Thu Apr 14 18:48:02 2011 -0400 @@ -3052,7 +3052,13 @@ final int left = ids.size() - upto; final int inc = Math.min(left, _TestUtil.nextInt(random, 1, 20)); final int limit = upto + inc; + if (VERBOSE) { + System.out.println("TEST: cycle delete " + inc + " ids; " + (ids.size()-limit) + " will remain"); + } while(upto < limit) { + if (VERBOSE) { + System.out.println("TEST: del " + ids.get(upto)); + } w.deleteDocuments(new Term("id", ""+ids.get(upto++))); } final IndexReader r = w.getReader(); Index: lucene/src/test/org/apache/lucene/index/TestMultiFields.java --- lucene/src/test/org/apache/lucene/index/TestMultiFields.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/test/org/apache/lucene/index/TestMultiFields.java Thu Apr 14 18:48:02 2011 -0400 @@ -29,10 +29,15 @@ int num = 2 * RANDOM_MULTIPLIER; for (int iter = 0; iter < num; iter++) { + if (VERBOSE) { + System.out.println("TEST: iter=" + iter); + } + Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(NoMergePolicy.COMPOUND_FILES)); _TestUtil.keepFullyDeletedSegments(w); + w.setInfoStream(VERBOSE ? 
System.out : null); Map> docs = new HashMap>(); Set deleted = new HashSet(); @@ -80,7 +85,7 @@ if (VERBOSE) { List termsList = new ArrayList(uniqueTerms); Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator()); - System.out.println("UTF16 order:"); + System.out.println("TEST: terms in UTF16 order:"); for(BytesRef b : termsList) { System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString())); } @@ -88,7 +93,9 @@ IndexReader reader = w.getReader(); w.close(); - //System.out.println("TEST reader=" + reader); + if (VERBOSE) { + System.out.println("TEST: reader=" + reader); + } Bits delDocs = MultiFields.getDeletedDocs(reader); for(int delDoc : deleted) { @@ -99,7 +106,7 @@ for(int i=0;i<100;i++) { BytesRef term = terms.get(random.nextInt(terms.size())); if (VERBOSE) { - System.out.println("TEST: seek to term= "+ UnicodeUtil.toHexString(term.utf8ToString())); + System.out.println("TEST: seek to term="+ UnicodeUtil.toHexString(term.utf8ToString()) + " " + term); } DocsEnum docsEnum = terms2.docs(delDocs, term, null); Index: lucene/src/test/org/apache/lucene/index/TestPerSegmentDeletes.java --- lucene/src/test/org/apache/lucene/index/TestPerSegmentDeletes.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/test/org/apache/lucene/index/TestPerSegmentDeletes.java Thu Apr 14 18:48:02 2011 -0400 @@ -226,7 +226,7 @@ Fields fields = MultiFields.getFields(reader); Terms cterms = fields.terms(term.field); TermsEnum ctermsEnum = cterms.iterator(); - SeekStatus ss = ctermsEnum.seek(new BytesRef(term.text()), false); + SeekStatus ss = ctermsEnum.seek(new BytesRef(term.text()), false, true); if (ss.equals(SeekStatus.FOUND)) { DocsEnum docsEnum = ctermsEnum.docs(bits, null); return toArray(docsEnum); Index: lucene/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java --- lucene/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java Thu Apr 14 18:48:02 2011 -0400 @@ -94,12 +94,18 @@ public static Query csrq(String f, String l, String h, boolean il, boolean ih) { TermRangeQuery query = TermRangeQuery.newStringRange(f, l, h, il, ih); query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); + if (VERBOSE) { + System.out.println("TEST: query=" + query); + } return query; } public static Query csrq(String f, String l, String h, boolean il, boolean ih, MultiTermQuery.RewriteMethod method) { TermRangeQuery query = TermRangeQuery.newStringRange(f, l, h, il, ih); query.setRewriteMethod(method); + if (VERBOSE) { + System.out.println("TEST: query=" + query + " method=" + method); + } return query; } @@ -275,6 +281,10 @@ IndexReader reader = signedIndexReader; IndexSearcher search = newSearcher(reader); + if (VERBOSE) { + System.out.println("TEST: reader=" + reader); + } + int medId = ((maxId - minId) / 2); String minIP = pad(minId); Index: lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java --- lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java Thu Apr 14 18:48:02 2011 -0400 @@ -80,7 +80,7 @@ System.out.println(" " + UnicodeUtil.toHexString(s)); } } - + reader = writer.getReader(); searcher = newSearcher(reader); writer.close(); @@ -139,6 +139,9 @@ int num = CodecProvider.getDefault().getFieldCodec("field").equals("PreFlex") ? 
100 * RANDOM_MULTIPLIER : 1000 * RANDOM_MULTIPLIER; for (int i = 0; i < num; i++) { String reg = AutomatonTestUtil.randomRegexp(random); + if (VERBOSE) { + System.out.println("TEST: regexp=" + reg); + } assertSame(reg); } } Index: lucene/src/test/org/apache/lucene/search/TestWildcardRandom.java --- lucene/src/test/org/apache/lucene/search/TestWildcardRandom.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/test/org/apache/lucene/search/TestWildcardRandom.java Thu Apr 14 18:48:02 2011 -0400 @@ -59,10 +59,13 @@ field.setValue(df.format(i)); writer.addDocument(doc); } - + reader = writer.getReader(); searcher = newSearcher(reader); writer.close(); + if (VERBOSE) { + System.out.println("TEST: setUp searcher=" + searcher); + } } private char N() { @@ -85,7 +88,11 @@ private void assertPatternHits(String pattern, int numHits) throws Exception { // TODO: run with different rewrites - Query wq = new WildcardQuery(new Term("field", fillPattern(pattern))); + final String filledPattern = fillPattern(pattern); + if (VERBOSE) { + System.out.println("TEST: run wildcard pattern=" + pattern + " filled=" + filledPattern); + } + Query wq = new WildcardQuery(new Term("field", filledPattern)); TopDocs docs = searcher.search(wq, 25); assertEquals("Incorrect hits for pattern: " + pattern, numHits, docs.totalHits); } Index: lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java --- lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java Thu Apr 14 18:48:02 2011 -0400 @@ -149,11 +149,14 @@ } for (int i=1;i<10;i++) { query = newPhraseQuery("field", English.intToEnglish(i)+" hundred", true, new AveragePayloadFunction()); + if (VERBOSE) { + System.out.println("TEST: run query=" + query); + } // all should have score = 3 because adjacent terms have payloads of 2,4 // and all the similarity factors are set to 1 hits = searcher.search(query, null, 100); assertTrue("hits is null and it shouldn't be", hits != null); - assertTrue("should be 100 hits", hits.totalHits == 100); + assertEquals("should be 100 hits", 100, hits.totalHits); for (int j = 0; j < hits.scoreDocs.length; j++) { ScoreDoc doc = hits.scoreDocs[j]; // System.out.println("Doc: " + doc.toString()); Index: lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java --- lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java Thu Apr 14 15:39:03 2011 -0400 +++ lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java Thu Apr 14 18:48:02 2011 -0400 @@ -24,18 +24,25 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; +import java.io.StringWriter; import java.io.Writer; import java.util.*; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.SerialMergeScheduler; +import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import 
org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.IndexInput; @@ -810,15 +817,15 @@ } } - //System.out.println("TEST: after prune"); - /* - for(Map.Entry ent : prefixes.entrySet()) { - System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal); - if (ent.getValue().isFinal) { - System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput)); + if (VERBOSE) { + System.out.println("TEST: after prune"); + for(Map.Entry> ent : prefixes.entrySet()) { + System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal); + if (ent.getValue().isFinal) { + System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput)); + } } - } - */ + } if (prefixes.size() <= 1) { assertNull(fst); @@ -1011,7 +1018,7 @@ final BytesRef randomTerm = new BytesRef(getRandomString()); if (VERBOSE) { - System.out.println("TEST: seek " + randomTerm.utf8ToString() + " " + randomTerm); + System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm); } final TermsEnum.SeekStatus seekResult = termsEnum.seek(randomTerm); @@ -1063,10 +1070,10 @@ assertEquals(termsEnum.term().utf8ToString() + " != " + fstEnum.current().input.utf8ToString(), termsEnum.term(), fstEnum.current().input); if (storeOrd) { // fst stored the ord - assertEquals(termsEnum.ord(), ((Long) fstEnum.current().output).longValue()); + assertEquals("term=" + termsEnum.term().utf8ToString() + " " + termsEnum.term(), termsEnum.ord(), ((Long) fstEnum.current().output).longValue()); } else { // fst stored the docFreq - assertEquals(termsEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue())); + assertEquals("term=" + termsEnum.term().utf8ToString() + " " + termsEnum.term(), termsEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue())); } } } @@ -1184,7 +1191,7 @@ } } - // java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out + // java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out public static void main(String[] args) throws IOException { int prune = 0; int limit = Integer.MAX_VALUE; @@ -1341,6 +1348,191 @@ assertEquals(42, (long) seekResult.output); } + public void testPrimaryKeys() throws Exception { + Directory dir = newDirectory(); + + for(int cycle=0;cycle<2;cycle++) { + if (VERBOSE) { + System.out.println("TEST: cycle=" + cycle); + } + RandomIndexWriter w = new RandomIndexWriter(random, dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(IndexWriterConfig.OpenMode.CREATE)); + Document doc = new Document(); + Field idField = newField("id", "", Field.Index.NOT_ANALYZED); + doc.add(idField); + + final long seed = random.nextLong(); + + final int NUM_IDS = (int) (1000*RANDOM_MULTIPLIER*(1.0+random.nextDouble())); + //final int NUM_IDS = (int) (377 * (1.0+random.nextDouble())); + if (VERBOSE) { + System.out.println("TEST: NUM_IDS=" + NUM_IDS); + } + final Set allIDs = new HashSet(); + for(int id=0;id allIDsList = new ArrayList(allIDs); + final List sortedAllIDsList = new ArrayList(allIDsList); + Collections.sort(sortedAllIDsList); + + // Sprinkle in some non-existent PKs: + Set outOfBounds = new HashSet(); + for(int idx=0;idx builder = new 
Builder(FST.INPUT_TYPE.BYTE4, 2, 0, true, outputs); + builder.add("stat", outputs.get(17)); + builder.add("station", outputs.get(10)); + final FST fst = builder.finish(); + //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); + StringWriter w = new StringWriter(); + Util.toDot(fst, w, false, false); + w.close(); + assertTrue(w.toString().indexOf("label=\"t/[7]\"") != -1); + } + + public void testInternalFinalState() throws Exception { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + builder.add(new BytesRef("stat"), outputs.getNoOutput()); + builder.add(new BytesRef("station"), outputs.getNoOutput()); + final FST fst = builder.finish(); + StringWriter w = new StringWriter(); + //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); + Util.toDot(fst, w, false, false); + w.close(); + assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1); + } + // Make sure raw FST can differentiate between final vs // non-final end nodes public void testNonFinalStopNodes() throws Exception { Index: solr/src/java/org/apache/solr/handler/component/TermsComponent.java --- solr/src/java/org/apache/solr/handler/component/TermsComponent.java Thu Apr 14 15:39:03 2011 -0400 +++ solr/src/java/org/apache/solr/handler/component/TermsComponent.java Thu Apr 14 18:48:02 2011 -0400 @@ -162,7 +162,7 @@ BytesRef term = null; if (lowerBytes != null) { - if (termsEnum.seek(lowerBytes, true) == TermsEnum.SeekStatus.END) { + if (termsEnum.seek(lowerBytes, true, false) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); Index: solr/src/java/org/apache/solr/request/SimpleFacets.java --- solr/src/java/org/apache/solr/request/SimpleFacets.java Thu Apr 14 15:39:03 2011 -0400 +++ solr/src/java/org/apache/solr/request/SimpleFacets.java Thu Apr 14 18:48:02 2011 -0400 @@ -644,7 +644,7 @@ // facet.offset when sorting by index order. if (startTermBytes != null) { - if (termsEnum.seek(startTermBytes, true) == TermsEnum.SeekStatus.END) { + if (termsEnum.seek(startTermBytes, true, false) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); Index: solr/src/java/org/apache/solr/request/UnInvertedField.java --- solr/src/java/org/apache/solr/request/UnInvertedField.java Thu Apr 14 15:39:03 2011 -0400 +++ solr/src/java/org/apache/solr/request/UnInvertedField.java Thu Apr 14 18:48:02 2011 -0400 @@ -223,13 +223,13 @@ TermsEnum te = getOrdTermsEnum(searcher.getIndexReader()); if (prefix != null && prefix.length() > 0) { BytesRef prefixBr = new BytesRef(prefix); - if (te.seek(prefixBr, true) == TermsEnum.SeekStatus.END) { + if (te.seek(prefixBr, true, false) == TermsEnum.SeekStatus.END) { startTerm = numTermsInField; } else { startTerm = (int) te.ord(); } prefixBr.append(ByteUtils.bigTerm); - if (te.seek(prefixBr, true) == TermsEnum.SeekStatus.END) { + if (te.seek(prefixBr, true, false) == TermsEnum.SeekStatus.END) { endTerm = numTermsInField; } else { endTerm = (int) te.ord(); Index: solr/src/java/org/apache/solr/search/function/FileFloatSource.java --- solr/src/java/org/apache/solr/search/function/FileFloatSource.java Thu Apr 14 15:39:03 2011 -0400 +++ solr/src/java/org/apache/solr/search/function/FileFloatSource.java Thu Apr 14 18:48:02 2011 -0400 @@ -261,7 +261,7 @@ continue; // go to next line in file.. leave values as default. 
} - if (termsEnum.seek(internalKey, false) != TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seek(internalKey, false, true) != TermsEnum.SeekStatus.FOUND) { if (notFoundCount<10) { // collect first 10 not found for logging notFound.add(key); } Index: solr/src/test/org/apache/solr/request/TestFaceting.java --- solr/src/test/org/apache/solr/request/TestFaceting.java Thu Apr 14 15:39:03 2011 -0400 +++ solr/src/test/org/apache/solr/request/TestFaceting.java Thu Apr 14 18:48:02 2011 -0400 @@ -103,7 +103,7 @@ // test seeking before term if (size>0) { - assertEquals(size>0, te.seek(new BytesRef("000"), true) != TermsEnum.SeekStatus.END); + assertEquals(size>0, te.seek(new BytesRef("000"), true, false) != TermsEnum.SeekStatus.END); assertEquals(0, te.ord()); assertEquals(t(0), te.term().utf8ToString()); }
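Taken together, the call-site changes in this patch suggest a simple rule for the new third argument, though its exact contract is still marked nocommit above: pass exactOnly=true when only FOUND vs. not matters (SpellChecker, BufferedDeletesStream, FileFloatSource, TestPerSegmentDeletes), and false when iteration continues from the seek point (FilteredTermsEnum, TermsComponent, SimpleFacets, UnInvertedField). A hedged sketch of both styles, with illustrative method names:

import java.io.IOException;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class SeekStyleSketch {
  // Membership-style lookup: the enum's position on a miss is irrelevant.
  static boolean termExists(TermsEnum te, BytesRef term) throws IOException {
    return te.seek(term, false, true) == TermsEnum.SeekStatus.FOUND;
  }

  // Range/prefix-style lookup: position the enum and keep iterating from there.
  static BytesRef firstTermAtOrAfter(TermsEnum te, BytesRef floor) throws IOException {
    if (te.seek(floor, true, false) == TermsEnum.SeekStatus.END) {
      return null;   // enum exhausted
    }
    return te.term();
  }
}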