Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java --- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java Thu Mar 03 12:25:14 2011 -0500 @@ -44,7 +44,7 @@ } @Override - public SeekStatus seek(BytesRef text, boolean useCache) { + public SeekStatus seek(BytesRef text, boolean useCache, boolean exactOnly /* ignored */) { final Term t = new Term(field, text); int loc = Arrays.binarySearch(terms, t, InstantiatedTerm.termComparator); if (loc < 0) { Index: lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java --- lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java Thu Mar 03 12:25:14 2011 -0500 @@ -855,7 +855,7 @@ } @Override - public SeekStatus seek(BytesRef text, boolean useCache) { + public SeekStatus seek(BytesRef text, boolean useCache, boolean exactOnly /* ignored */) { termUpto = Arrays.binarySearch(info.sortedTerms, text, termComparator); if (termUpto < 0) { // not found; choose successor termUpto = -termUpto -1; Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java Thu Mar 03 12:25:14 2011 -0500 @@ -541,7 +541,7 @@ // we have a non-empty index, check if the term exists currentTerm.copy(word); for (TermsEnum te : termsEnums) { - if (te.seek(currentTerm, false) == TermsEnum.SeekStatus.FOUND) { + if (te.seek(currentTerm, false, true) == TermsEnum.SeekStatus.FOUND) { continue terms; } } Index: 
lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java --- lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java Thu Mar 03 12:25:14 2011 -0500 @@ -371,9 +371,7 @@ } assert checkDeleteTerm(term); - // System.out.println(" term=" + term); - - if (termsEnum.seek(term.bytes(), false) == TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seek(term.bytes(), false, true) == TermsEnum.SeekStatus.FOUND) { DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs); if (docsEnum != null) { Index: lucene/src/java/org/apache/lucene/index/FilterIndexReader.java --- lucene/src/java/org/apache/lucene/index/FilterIndexReader.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/index/FilterIndexReader.java Thu Mar 03 12:25:14 2011 -0500 @@ -132,8 +132,8 @@ public FilterTermsEnum(TermsEnum in) { this.in = in; } @Override - public SeekStatus seek(BytesRef text, boolean useCache) throws IOException { - return in.seek(text, useCache); + public SeekStatus seek(BytesRef text, boolean useCache, boolean exactOnly) throws IOException { + return in.seek(text, useCache, exactOnly); } @Override Index: lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java --- lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java Thu Mar 03 12:25:14 2011 -0500 @@ -139,7 +139,7 @@ } @Override - public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + public SeekStatus seek(BytesRef term, boolean useCache, boolean exactOnly) throws IOException { queue.clear(); numTop = 0; @@ -147,8 +147,13 @@ if (lastSeek != null && termComp.compare(lastSeek, term) <= 0) { seekOpt = true; } - lastSeekScratch.copy(term); - lastSeek = lastSeekScratch; + + if (!exactOnly) { + lastSeekScratch.copy(term); + lastSeek = lastSeekScratch; + } else { + lastSeek = 
null; + } for(int i=0;i(termsCacheSize); - //this.segment = segment; + this.segment = segment; in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, BlockTermsWriter.TERMS_EXTENSION), readBufferSize); @@ -240,6 +241,11 @@ } } + // nocommit + public static int totBlockReadCount; + public static int totScanCount; + public static long totSeekNanos; + private class FieldReader extends Terms implements Closeable { final long numTerms; final FieldInfo fieldInfo; @@ -341,29 +347,35 @@ return BytesRef.getUTF8SortedAsUnicodeComparator(); } - // TODO: we may want an alternate mode here which is - // "if you are about to return NOT_FOUND I won't use - // the terms data from that"; eg FuzzyTermsEnum will - // (usually) just immediately call seek again if we - // return NOT_FOUND so it's a waste for us to fill in - // the term that was actually NOT_FOUND @Override - public SeekStatus seek(final BytesRef target, final boolean useCache) throws IOException { + public SeekStatus nextPossiblePrefix(BytesRef prefix) throws IOException { + if (indexEnum.nextPossiblePrefix(prefix)) { + return SeekStatus.NOT_FOUND; + } else { + return SeekStatus.FOUND; + } + // nocommit -- need END too + } + + @Override + public SeekStatus seek(final BytesRef target, final boolean useCache, final boolean exactOnly) throws IOException { + + // nocommit -- if exactOnly is true can we do even + // less work here...? eg if we are not gonna seek + // (just w/in block scan)...? 
if (indexEnum == null) { throw new IllegalStateException("terms index was not loaded"); } - - /* - System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this); + + //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " exactOnly=" + exactOnly + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this); if (didIndexNext) { if (nextIndexTerm == null) { - System.out.println(" nextIndexTerm=null"); + //System.out.println(" nextIndexTerm=null"); } else { - System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString()); + //System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString()); } } - */ // Check cache if (useCache) { @@ -421,7 +433,24 @@ // Ask terms index to find biggest indexed term (= // first term in a block) that's <= our text: - in.seek(indexEnum.seek(target)); + final long ta0 = System.nanoTime(); + final long fp = indexEnum.seek(target); + totSeekNanos += System.nanoTime() - ta0; + + if (exactOnly && !indexEnum.lastSeekMayExist()) { + // Sometimes FST (VarGap) terms index is able to + // tell us that a given term cannot possibly + // exist in the terms dict, in which case we can + // fast path the NOT_FOUND return if caller only + // wants to seek if the term exists: + indexIsCurrent = false; + //System.out.println(" fast path NOT_FOUND"); + return SeekStatus.NOT_FOUND; + } + + in.seek(fp); + + //System.out.println(" seek: indexTerm=" + indexEnum.term().utf8ToString() + " " + indexEnum.term()); 
boolean result = nextBlock(); // Block must exist since, at least, the indexed term @@ -437,7 +466,6 @@ } term.copy(indexEnum.term()); - //System.out.println(" seek: term=" + term.utf8ToString()); } else { //System.out.println(" skip seek"); if (state.termCount == state.blockTermCount && !nextBlock()) { @@ -459,7 +487,7 @@ // the BytesRef term per term. Only when we return // do we then copy the bytes into the term. - while(true) { + while (true) { // First, see if target term matches common prefix // in this block: @@ -481,6 +509,7 @@ state.termCount++; state.ord++; termSuffixesReader.skipBytes(termSuffixesReader.readVInt()); + totScanCount++; } final int suffix = termSuffixesReader.readVInt(); term.length = termBlockPrefix + suffix; @@ -519,6 +548,7 @@ // Test every term in this block while (true) { + totScanCount++; state.termCount++; state.ord++; @@ -528,6 +558,17 @@ final int termLen = termBlockPrefix + suffix; int bytePos = termSuffixesReader.getPosition(); + // nocommit + /* + { + BytesRef termx = new BytesRef(term); + termx.grow(termBlockPrefix + suffix); + System.arraycopy(termSuffixes, bytePos, termx.bytes, termBlockPrefix, suffix); + termx.length = termBlockPrefix + suffix; + //System.out.println(" scan term=" + termx.utf8ToString() + " suffix=" + suffix); + } + */ + boolean next = false; final int limit = target.offset + (termLen < target.length ? termLen : target.length); int targetPos = target.offset + termBlockPrefix; @@ -546,7 +587,7 @@ term.grow(term.length); } termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix); - //System.out.println(" NOT_FOUND"); + //System.out.println(" NOT_FOUND term=" + term.utf8ToString() + " suffix=" + suffix); return SeekStatus.NOT_FOUND; } } @@ -794,6 +835,8 @@ not pay the price of decoding metadata they won't use. 
*/ private boolean nextBlock() throws IOException { + + totBlockReadCount++; // TODO: we still lazy-decode the byte[] for each // term (the suffix), but, if we decoded @@ -808,6 +851,7 @@ return false; } termBlockPrefix = in.readVInt(); + //System.out.println(" termBlockPrefix=" + termBlockPrefix); // term suffixes: int len = in.readVInt(); @@ -833,6 +877,7 @@ postingsReader.readTermsBlock(in, fieldInfo, state); blocksSinceSeek++; + // nocommit -- cache .getDivisor() indexIsCurrent &= (blocksSinceSeek < indexReader.getDivisor()); //System.out.println(" indexIsCurrent=" + indexIsCurrent); @@ -842,6 +887,11 @@ private void decodeMetaData() throws IOException { //System.out.println("BTR.decodeMetadata mdUpto=" + metaDataUpto + " vs termCount=" + state.termCount + " state=" + state); if (!seekPending) { + // TODO: cutover to random-access API + // here.... really stupid that we have to decode N + // wasted term metadata just to get to the N+1th + // that we really need... + // lazily catch up on metadata decode: final int limit = state.termCount; // We must set/incr state.termCount because @@ -869,7 +919,7 @@ metaDataUpto++; state.termCount++; } - //} else { + } else { //System.out.println(" skip! 
seekPending"); } } Index: lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java --- lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java Thu Mar 03 12:25:14 2011 -0500 @@ -64,7 +64,8 @@ private final TermsIndexWriterBase termsIndexWriter; private final List fields = new ArrayList(); - //private final String segment; + // nocommit + private final String segment; public BlockTermsWriter( TermsIndexWriterBase termsIndexWriter, @@ -79,7 +80,7 @@ writeHeader(out); currentField = null; this.postingsWriter = postingsWriter; - //segment = state.segmentName; + segment = state.segmentName; //System.out.println("BTW.init seg=" + state.segmentName); Index: lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java --- lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java Thu Mar 03 12:25:14 2011 -0500 @@ -140,6 +140,11 @@ } @Override + public boolean lastSeekMayExist() { + return true; + } + + @Override public BytesRef term() { return term; } Index: lucene/src/java/org/apache/lucene/index/codecs/TermsIndexReaderBase.java --- lucene/src/java/org/apache/lucene/index/codecs/TermsIndexReaderBase.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/index/codecs/TermsIndexReaderBase.java Thu Mar 03 12:25:14 2011 -0500 @@ -58,14 +58,27 @@ // into the main terms dict (_X.tis) file: public static abstract class FieldIndexEnum { + // nocommit -- pass down exactOnly here too? we don't + // need to waste effort moving to next term if it's not + // exact? 
+ /** Seeks to "largest" indexed term that's <= * term; retruns file pointer index (into the main * terms index file) for that term */ public abstract long seek(BytesRef term) throws IOException; + /** Return false if the last term passed to seek cannot + * exist in the terms dict; else, true */ + public abstract boolean lastSeekMayExist() throws IOException; + /** Returns -1 at end */ public abstract long next() throws IOException; + // nocommit -- how to repr EOF? + public boolean nextPossiblePrefix(BytesRef prefix) throws IOException { + return false; + } + public abstract BytesRef term(); // Only impl'd if supportsOrd() returns true! Index: lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java --- lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java Thu Mar 03 12:25:14 2011 -0500 @@ -17,7 +17,10 @@ * limitations under the License. 
*/ +import java.io.FileOutputStream; import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; @@ -34,6 +37,7 @@ import org.apache.lucene.util.automaton.fst.BytesRefFSTEnum; import org.apache.lucene.util.automaton.fst.FST; import org.apache.lucene.util.automaton.fst.PositiveIntOutputs; +import org.apache.lucene.util.automaton.fst.Util; /** See {@link VariableGapTermsIndexWriter} * @@ -52,11 +56,15 @@ // start of the field info data protected long dirOffset; + // nocommit + String segment; + public VariableGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, String codecId) throws IOException { in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, VariableGapTermsIndexWriter.TERMS_INDEX_EXTENSION)); - + this.segment = segment; + boolean success = false; try { @@ -117,21 +125,37 @@ @Override public long seek(BytesRef target) throws IOException { - //System.out.println("VGR: seek field=" + fieldInfo.name + " target=" + target); + //System.out.println("VGR: seek target=" + target.utf8ToString() + " " + target); + // nocommit -- what will the bushy arcs do to this...? + // it's gonna report the right fp but wrong prefix + // term? does this somehow "work out ok"...? 
current = fstEnum.seekFloor(target); //System.out.println(" got input=" + current.input + " output=" + current.output); return current.output; } @Override + public boolean lastSeekMayExist() { + //System.out.println("VGR.lastSeekMayExist: return " + fstEnum.lastSeekMayExist()); + return fstEnum.lastSeekFloorMayExist(); + } + + @Override public long next() throws IOException { - //System.out.println("VGR: next field=" + fieldInfo.name); - current = fstEnum.next(); - if (current == null) { - //System.out.println(" eof"); - return -1; - } else { - return current.output; + //System.out.println("VGR: next current=" + current.input.utf8ToString()); + final Long curOutput = current.output; + while (true) { + // nocommit -- make an fstEnum.next() that skips + ahead to the first pair that produces a new + output? + current = fstEnum.next(); + if (current == null) { + //System.out.println(" eof"); + return -1; + } else if (!current.output.equals(curOutput)) { + return current.output; + } + //System.out.println(" vgr.next cycle term=" + current.input.utf8ToString()); } } @@ -144,6 +168,20 @@ public long seek(long ord) { throw new UnsupportedOperationException(); } + + @Override + public boolean nextPossiblePrefix(BytesRef prefix) throws IOException { + // nocommit -- how to convey hit == null? + final BytesRefFSTEnum.InputOutput hit = fstEnum.seekCeil(prefix); + if (hit != null && fstEnum.lastSeekCeilIsFull()) { + prefix.copy(hit.input); + // nocommit -- true may be overstating here? 
ie + // we may have just copied in the same prefix + return true; + } else { + return false; + } + } } @Override @@ -176,10 +214,20 @@ fst = new FST(clone, fstOutputs); clone.close(); - if (indexDivisor > 1) { + if (false) { + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + System.out.println("SAVED to " + dotFileName); + Util.toDot(fst, w, false, false); + w.close(); + } + + // nocommit + if (false && indexDivisor > 1) { + // nocommit -- must handle the thorns here!! // subsample final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, false, outputs); final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); BytesRefFSTEnum.InputOutput result; int count = indexDivisor; Index: lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java --- lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java Thu Mar 03 12:25:14 2011 -0500 @@ -156,8 +156,14 @@ // terms or as rare as every * 10 (eg 2560), // in the extremes. 
+ // nocommit + private String segment; + + // nocommit -- enable setting allowed bushiness + public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy) throws IOException { final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_INDEX_EXTENSION); + segment = state.segmentName; out = state.directory.createOutput(indexFileName); fieldInfos = state.fieldInfos; this.policy = policy; @@ -172,7 +178,7 @@ @Override public FieldWriter addField(FieldInfo field, long termsFilePointer) throws IOException { - ////System.out.println("VGW: field=" + field.name); + //System.out.println("VGW: field=" + field.name); policy.newField(field); FSTFieldWriter writer = new FSTFieldWriter(field, termsFilePointer); fields.add(writer); @@ -208,30 +214,34 @@ private final BytesRef lastTerm = new BytesRef(); private boolean first = true; + private Long lastOutput; public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException { this.fieldInfo = fieldInfo; fstOutputs = PositiveIntOutputs.getSingleton(true); fstBuilder = new Builder(FST.INPUT_TYPE.BYTE1, - 0, 0, true, + 0, 0, true, true, fstOutputs); + fstBuilder.segment = segment; indexStart = out.getFilePointer(); - ////System.out.println("VGW: field=" + fieldInfo.name); + //System.out.println("VGW: field=" + fieldInfo.name); // Always put empty string in - fstBuilder.add(new BytesRef(), fstOutputs.get(termsFilePointer)); + lastOutput = fstOutputs.get(termsFilePointer); + fstBuilder.add(new BytesRef(), lastOutput, true); } @Override public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException { - //System.out.println("VGW: index term=" + text.utf8ToString()); + //System.out.println("VGW.check: seg=" + segment + " index term=" + fieldInfo.name + ":" + text.utf8ToString() + " lastFP=" + lastOutput); // NOTE: we must force the first term per field to be // indexed, in case policy doesn't: if (policy.isIndexTerm(text, stats) || first) 
{ first = false; - //System.out.println(" YES"); return true; } else { + // nocommit -- hmm truncate the max length here...? + fstBuilder.add(text, lastOutput, false); lastTerm.copy(text); return false; } @@ -240,12 +250,18 @@ @Override public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException { final int lengthSave = text.length; + //System.out.println("VGW.add seg=" + segment + " fp=" + termsFilePointer); text.length = indexedTermPrefixLength(lastTerm, text); + lastOutput = fstOutputs.get(termsFilePointer); + final boolean trimmed = text.length < lengthSave; try { - fstBuilder.add(text, fstOutputs.get(termsFilePointer)); + fstBuilder.add(text, lastOutput, true); } finally { text.length = lengthSave; } + if (trimmed) { + fstBuilder.add(text, lastOutput, false); + } lastTerm.copy(text); } Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java --- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Thu Mar 03 12:25:14 2011 -0500 @@ -757,7 +757,7 @@ } @Override - public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + public SeekStatus seek(BytesRef term, boolean useCache, boolean exactOnly /* ignored */) throws IOException { if (DEBUG_SURROGATES) { System.out.println("TE.seek target=" + UnicodeUtil.toHexString(term.utf8ToString())); } Index: lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java --- lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java Thu Mar 03 12:25:14 2011 -0500 @@ -130,7 +130,7 @@ } @Override - public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException { + public SeekStatus seek(BytesRef text, boolean useCache /* ignored */, 
boolean exactOnly /* ignored */) throws IOException { //System.out.println("seek to text=" + text.utf8ToString()); final BytesRefFSTEnum.InputOutput>> result = fstEnum.seekCeil(text); @@ -466,6 +466,7 @@ 0, 0, true, + false, new PairOutputs>(posIntOutputs, new PairOutputs(posIntOutputs, posIntOutputs))); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); Index: lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java --- lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java Thu Mar 03 12:25:14 2011 -0500 @@ -135,7 +135,7 @@ * @throws UnsupportedOperationException */ @Override - public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + public SeekStatus seek(BytesRef term, boolean useCache, boolean exactOnly) throws IOException { throw new UnsupportedOperationException(getClass().getName()+" does not support seeking"); } @@ -189,7 +189,7 @@ final BytesRef t = nextSeekTerm(actualTerm); // Make sure we always seek forward: assert actualTerm == null || t == null || getComparator().compare(t, actualTerm) > 0: "curTerm=" + actualTerm + " seekTerm=" + t; - if (t == null || tenum.seek(t, false) == SeekStatus.END) { + if (t == null || tenum.seek(t, false, false) == SeekStatus.END) { // no more terms to seek to or enum exhausted return null; } Index: lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java --- lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java Thu Mar 03 12:25:14 2011 -0500 @@ -216,7 +216,7 @@ } private BytesRef queuedBottom = null; - + @Override public BytesRef next() throws IOException { if (queuedBottom != null) { @@ -283,8 +283,9 @@ } @Override - public SeekStatus seek(BytesRef text, boolean useCache) throws IOException { - return actualEnum.seek(text, useCache); + public SeekStatus 
seek(BytesRef text, boolean useCache, boolean exactOnly) throws IOException { + // nocommit -- ok to pass-through...? + return actualEnum.seek(text, useCache, exactOnly); } @Override Index: lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java --- lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java Thu Mar 03 12:25:14 2011 -0500 @@ -241,7 +241,7 @@ } @Override - public SeekStatus seek(BytesRef text, boolean useCache) throws IOException { + public SeekStatus seek(BytesRef text, boolean useCache, boolean exactOnly /*ignored*/) throws IOException { int low = 1; int high = numOrd-1; Index: lucene/src/java/org/apache/lucene/util/PerReaderTermState.java --- lucene/src/java/org/apache/lucene/util/PerReaderTermState.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/util/PerReaderTermState.java Thu Mar 03 12:25:14 2011 -0500 @@ -84,14 +84,17 @@ final BytesRef bytes = term.bytes(); final PerReaderTermState perReaderTermState = new PerReaderTermState(context); final AtomicReaderContext[] leaves = ReaderUtil.leaves(context); + //System.out.println("prts.build term=" + term); for (int i = 0; i < leaves.length; i++) { + //System.out.println(" r=" + leaves[i].reader); final Fields fields = leaves[i].reader.fields(); if (fields != null) { final Terms terms = fields.terms(field); if (terms != null) { final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share! 
- if (SeekStatus.FOUND == termsEnum.seek(bytes, cache)) { + if (SeekStatus.FOUND == termsEnum.seek(bytes, cache, true)) { final TermState termState = termsEnum.termState(); + //System.out.println(" found"); perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq()); } } Index: lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java --- lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java Thu Mar 03 12:25:14 2011 -0500 @@ -61,6 +61,10 @@ // terms go through it: private final int minSuffixCount2; + // used for terms index: marked terms must never be + // pruned, but unmarked terms may be prefix-pruned + private final boolean useDoNotPrune; + private final IntsRef lastInput = new IntsRef(); // NOTE: cutting this over to ArrayList instead loses ~6% @@ -69,9 +73,13 @@ // current "frontier" private UnCompiledNode[] frontier; - public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doMinSuffix, Outputs outputs) { + // nocommit fix default + private double maxRatioPrefixArcs = 0.5; + + public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doMinSuffix, boolean useDoNotPrune, Outputs outputs) { this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount2 = minSuffixCount2; + this.useDoNotPrune = useDoNotPrune; fst = new FST(inputType, outputs); if (doMinSuffix) { dedupHash = new NodeHash(fst); @@ -99,8 +107,19 @@ return dedupHash == null ? 0 : fst.nodeCount; } + /** If the number of prefix-only arcs divided by the + * total number of arcs is <= this value, then all + * prefix-only arcs are kept for the node. 
*/ + public void setMaxRatioPrefixArcs(double v) { + maxRatioPrefixArcs = v; + } + + public double getMaxRatioPrefixArcs() { + return maxRatioPrefixArcs; + } + private CompiledNode compileNode(UnCompiledNode n) throws IOException { - + //System.out.println(" compileNode numArcs=" + n.numArcs); final int address; if (dedupHash != null) { if (n.numArcs == 0) { @@ -112,6 +131,7 @@ address = fst.addNode(n); } assert address != -2; + //System.out.println(" address=" + address); n.clear(); @@ -120,89 +140,207 @@ return fn; } + private int fullNodeCount; + private int extraArcCount; + private void compilePrevTail(int prefixLenPlus1) throws IOException { - assert prefixLenPlus1 >= 1; - //System.out.println(" compileTail " + prefixLenPlus1); + //System.out.println("seg=" + segment + " compileTail pref+1=" + prefixLenPlus1); for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) { - boolean doPrune = false; - boolean doCompile = false; final UnCompiledNode node = frontier[idx]; - final UnCompiledNode parent = frontier[idx-1]; + final UnCompiledNode parent = idx == 0 ? null : frontier[idx-1]; - if (node.inputCount < minSuffixCount1) { - doPrune = true; - doCompile = true; - } else if (idx > prefixLenPlus1) { - // prune if parent's inputCount is less than suffixMinCount2 - if (parent.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && parent.inputCount == 1) { - // my parent, about to be compiled, doesn't make the cut, so - // I'm definitely pruned - - // if pruneCount2 is 1, we keep only up - // until the 'distinguished edge', ie we keep only the - // 'divergent' part of the FST. if my parent, about to be - // compiled, has inputCount 1 then we are already past the - // distinguished edge. NOTE: this only works if - // the FST outputs are not "compressible" (simple - // ords ARE compressible). 
- doPrune = true; - } else { - // my parent, about to be compiled, does make the cut, so - // I'm definitely not pruned - doPrune = false; + { + final BytesRef prefix = new BytesRef(idx); + for(int m=0;m target = (UnCompiledNode) node.arcs[arcIdx].target; - target.clear(); + if (parent == null) { + return; } - node.numArcs = 0; - } - if (doPrune) { - // this node doesn't make it -- deref it - node.clear(); - parent.deleteLast(lastInput.ints[lastInput.offset+idx-1], node); - } else { + boolean doPrune = false; + boolean doCompile = false; - if (minSuffixCount2 != 0) { + if (node.inputCount < minSuffixCount1) { + doPrune = true; + doCompile = true; + } else if (idx > prefixLenPlus1) { + // prune if parent's inputCount is less than suffixMinCount2 + if (parent.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && parent.inputCount == 1) { + // my parent, about to be compiled, doesn't make the cut, so + // I'm definitely pruned + + // if pruneCount2 is 1, we keep only up + // until the 'distinguished edge', ie we keep only the + // 'divergent' part of the FST. if my parent, about to be + // compiled, has inputCount 1 then we are already past the + // distinguished edge. NOTE: this only works if + // the FST outputs are not "compressible" (simple + // ords ARE compressible). 
+ doPrune = true; + } else { + // my parent, about to be compiled, does make the cut, so + // I'm definitely not pruned + doPrune = false; + } + doCompile = true; + } else { + // if pruning is disabled (count is 0) we can always + // compile current node + doCompile = minSuffixCount2 == 0; + } + + //System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx=" + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune=" + doPrune); + + if (node.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && node.inputCount == 1) { + // drop all arcs + for(int arcIdx=0;arcIdx target = (UnCompiledNode) node.arcs[arcIdx].target; + target.clear(); + node.arcs[arcIdx].target = null; + } + node.numArcs = 0; + } + + if (doPrune) { + // this node doesn't make it -- deref it + node.clear(); + parent.deleteLast(lastInput.ints[lastInput.offset+idx-1], node); + } else { + + if (minSuffixCount2 != 0) { + compileAllTargets(node); + } + final T nextFinalOutput = node.output; + + // We "fake" the node as being final if it has no + // outgoing arcs; in theory we could leave it + // as non-final (the FST can represent this), but + // FSTEnum, Util, etc., have trouble w/ non-final + // dead-end states: + final boolean isFinal = node.isFinal || node.numArcs == 0; + //System.out.println(" nfo=" + nextFinalOutput); + + if (doCompile) { + // this node makes it and we now compile it. first, + // compile any targets that were previously + // undecided: + parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], + compileNode(node), + nextFinalOutput, + isFinal, + node.isFull); + } else { + // replaceLast just to install + // nextFinalOutput/isFinal onto the arc + parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], + node, + nextFinalOutput, + isFinal, + node.isFull); + // this node will stay in play for now, since we are + // undecided on whether to prune it. 
later, it + // will be either compiled or pruned, so we must + // allocate a new node: + frontier[idx] = new UnCompiledNode(this, idx); + } + } + } else { + + // useDoNotPrune is true + + if (parent != null) { + // DoNotPrune always spreads backwards + parent.doNotPrune |= node.doNotPrune; + } + + if (!node.doNotPrune) { + // drop all arcs + for(int arcIdx=0;arcIdx target = (UnCompiledNode) node.arcs[arcIdx].target; + target.clear(); + node.arcs[arcIdx].target = null; + } + node.numArcs = 0; + //System.out.println("seg=" + segment + " full prune!"); + } else { + int prefixOnlyCount = 0; + for(int arcIdx=0;arcIdx target = (UnCompiledNode) node.arcs[arcIdx].target; + prefixOnlyCount += target.doNotPrune ? 0 : 1; + //System.out.println("seg=" + segment + " " + (target.doNotPrune ? "* " : " ") + (char) (node.arcs[arcIdx].label) + " ct=" + target.inputCount); + //target.clear(); + //node.arcs[arcIdx].target = null; + } + + // nocommit -- need policy here: + // nocommit -- shouldn't we insist numArcs > 0? but: ant test-core -Dtestcase=TestFSTs -Dtestmethod=testBasicFSA -Dtests.seed=-5244488352711851354:0 -Dtests.verbose=true >& /dev/shm/out.x + node.isFull = (node.numArcs > 0 && prefixOnlyCount == 0) || ((double) prefixOnlyCount)/node.numArcs <= maxRatioPrefixArcs; + //node.isFull = prefixOnlyCount == 0 || ((double) prefixOnlyCount)/node.numArcs <= maxRatioPrefixArcs; + + // nocommit + //node.isFull = true; + if (!node.isFull) { + // Too many prunable nodes -- prune them: + int upto = 0; + for(int arcIdx=0;arcIdx target = (UnCompiledNode) node.arcs[arcIdx].target; + if (target.doNotPrune) { + final Arc old = node.arcs[upto]; + node.arcs[upto++] = node.arcs[arcIdx]; + node.arcs[arcIdx] = old; + } else { + node.arcs[arcIdx].target = null; + } + } + //System.out.println("seg=" + segment + " prune! 
[numArcs=" + upto + "]"); + node.numArcs = upto; + } else { + //System.out.println("seg=" + segment + " full prefix!"); + fullNodeCount++; + extraArcCount += prefixOnlyCount; + } + //System.out.println("seg=" + segment + " node.output=" + node.output); + //System.out.println("seg=" + segment + " node.isFinal=" + node.isFinal); compileAllTargets(node); } - final T nextFinalOutput = node.output; - final boolean isFinal = node.isFinal; - if (doCompile) { - // this node makes it and we now compile it. first, - // compile any targets that were previously - // undecided: - parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], - compileNode(node), - nextFinalOutput, - isFinal); + // NOTE: here we can compile the node, now, if it's + // marked for not-pruned; ie we need not wait until + // its parent is finished... + + if (parent != null) { + //System.out.println(" push to parent isFull=" + node.isFull + " isFinal=" + node.isFinal); + parent.replaceLast(lastInput.ints[lastInput.offset + idx - 1], + node, + node.output, + node.isFinal, + node.isFull); + /* + final Arc lastArc = parent.arcs[parent.numArcs-1]; + assert lastArc.label == lastInput.ints[lastInput.offset + idx - 1]; + assert lastArc.target == node; + + // Push node stats back onto its incoming arc: + lastArc.isFinal = node.isFinal; + lastArc.nextFinalOutput = node.output; + //lastArc.targetIsFullPrefix = node.isFull && !node.isFinal; + lastArc.targetIsFullPrefix = node.isFull; + //System.out.println(" set targetIsFullPrefix=" + node.isFull); + */ } else { - // replaceLast just to install - // nextFinalOutput/isFinal onto the arc - parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], - node, - nextFinalOutput, - isFinal); - // this node will stay in play for now, since we are - // undecided on whether to prune it. 
later, it - // will be either compiled or pruned, so we must - // allocate a new node: - frontier[idx] = new UnCompiledNode(this, idx); + fst.rootIsFullPrefix = node.isFull; + //System.out.println("ROOT isFullPrefix=" + node.isFull); } + + frontier[idx] = new UnCompiledNode(this, idx); } } } @@ -210,13 +348,17 @@ private final IntsRef scratchIntsRef = new IntsRef(10); public void add(BytesRef input, T output) throws IOException { + add(input, output, false); + } + + public void add(BytesRef input, T output, boolean doNotPrune) throws IOException { assert fst.getInputType() == FST.INPUT_TYPE.BYTE1; scratchIntsRef.grow(input.length); for(int i=0;i= pos1Stop || lastInput.ints[pos1] != input.ints[pos2]) { break; @@ -305,11 +462,13 @@ for(int idx=prefixLenPlus1;idx<=input.length;idx++) { frontier[idx-1].addArc(input.ints[input.offset + idx - 1], frontier[idx]); - //System.out.println(" incr tail " + idx); frontier[idx].inputCount++; + frontier[idx].doNotPrune |= doNotPrune; } final UnCompiledNode lastNode = frontier[input.length]; + // nocommit -- which is it!! + //lastNode.isFinal = !useDoNotPrune || doNotPrune; lastNode.isFinal = true; lastNode.output = NO_OUTPUT; @@ -327,6 +486,7 @@ if (lastOutput != NO_OUTPUT) { commonOutputPrefix = fst.outputs.common(output, lastOutput); + //System.out.println(" idx=" + idx + " commonOutput=" + commonOutputPrefix); assert validOutput(commonOutputPrefix); wordSuffix = fst.outputs.subtract(lastOutput, commonOutputPrefix); assert validOutput(wordSuffix); @@ -345,8 +505,6 @@ // save last input lastInput.copy(input); - - //System.out.println(" count[0]=" + frontier[0].inputCount); } private boolean validOutput(T output) { @@ -357,37 +515,45 @@ * nothing is accepted by the FST. 
*/ public FST finish() throws IOException { + final UnCompiledNode root = frontier[0]; + + //System.out.println("seg=" + segment + " fst.Builder.finish: numTerms=" + frontier[0].inputCount + " fullNodeCount=" + fullNodeCount + " vs " + fst.getNodeCount() + " extraArcCount=" + extraArcCount + " vs " + fst.getArcCount()); + // minimize nodes in the last word's suffix - compilePrevTail(1); - //System.out.println("finish: inputCount=" + frontier[0].inputCount); - if (frontier[0].inputCount < minSuffixCount1 || frontier[0].inputCount < minSuffixCount2 || frontier[0].numArcs == 0) { + compilePrevTail(0); + + if (root.inputCount < minSuffixCount1 || root.inputCount < minSuffixCount2 || root.numArcs == 0) { if (fst.emptyOutput == null) { return null; } else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) { // empty string got pruned return null; - } else { - fst.finish(compileNode(frontier[0]).address); - //System.out.println("compile addr = " + fst.getStartNode()); - return fst; } } else { if (minSuffixCount2 != 0) { - compileAllTargets(frontier[0]); + compileAllTargets(root); } - //System.out.println("NOW: " + frontier[0].numArcs); - fst.finish(compileNode(frontier[0]).address); } - + + //System.out.println(" finished node " + root.numArcs + " emptyOutput=" + fst.emptyOutput); + + fst.finish(compileNode(root).address); + return fst; } private void compileAllTargets(UnCompiledNode node) throws IOException { + //System.out.println("compileAllTargets: " + node.numArcs + " arcs node count=" + node.inputCount); for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; + //System.out.println(" " + arcIdx + " of " + node.numArcs + ": arc=" + (char) arc.label + " isFull=" + arc.targetIsFullPrefix + " isFinal=" + arc.isFinal); if (!arc.target.isCompiled()) { // not yet compiled @SuppressWarnings("unchecked") final UnCompiledNode n = (UnCompiledNode) arc.target; + if (n.numArcs == 0) { + //System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label); + arc.isFinal = 
n.isFinal = true; + } arc.target = compileNode(n); } } @@ -397,6 +563,7 @@ public int label; // really an "unsigned" byte public Node target; public boolean isFinal; + public boolean targetIsFullPrefix; // true if target is "full prefix" public T output; public T nextFinalOutput; } @@ -418,6 +585,8 @@ static final class UnCompiledNode implements Node { final Builder owner; + boolean doNotPrune; + boolean isFull; int numArcs; Arc[] arcs; T output; @@ -448,6 +617,8 @@ public void clear() { numArcs = 0; + isFull = false; + doNotPrune = false; isFinal = false; output = owner.NO_OUTPUT; inputCount = 0; @@ -479,9 +650,10 @@ arc.target = target; arc.output = arc.nextFinalOutput = owner.NO_OUTPUT; arc.isFinal = false; + arc.targetIsFullPrefix = false; } - public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) { + public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal, boolean isFull) { assert numArcs > 0; final Arc arc = arcs[numArcs-1]; assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch; @@ -489,6 +661,7 @@ //assert target.address != -2; arc.nextFinalOutput = nextFinalOutput; arc.isFinal = isFinal; + arc.targetIsFullPrefix = isFull; } public void deleteLast(int label, Node target) { @@ -515,6 +688,8 @@ assert owner.validOutput(arcs[arcIdx].output); } + //System.out.println("prependOutput=" + outputPrefix); + if (isFinal) { output = owner.fst.outputs.add(outputPrefix, output); assert owner.validOutput(output); Index: lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java --- lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java Thu Mar 03 12:25:14 2011 -0500 @@ -27,6 +27,10 @@ import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.automaton.fst.Builder.UnCompiledNode; +// NOTE: while the FST is able to represent a non-final +// dead-end state 
(NON_FINAL_END_NODE=0), the layres above +// (FSTEnum, Util) have problems with this!! + /** Represents an FST using a compact byte[] format. *

The format is similar to what's used by Morfologik * (http://sourceforge.net/projects/morfologik). @@ -48,6 +52,11 @@ // this when number of arcs is > NUM_ARCS_ARRAY: private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6; + // Set if the target of this arc is a "full prefix" node, + // meaning it has all outgoing arcs for all terms from + // this node (not just indexed terms): + private final static int BIT_NEXT_FULL_PREFIX = 1 << 7; + /** * @see #shouldExpand(UnCompiledNode) */ @@ -82,6 +91,7 @@ // produces this output T emptyOutput; private byte[] emptyOutputBytes; + boolean rootIsFullPrefix; private byte[] bytes; int byteUpto = 0; @@ -147,9 +157,13 @@ boolean isFinal() { return flag(BIT_FINAL_ARC); } + + boolean isTargetFullPrefix() { + return flag(BIT_NEXT_FULL_PREFIX); + } }; - static boolean flag(int flags, int bit) { + static final boolean flag(int flags, int bit) { return (flags & bit) != 0; } @@ -182,6 +196,7 @@ } else { emptyOutput = null; } + rootIsFullPrefix = in.readByte() == 1; final byte t = in.readByte(); switch(t) { case 0: @@ -255,6 +270,8 @@ throw new IllegalStateException("call finish first"); } CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT); + // TODO: really we should encode this as an arc, arriving + // to the root node, instead of special casing here: if (emptyOutput != null) { out.writeByte((byte) 1); out.writeVInt(emptyOutputBytes.length); @@ -262,6 +279,7 @@ } else { out.writeByte((byte) 0); } + out.writeByte((byte) (rootIsFullPrefix ? 
1:0)); final byte t; if (inputType == INPUT_TYPE.BYTE1) { t = 0; @@ -331,7 +349,8 @@ bytesPerArc = new int[ArrayUtil.oversize(node.numArcs, 1)]; } // write a "false" first arc: - writer.writeByte((byte) BIT_ARCS_AS_FIXED_ARRAY); + int flags = BIT_ARCS_AS_FIXED_ARRAY; + writer.writeByte((byte) flags); writer.writeVInt(node.numArcs); // placeholder -- we'll come back and write the number // of bytes per arc here: @@ -347,12 +366,17 @@ final int lastArc = node.numArcs-1; + // nocommit -- maybe all these APIs should take a thread + // state... that'd hold/reuse the bytes reader? + int lastArcStart = writer.posWrite; int maxBytesPerArc = 0; + //System.out.println(" fst.addNode numArcs=" + node.numArcs); for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; int flags = 0; + //System.out.println(" " + arcIdx + " of " + node.numArcs + ": arc=" + (char) arc.label + " targetIsFullPrefix=" + arc.targetIsFullPrefix); if (arcIdx == lastArc) { flags += BIT_LAST_ARC; @@ -362,6 +386,8 @@ flags += BIT_TARGET_NEXT; } + boolean targetHasArcs = target.address > 0; + if (arc.isFinal) { flags += BIT_FINAL_ARC; if (arc.nextFinalOutput != NO_OUTPUT) { @@ -371,7 +397,9 @@ assert arc.nextFinalOutput == NO_OUTPUT; } - boolean targetHasArcs = target.address > 0; + if (arc.targetIsFullPrefix) { + flags += BIT_NEXT_FULL_PREFIX; + } if (!targetHasArcs) { flags += BIT_STOP_NODE; @@ -450,10 +478,10 @@ } lastFrozenNode = endAddress - 1; + //System.out.println(" return node addr=" + (endAddress-1)); /* - System.out.println(" return node addr=" + (endAddress-1)); for(int i=endAddress-1;i>=startAddress;i--) { - System.out.println(" bytes[" + i + "]=" + bytes[i]); + //System.out.println(" bytes[" + i + "]=" + bytes[i]); } */ @@ -470,6 +498,10 @@ arc.flags = BIT_LAST_ARC; } + if (rootIsFullPrefix) { + arc.flags |= BIT_NEXT_FULL_PREFIX; + } + // If there are no nodes, ie, the FST only accepts the // empty string, then startNode is 0, and 
then readFirstTargetArc arc.target = startNode; @@ -505,7 +537,7 @@ // non-array: linear scan arc.bytesPerArc = 0; //System.out.println(" scan"); - while(!arc.isLast()) { + while((arc.flags & FST.BIT_LAST_ARC) == 0) { // skip this arc: readLabel(in); if (arc.flag(BIT_ARC_HAS_OUTPUT)) { @@ -585,13 +617,10 @@ * expanded array format. */ boolean isExpandedTarget(Arc follow) throws IOException { - if (follow.isFinal()) { + if (!targetHasArcs(follow)) { return false; } else { - final BytesReader in = getBytesReader(follow.target); - final byte b = in.readByte(); - - return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0; + return flag(getBytesReader(follow.target).readByte(), BIT_ARCS_AS_FIXED_ARRAY); } } @@ -654,6 +683,7 @@ in = getBytesReader(arc.nextArc); } arc.flags = in.readByte(); + //System.out.println(" arc.flags=" + Integer.toHexString(arc.flags) + " isFull=" + (arc.flags & BIT_NEXT_FULL_PREFIX)); arc.label = readLabel(in); if (arc.flag(BIT_ARC_HAS_OUTPUT)) { @@ -669,8 +699,13 @@ } if (arc.flag(BIT_STOP_NODE)) { - arc.target = FINAL_END_NODE; - arc.flags |= BIT_FINAL_ARC; + if (arc.flag(BIT_FINAL_ARC)) { + arc.target = FINAL_END_NODE; + } else { + arc.target = NON_FINAL_END_NODE; + } + // nocommit: hmm? + //arc.flags |= BIT_FINAL_ARC; arc.nextArc = in.pos; } else if (arc.flag(BIT_TARGET_NEXT)) { arc.nextArc = in.pos; Index: lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java --- lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java Thu Mar 03 12:25:14 2011 -0500 @@ -59,7 +59,7 @@ * current term and target term */ protected final void rewindPrefix() throws IOException { if (upto == 0) { - //System.out.println(" init"); + //System.out.println(" init: root full prefix=" + getArc(0).isTargetFullPrefix()); upto = 1; fst.readFirstTargetArc(getArc(0), getArc(1)); return; @@ -112,8 +112,6 @@ /** Seeks to smallest term that's >= target. 
*/ protected void doSeekCeil() throws IOException { - //System.out.println(" advance len=" + target.length + " curlen=" + current.length); - // TODO: possibly caller could/should provide common // prefix length? ie this work may be redundant if // caller is in fact intersecting against its own @@ -130,6 +128,8 @@ int targetLabel = getTargetLabel(); //System.out.println(" init targetLabel=" + targetLabel); + lastSeekCeilIsFull = false; + // Now scan forward, matching the new suffix of the target while(true) { @@ -144,14 +144,14 @@ int low = arc.arcIdx; int high = arc.numArcs-1; int mid = 0; - //System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel); + //System.out.println(" do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel); boolean found = false; while (low <= high) { mid = (low + high) >>> 1; in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; final int midLabel = fst.readLabel(in); final int cmp = midLabel - targetLabel; - //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); + //System.out.println(" test low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); if (cmp < 0) low = mid + 1; else if (cmp > 0) @@ -172,6 +172,8 @@ assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid; output[upto] = fst.outputs.add(output[upto-1], arc.output); if (targetLabel == FST.END_LABEL) { + // exact match + lastSeekCeilIsFull = true; return; } setCurrentLabel(arc.label); @@ -213,6 +215,7 @@ // recurse output[upto] = fst.outputs.add(output[upto-1], arc.output); if (targetLabel == FST.END_LABEL) { + lastSeekCeilIsFull = true; return; } setCurrentLabel(arc.label); @@ -228,6 +231,7 @@ upto--; while(true) { if (upto == 0) { + lastSeekCeilIsFull = true; return; } final FST.Arc prevArc = getArc(upto); @@ -248,6 +252,20 @@ } } + private boolean lastSeekCeilIsFull; + + 
/** Returns true if.. */ + public boolean lastSeekCeilIsFull() { + return lastSeekCeilIsFull; + } + + private boolean lastSeekFloorMayExist; + + // nocommit -- make seekCeil work too? + public boolean lastSeekFloorMayExist() { + return lastSeekFloorMayExist; + } + // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND / // SEEK_END)? saves the eq check above? /** Seeks to largest term that's <= target. */ @@ -268,11 +286,13 @@ FST.Arc arc = getArc(upto); int targetLabel = getTargetLabel(); - //System.out.println("FE: init targetLabel=" + targetLabel); + lastSeekFloorMayExist = true; + + //System.out.println("FE: init targetLabel=" + (char) targetLabel + " " + (char) targetLabel + " " + targetLabel); // Now scan forward, matching the new suffix of the target while(true) { - //System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast()); + //System.out.println(" cycle upto=" + upto + " targetIsFullPrefix=" + arc.isTargetFullPrefix() + " arc.label=" + arc.label + " (" + (char) arc.label + ") arc.target=" + arc.target + " targetLabel=" + targetLabel + " (" + (char) targetLabel + ") isLast?=" + arc.isLast()); if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) { // Arcs are fixed array -- use binary search to find @@ -282,14 +302,14 @@ int low = arc.arcIdx; int high = arc.numArcs-1; int mid = 0; - //System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel); + //System.out.println(" do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel); boolean found = false; while (low <= high) { mid = (low + high) >>> 1; in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; final int midLabel = fst.readLabel(in); final int cmp = midLabel - targetLabel; - //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); + //System.out.println(" test low=" + low + 
" high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); if (cmp < 0) low = mid + 1; else if (cmp > 0) @@ -299,7 +319,7 @@ break; } } - + // NOTE: this code is dup'd w/ the code below (in // the outer else clause): if (found) { @@ -307,6 +327,7 @@ //System.out.println(" match! arcIdx=" + mid); arc.arcIdx = mid-1; fst.readNextRealArc(arc); + //System.out.println(" after read isFullPrefix=" + arc.isTargetFullPrefix()); assert arc.arcIdx == mid; assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid; output[upto] = fst.outputs.add(output[upto-1], arc.output); @@ -320,6 +341,7 @@ continue; } else if (high == -1) { //System.out.println(" before first"); + lastSeekFloorMayExist = !getArc(upto-1).isTargetFullPrefix(); // Very first arc is after our target // TODO: if each arc could somehow read the arc just // before, we can save this re-scan. The ceil case @@ -346,6 +368,7 @@ arc = getArc(upto); } } else { + lastSeekFloorMayExist = !getArc(upto-1).isTargetFullPrefix(); // There is a floor arc: arc.arcIdx = (low > high ? high : low)-1; //System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1)); @@ -368,6 +391,10 @@ arc = fst.readFirstTargetArc(arc, getArc(upto)); targetLabel = getTargetLabel(); } else if (arc.label > targetLabel) { + // We only hit this if very first arc is already + // beyond our label + //System.out.println(" s0"); + lastSeekFloorMayExist = !getArc(upto-1).isTargetFullPrefix(); // TODO: if each arc could somehow read the arc just // before, we can save this re-scan. 
The ceil case // doesn't need this because it reads the next arc @@ -395,6 +422,9 @@ } else if (!arc.isLast()) { //System.out.println(" check next label=" + fst.readNextArcLabel(arc) + " (" + (char) fst.readNextArcLabel(arc) + ")"); if (fst.readNextArcLabel(arc) > targetLabel) { + //System.out.println(" s1"); + lastSeekFloorMayExist = !getArc(upto-1).isTargetFullPrefix(); + //System.out.println(" set lastSeekFloorMayExist=" + lastSeekFloorMayExist); pushLast(); return; } else { @@ -402,6 +432,8 @@ fst.readNextArc(arc); } } else { + lastSeekFloorMayExist = !getArc(upto-1).isTargetFullPrefix(); + //System.out.println(" s2 upto=" + upto + " target=" + getArc(upto-1).target + " lsme=" + lastSeekFloorMayExist + " lastTarget=" + getArc(upto-1).target); pushLast(); return; } @@ -430,19 +462,35 @@ // appending first arc all the way to the final node private void pushFirst() throws IOException { + //System.out.println("pushFirst"); + FST.Arc arc = arcs[upto]; assert arc != null; + // nocommit ob1? + lastSeekCeilIsFull = arcs[upto-1].isTargetFullPrefix(); + while (true) { + //System.out.println(" cycle upto=" + upto + " label=" + (char) arc.label + " " + arc.label + " output=" + fst.outputs.outputToString(arc.output) + " target=" + arc.target + " isFinal=" + arc.isFinal() + " nfo=" + arc.nextFinalOutput + " lastSeekCeilIsFull=" + lastSeekCeilIsFull); output[upto] = fst.outputs.add(output[upto-1], arc.output); if (arc.label == FST.END_LABEL) { // Final node + //System.out.println(" done output=" + fst.outputs.outputToString(output[upto])); break; } - //System.out.println(" pushFirst label=" + (char) arc.label + " upto=" + upto + " output=" + fst.outputs.outputToString(output[upto])); setCurrentLabel(arc.label); incr(); + // We |= here instead of just = because if we go + // through any node that is not a full prefix, that + // "breaks" the chain. However, today, the FST is + // always constructed such that only the last node + // would break the chain. 
At, at most a non-leaf node + // will have prefix-arcs that are "final" (do not lead + // to other nodes). We only keep an non-leaf node if + // it participates in a "real" term. + lastSeekCeilIsFull |= arc.isTargetFullPrefix(); + final FST.Arc nextArc = getArc(upto); fst.readFirstTargetArc(arc, nextArc); arc = nextArc; Index: lucene/src/java/org/apache/lucene/util/automaton/fst/NodeHash.java --- lucene/src/java/org/apache/lucene/util/automaton/fst/NodeHash.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/NodeHash.java Thu Mar 03 12:25:14 2011 -0500 @@ -45,7 +45,8 @@ !arc.output.equals(scratchArc.output) || ((Builder.CompiledNode) arc.target).address != scratchArc.target || !arc.nextFinalOutput.equals(scratchArc.nextFinalOutput) || - arc.isFinal != scratchArc.isFinal()) { + arc.isFinal != scratchArc.isFinal() || + scratchArc.isTargetFullPrefix() != arc.targetIsFullPrefix) { return false; } @@ -79,6 +80,9 @@ if (arc.isFinal) { h += 17; } + if (arc.targetIsFullPrefix) { + h += 41; + } } //System.out.println(" ret " + (h&Integer.MAX_VALUE)); return h & Integer.MAX_VALUE; @@ -99,6 +103,9 @@ if (scratchArc.isFinal()) { h += 17; } + if (scratchArc.isTargetFullPrefix()) { + h += 41; + } if (scratchArc.isLast()) { break; } @@ -109,7 +116,7 @@ } public int add(Builder.UnCompiledNode node) throws IOException { - // System.out.println("hash: add count=" + count + " vs " + table.length); + //System.out.println("hash: add count=" + count + " vs " + table.length); final int h = hash(node); int pos = h & mask; int c = 0; Index: lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java --- lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java Thu Mar 03 12:25:14 2011 -0500 @@ -191,6 +191,10 @@ throws IOException { final String expandedNodeColor = "blue"; + final String fullPrefixNodeColor = "red"; + + final String 
fullPrefixAndExpandedNodeColor = "purple"; + // This is the start arc in the automaton (from the epsilon state to the first state // with outgoing transitions. final FST.Arc startArc = fst.getFirstArc(new FST.Arc()); @@ -211,6 +215,7 @@ // Shape for states. final String stateShape = "circle"; + final String finalStateShape = "doublecircle"; // Emit DOT prologue. out.write("digraph FST {\n"); @@ -221,12 +226,39 @@ } emitDotState(out, "initial", "point", "white", ""); - emitDotState(out, Integer.toString(startArc.target), stateShape, - fst.isExpandedTarget(startArc) ? expandedNodeColor : null, - ""); + + final T NO_OUTPUT = fst.outputs.getNoOutput(); + + final FST.Arc scratchArc = new FST.Arc(); + + { + final String stateColor; + if (fst.isExpandedTarget(startArc)) { + if (startArc.isTargetFullPrefix()) { + stateColor = fullPrefixAndExpandedNodeColor; + } else { + stateColor = expandedNodeColor; + } + } else if (startArc.isTargetFullPrefix()) { + stateColor = fullPrefixNodeColor; + } else { + stateColor = null; + } + + final boolean isFinal; + final T finalOutput; + if (startArc.isFinal()) { + isFinal = true; + finalOutput = startArc.nextFinalOutput == NO_OUTPUT ? null : startArc.nextFinalOutput; + } else { + isFinal = false; + finalOutput = null; + } + + emitDotState(out, Integer.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? 
"" : fst.outputs.outputToString(finalOutput)); + } out.write(" initial -> " + startArc.target + "\n"); - final T NO_OUTPUT = fst.outputs.getNoOutput(); int level = 0; while (!nextLevelQueue.isEmpty()) { @@ -238,19 +270,53 @@ out.write("\n // Transitions and states at level: " + level + "\n"); while (!thisLevelQueue.isEmpty()) { final FST.Arc arc = thisLevelQueue.remove(thisLevelQueue.size() - 1); - if (fst.targetHasArcs(arc)) { // scan all arcs final int node = arc.target; fst.readFirstTargetArc(arc, arc); - + + if (arc.label == FST.END_LABEL) { + // Skip it -- prior recursion took this into account already + assert !arc.isLast(); + fst.readNextArc(arc); + } + while (true) { + // Emit the unseen state and add it to the queue for the next level. if (arc.target >= 0 && !seen.get(arc.target)) { - final boolean isExpanded = fst.isExpandedTarget(arc); - emitDotState(out, Integer.toString(arc.target), stateShape, - isExpanded ? expandedNodeColor : null, - labelStates ? Integer.toString(arc.target) : ""); + /* + boolean isFinal = false; + T finalOutput = null; + fst.readFirstTargetArc(arc, scratchArc); + if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) { + // target is final + isFinal = true; + finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output; + System.out.println("dot hit final label=" + (char) scratchArc.label); + } + */ + final String stateColor; + if (fst.isExpandedTarget(arc)) { + if (arc.isTargetFullPrefix()) { + stateColor = fullPrefixAndExpandedNodeColor; + } else { + stateColor = expandedNodeColor; + } + } else if (arc.isTargetFullPrefix()) { + stateColor = fullPrefixNodeColor; + } else { + stateColor = null; + } + + final String finalOutput; + if (arc.nextFinalOutput != null && arc.nextFinalOutput != NO_OUTPUT) { + finalOutput = fst.outputs.outputToString(arc.nextFinalOutput); + } else { + finalOutput = ""; + } + + emitDotState(out, Integer.toString(arc.target), arc.isFinal() ? 
finalStateShape : stateShape, stateColor, finalOutput); seen.set(arc.target); nextLevelQueue.add(new FST.Arc().copyFrom(arc)); sameLevelStates.add(arc.target); @@ -263,14 +329,19 @@ outs = ""; } - final String cl; - if (arc.label == FST.END_LABEL) { - cl = "~"; - } else { - cl = printableLabel(arc.label); + if (!fst.targetHasArcs(arc) && arc.isFinal() && arc.nextFinalOutput != NO_OUTPUT) { + // nocommit -- this is broken again? + // Tricky special case: sometimes, due to + // pruning, the builder can [sillily] produce + // an FST with an arc into the final end state + // (-1) but also with a next final output; in + // this case we pull that output up onto this + // arc + outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]"; } - out.write(" " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]\n"); + assert arc.label != FST.END_LABEL; + out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"]\n"); // Break the loop if we're on the last arc of this state. 
if (arc.isLast()) { Index: lucene/src/test/org/apache/lucene/TestExternalCodecs.java --- lucene/src/test/org/apache/lucene/TestExternalCodecs.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/test/org/apache/lucene/TestExternalCodecs.java Thu Mar 03 12:25:14 2011 -0500 @@ -305,7 +305,7 @@ } @Override - public SeekStatus seek(BytesRef term, boolean useCache) { + public SeekStatus seek(BytesRef term, boolean useCache, boolean exactOnly /* ignored */) { current = term.utf8ToString(); it = null; if (ramField.termToDocs.containsKey(current)) { Index: lucene/src/test/org/apache/lucene/index/TestMultiFields.java --- lucene/src/test/org/apache/lucene/index/TestMultiFields.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/test/org/apache/lucene/index/TestMultiFields.java Thu Mar 03 12:25:14 2011 -0500 @@ -29,10 +29,15 @@ int num = 2 * RANDOM_MULTIPLIER; for (int iter = 0; iter < num; iter++) { + if (VERBOSE) { + System.out.println("TEST: iter=" + iter); + } + Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(NoMergePolicy.COMPOUND_FILES)); _TestUtil.keepFullyDeletedSegments(w); + w.setInfoStream(VERBOSE ? 
System.out : null); Map> docs = new HashMap>(); Set deleted = new HashSet(); @@ -80,7 +85,7 @@ if (VERBOSE) { List termsList = new ArrayList(uniqueTerms); Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator()); - System.out.println("UTF16 order:"); + System.out.println("TEST: terms in UTF16 order:"); for(BytesRef b : termsList) { System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString())); } @@ -88,7 +93,9 @@ IndexReader reader = w.getReader(); w.close(); - //System.out.println("TEST reader=" + reader); + if (VERBOSE) { + System.out.println("TEST: reader=" + reader); + } Bits delDocs = MultiFields.getDeletedDocs(reader); for(int delDoc : deleted) { @@ -99,7 +106,7 @@ for(int i=0;i<100;i++) { BytesRef term = terms.get(random.nextInt(terms.size())); if (VERBOSE) { - System.out.println("TEST: seek to term= "+ UnicodeUtil.toHexString(term.utf8ToString())); + System.out.println("TEST: seek to term="+ UnicodeUtil.toHexString(term.utf8ToString()) + " " + term); } DocsEnum docsEnum = terms2.docs(delDocs, term, null); Index: lucene/src/test/org/apache/lucene/index/TestPerSegmentDeletes.java --- lucene/src/test/org/apache/lucene/index/TestPerSegmentDeletes.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/test/org/apache/lucene/index/TestPerSegmentDeletes.java Thu Mar 03 12:25:14 2011 -0500 @@ -226,7 +226,7 @@ Fields fields = MultiFields.getFields(reader); Terms cterms = fields.terms(term.field); TermsEnum ctermsEnum = cterms.iterator(); - SeekStatus ss = ctermsEnum.seek(new BytesRef(term.text()), false); + SeekStatus ss = ctermsEnum.seek(new BytesRef(term.text()), false, true); if (ss.equals(SeekStatus.FOUND)) { DocsEnum docsEnum = ctermsEnum.docs(bits, null); return toArray(docsEnum); Index: lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java --- lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java Thu Mar 03 11:42:17 2011 -0500 +++ 
lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java Thu Mar 03 12:25:14 2011 -0500 @@ -147,11 +147,14 @@ } for (int i=1;i<10;i++) { query = newPhraseQuery("field", English.intToEnglish(i)+" hundred", true, new AveragePayloadFunction()); + if (VERBOSE) { + System.out.println("TEST: run query=" + query); + } // all should have score = 3 because adjacent terms have payloads of 2,4 // and all the similarity factors are set to 1 hits = searcher.search(query, null, 100); assertTrue("hits is null and it shouldn't be", hits != null); - assertTrue("should be 100 hits", hits.totalHits == 100); + assertEquals("should be 100 hits", 100, hits.totalHits); for (int j = 0; j < hits.scoreDocs.length; j++) { ScoreDoc doc = hits.scoreDocs[j]; // System.out.println("Doc: " + doc.toString()); Index: lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java --- lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java Thu Mar 03 11:42:17 2011 -0500 +++ lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java Thu Mar 03 12:25:14 2011 -0500 @@ -25,6 +25,7 @@ import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.Writer; +import java.io.StringWriter; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -38,13 +39,19 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.SerialMergeScheduler; +import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.search.IndexSearcher; +import 
org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.IndexInput; @@ -127,7 +134,7 @@ } public void testBasicFSA() throws IOException { - String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"}; + String[] strings = new String[] {"commotion", "elastic", "elation", "ftation", "ftop", "plastic", "stat", "station", "stop"}; String[] strings2 = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation"}; IntsRef[] terms = new IntsRef[strings.length]; IntsRef[] terms2 = new IntsRef[strings2.length]; @@ -189,6 +196,59 @@ assertEquals(24, fst.getNodeCount()); assertEquals(30, fst.getArcCount()); } + + // Test next prefix + { + final NoOutputs outputs = NoOutputs.getSingleton(); + final Object NO_OUTPUT = outputs.getNoOutput(); + BytesRef term; + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, outputs); + final double v = random.nextDouble(); + builder.setMaxRatioPrefixArcs(v); + if (VERBOSE) { + System.out.println("TEST: now test nextPossiblePrefix maxRatioPrefixArcs=" + v); + } + + for(String term0 : strings) { + builder.add(new BytesRef(term0), NO_OUTPUT, true); + } + FST fst = builder.finish(); + + if (VERBOSE && fst != null) { + Writer w = new OutputStreamWriter(new FileOutputStream("basicprefix.dot"), "UTF-8"); + Util.toDot(fst, w, false, false); + w.close(); + System.out.println("SAVED basicprefix.dot"); + } + + BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); + fstEnum.seekFloor(new BytesRef("station")); + assertTrue(fstEnum.lastSeekFloorMayExist()); + + if (VERBOSE) { + System.out.println("TEST: seek elaz"); + } + BytesRefFSTEnum.InputOutput hit = fstEnum.seekCeil(new BytesRef("elaz")); + assertNotNull(hit); + assertEquals(new BytesRef("ftation"), hit.input); + assertTrue(fstEnum.lastSeekCeilIsFull()); + + if (VERBOSE) { + 
System.out.println("TEST: seek d"); + } + hit = fstEnum.seekCeil(new BytesRef("d")); + assertNotNull(hit); + assertEquals(new BytesRef("elastic"), hit.input); + assertTrue(fstEnum.lastSeekCeilIsFull()); + + if (VERBOSE) { + System.out.println("TEST: seek pla"); + } + hit = fstEnum.seekCeil(new BytesRef("pla")); + assertNotNull(hit); + assertTrue(fstEnum.lastSeekCeilIsFull()); + assertEquals(new BytesRef("plastic"), hit.input); + } } } @@ -427,7 +487,7 @@ final Builder builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, prune1, prune2, - prune1==0 && prune2==0, outputs); + prune1==0 && prune2==0, false, outputs); for(InputOutput pair : pairs) { builder.add(pair.input, pair.output); @@ -491,7 +551,7 @@ assertNotNull(fst); - // visit valid paris in order -- make sure all words + // visit valid pairs in order -- make sure all words // are accepted, and FSTEnum's next() steps through // them correctly if (VERBOSE) { @@ -549,8 +609,8 @@ final IntsRef term = toIntsRef(getRandomString(), inputMode); int pos = Collections.binarySearch(pairs, new InputOutput(term, null)); if (pos < 0) { + // ok doesn't exist pos = -(pos+1); - // ok doesn't exist //System.out.println(" seek " + inputToString(inputMode, term)); final IntsRefFSTEnum.InputOutput seekResult; if (random.nextBoolean()) { @@ -721,6 +781,11 @@ T finalOutput; boolean isLeaf = true; boolean isFinal; + + @Override + public String toString() { + return "count=" + count + " output=" + output + " finalOutput=" + finalOutput + " isFinal=" + isFinal + " isLeaf=" + isLeaf; + } } // FST is pruned @@ -834,12 +899,15 @@ return; } - assertNotNull(fst); + if (fst == null) { + fail("prefixes.size()=" + prefixes.size() + " prefixes=" + prefixes + " prune1=" + prune1 + " prune2=" + prune2); + } // make sure FST only enums valid prefixes if (VERBOSE) { System.out.println("TEST: check pruned enum"); } + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); IntsRefFSTEnum.InputOutput current; while((current = 
fstEnum.next()) != null) { @@ -966,7 +1034,11 @@ IndexReader r = IndexReader.open(writer, true); writer.close(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); - Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, false, outputs); + + if (VERBOSE) { + System.out.println("TEST: done build index"); + } boolean storeOrd = random.nextBoolean(); if (VERBOSE) { @@ -1058,12 +1130,75 @@ } } } + + _testNextPossiblePrefix(terms.iterator()); } + if (VERBOSE) { + System.out.println("TEST: now close reader"); + } r.close(); dir.close(); } + private void _testNextPossiblePrefix(TermsEnum termsEnum) throws Exception { + + final NoOutputs outputs = NoOutputs.getSingleton(); + final Object NO_OUTPUT = outputs.getNoOutput(); + BytesRef term; + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, outputs); + final double v = random.nextDouble(); + builder.setMaxRatioPrefixArcs(v); + if (VERBOSE) { + System.out.println("TEST: now test nextPossiblePrefix maxRatioPrefixArcs=" + v); + } + + final List terms = new ArrayList(); + while((term = termsEnum.next()) != null) { + //builder.add(term, NO_OUTPUT, random.nextInt(30) == 17); + builder.add(term, NO_OUTPUT, true); + terms.add(new BytesRef(term)); + } + + final FST fst = builder.finish(); + + if (VERBOSE) { + System.out.println("TEST: done build FST " + terms.size() + " terms"); + } + + final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); + for(BytesRef term0 : terms) { + fstEnum.seekFloor(term0); + assertTrue(fstEnum.lastSeekFloorMayExist()); + } + + for(int iter=0;iter(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, outputs); + builder = new Builder(inputMode == 0 ? 
FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, false, outputs); } protected abstract T getOutput(IntsRef input, int ord) throws IOException; @@ -1111,7 +1246,7 @@ } toIntsRef(w, inputMode, intsRef); builder.add(intsRef, - getOutput(intsRef, ord)); + getOutput(intsRef, ord), (ord % 32) == 0); ord++; if (ord % 500000 == 0) { @@ -1184,7 +1319,7 @@ } } - // java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out + // java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out public static void main(String[] args) throws IOException { final String wordsFileIn = args[0]; final String dirOut = args[1]; @@ -1273,7 +1408,7 @@ public void testSingleString() throws Exception { final Outputs outputs = NoOutputs.getSingleton(); - final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, false, outputs); b.add(new BytesRef("foobar"), outputs.getNoOutput()); final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(b.finish()); assertNull(fstEnum.seekFloor(new BytesRef("foo"))); @@ -1290,7 +1425,7 @@ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); // Build an FST mapping BytesRef -> Long - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, false, outputs); final BytesRef a = new BytesRef("a"); final BytesRef b = new BytesRef("b"); @@ -1324,6 +1459,63 @@ assertEquals(42, (long) seekResult.output); } + public void testPrimaryKeys() throws Exception { + Directory dir = newDirectory(); + + for(int cycle=0;cycle<2;cycle++) { + if (VERBOSE) { + System.out.println("TEST: cycle=" + cycle); + } + RandomIndexWriter w = new RandomIndexWriter(random, dir, + 
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(IndexWriterConfig.OpenMode.CREATE)); + Document doc = new Document(); + Field idField = newField("id", "", Field.Index.NOT_ANALYZED); + doc.add(idField); + + final long seed = random.nextLong(); + + final int NUM_IDS = 1000*RANDOM_MULTIPLIER; + final Set allIDs = new HashSet(); + for(int id=0;id allIDsList = new ArrayList(allIDs); + Collections.shuffle(allIDsList, random); + + // verify + for(String id : allIDsList) { + if (VERBOSE) { + System.out.println("TEST: id=" + id); + } + assertEquals("id=" + id, 1, s.search(new TermQuery(new Term("id", id)), 1).totalHits); + } + + r.close(); + } + dir.close(); + } + /** * Test state expansion (array format) on close-to-root states. Creates * synthetic input that has one expanded state on each level. @@ -1335,7 +1527,7 @@ FST compile(String[] lines) throws IOException { final NoOutputs outputs = NoOutputs.getSingleton(); final Object nothing = outputs.getNoOutput(); - final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, false, outputs); int line = 0; final BytesRef term = new BytesRef(); @@ -1404,4 +1596,32 @@ FST.Arc arc = fst.getFirstArc(new FST.Arc()); s.verifyStateAndBelow(fst, arc, 1); } + + public void testFinalOutputOnEndState() throws Exception { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, 2, 0, true, false, outputs); + builder.add("stat", outputs.get(17)); + builder.add("station", outputs.get(10)); + final FST fst = builder.finish(); + //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); + StringWriter w = new StringWriter(); + Util.toDot(fst, w, false, false); + w.close(); + assertTrue(w.toString().indexOf("label=\"t/[7]\"") != -1); + } + + public void testInternalFinalState() throws Exception { + final PositiveIntOutputs outputs = 
PositiveIntOutputs.getSingleton(true); + + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, outputs); + builder.add(new BytesRef("stat"), outputs.getNoOutput(), true); + builder.add(new BytesRef("station"), outputs.getNoOutput(), true); + final FST fst = builder.finish(); + StringWriter w = new StringWriter(); + //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); + Util.toDot(fst, w, false, false); + w.close(); + assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1); + } } Index: solr/src/java/org/apache/solr/handler/component/TermsComponent.java --- solr/src/java/org/apache/solr/handler/component/TermsComponent.java Thu Mar 03 11:42:17 2011 -0500 +++ solr/src/java/org/apache/solr/handler/component/TermsComponent.java Thu Mar 03 12:25:14 2011 -0500 @@ -162,7 +162,7 @@ BytesRef term = null; if (lowerBytes != null) { - if (termsEnum.seek(lowerBytes, true) == TermsEnum.SeekStatus.END) { + if (termsEnum.seek(lowerBytes, true, false) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); Index: solr/src/java/org/apache/solr/request/SimpleFacets.java --- solr/src/java/org/apache/solr/request/SimpleFacets.java Thu Mar 03 11:42:17 2011 -0500 +++ solr/src/java/org/apache/solr/request/SimpleFacets.java Thu Mar 03 12:25:14 2011 -0500 @@ -644,7 +644,7 @@ // facet.offset when sorting by index order. 
if (startTermBytes != null) { - if (termsEnum.seek(startTermBytes, true) == TermsEnum.SeekStatus.END) { + if (termsEnum.seek(startTermBytes, true, false) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); Index: solr/src/java/org/apache/solr/request/UnInvertedField.java --- solr/src/java/org/apache/solr/request/UnInvertedField.java Thu Mar 03 11:42:17 2011 -0500 +++ solr/src/java/org/apache/solr/request/UnInvertedField.java Thu Mar 03 12:25:14 2011 -0500 @@ -1111,7 +1111,7 @@ } @Override - public SeekStatus seek(BytesRef target, boolean useCache) { + public SeekStatus seek(BytesRef target, boolean useCache, boolean exactOnly) { throw new UnsupportedOperationException(); } } Index: solr/src/java/org/apache/solr/search/function/FileFloatSource.java --- solr/src/java/org/apache/solr/search/function/FileFloatSource.java Thu Mar 03 11:42:17 2011 -0500 +++ solr/src/java/org/apache/solr/search/function/FileFloatSource.java Thu Mar 03 12:25:14 2011 -0500 @@ -261,7 +261,7 @@ continue; // go to next line in file.. leave values as default. } - if (termsEnum.seek(internalKey, false) != TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seek(internalKey, false, true) != TermsEnum.SeekStatus.FOUND) { if (notFoundCount<10) { // collect first 10 not found for logging notFound.add(key); }