Index: TODO =================================================================== --- TODO (revision 1157907) +++ TODO (working copy) @@ -16,12 +16,9 @@ - test perf on single seg index too! - try *larger* maxItemsInBlock: could give better net perf? ie less seeking and more scanning - - run fuzzy q under profiler test if cutting over prefix query to .intersect is faster -"upgrade" postings reader/writer API to match block tree; then non-tree codec uses it too and we only need 1 impl of each postings - what to do about short terms that "force" a block to mark itself has hasTerms!!?? - maybe instead of "isLeafBlock" bit we encode "countUntilNextTerm"? this way, for a block that only has empty string term we can stop scanning quickly? - maybe make a "terms block cache" that holds low-prefix LRU term blocks in ram...? @@ -30,10 +27,6 @@ try forcing no hasTerms if depth < 2? -automaton q should apply maxlength test on each scan'd term - -intersect should use suffix ref - LATER: - maybe blocks should NOT store sub-block pointers? it's reudundant w/ the index... - hmm: maybe switch PKLookupTask to intersect!? do we have fast string builder? Index: lucene/src/test/org/apache/lucene/index/TestTermsEnum.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestTermsEnum.java (revision 1157907) +++ lucene/src/test/org/apache/lucene/index/TestTermsEnum.java (working copy) @@ -270,7 +270,7 @@ } a = DaciukMihovAutomatonBuilder.build(sortedAcceptTerms); } - final CompiledAutomaton c = new CompiledAutomaton(a, true); + final CompiledAutomaton c = new CompiledAutomaton(a, true, true); final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()]; final Set acceptTermsSet = new HashSet(); Index: lucene/src/test/org/apache/lucene/index/TestTermsEnum2.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestTermsEnum2.java (revision 1157907) +++ lucene/src/test/org/apache/lucene/index/TestTermsEnum2.java (working copy) @@ -165,7 +165,7 @@ for (int i = 0; i < numIterations; i++) { String reg = AutomatonTestUtil.randomRegexp(random); Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton(); - CompiledAutomaton ca = new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton)); + CompiledAutomaton ca = new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton), true); TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null); Automaton expected = BasicOperations.intersection(termsAutomaton, automaton); TreeSet found = new TreeSet(); Index: lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java =================================================================== --- lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java (revision 1157907) +++ lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java (working copy) @@ -36,7 +36,7 @@ } Automaton a = BasicOperations.union(as); a.determinize(); - return new CompiledAutomaton(a, true); + return new CompiledAutomaton(a, true, true); } private void testFloor(CompiledAutomaton c, String input, String expected) { Index: lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java =================================================================== --- lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (revision 1157907) +++ lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (working copy) @@ -1707,6 +1707,7 @@ StringWriter w = new StringWriter(); Util.toDot(fst, w, false, false); w.close(); + //System.out.println(w.toString()); assertTrue(w.toString().indexOf("label=\"t/[7]\"") != -1); } @@ -1721,6 +1722,7 @@ //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); Util.toDot(fst, w, false, false); w.close(); + //System.out.println(w.toString()); assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1); } Index: lucene/src/java/org/apache/lucene/search/FuzzyQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FuzzyQuery.java (revision 1157907) +++ lucene/src/java/org/apache/lucene/search/FuzzyQuery.java (working copy) @@ -138,7 +138,7 @@ @Override protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { if (!termLongEnough) { // can only match if it's exact - return new SingleTermsEnum(terms.iterator(), term); + return new SingleTermsEnum(terms.iterator(), term.bytes()); } return new FuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength); } Index: lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java (revision 1157907) +++ lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java (working copy) @@ -34,9 +34,9 @@ private final BytesRef prefixRef; - public PrefixTermsEnum(TermsEnum tenum, Term prefix) throws IOException { + public PrefixTermsEnum(TermsEnum tenum, BytesRef prefixText) throws IOException { super(tenum); - setInitialSeekTerm(prefixRef = prefix.bytes()); + setInitialSeekTerm(this.prefixRef = prefixText); } @Override Index: lucene/src/java/org/apache/lucene/search/PrefixQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/PrefixQuery.java (revision 1157907) +++ lucene/src/java/org/apache/lucene/search/PrefixQuery.java (working copy) @@ -51,7 +51,7 @@ // no prefix -- match all terms for this field: return tenum; } - return new PrefixTermsEnum(tenum, prefix); + return new PrefixTermsEnum(tenum, prefix.bytes()); } /** Prints a user-readable version of this query. */ Index: lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java =================================================================== --- lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java (revision 1157907) +++ lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java (working copy) @@ -89,13 +89,13 @@ // for assert: private BytesRef lastTerm; - private boolean compareToLastTerm(BytesRef t) { + private boolean compareToLastTerm(BytesRef t) throws IOException { if (lastTerm == null && t != null) { lastTerm = new BytesRef(t); } else if (t == null) { lastTerm = null; } else { - assert lastTerm.compareTo(t) < 0: "lastTerm=" + lastTerm + " t=" + t; + assert termsEnum.getComparator().compare(lastTerm, t) < 0: "lastTerm=" + lastTerm + " t=" + t; lastTerm.copy(t); } return true; @@ -106,10 +106,7 @@ final float boost = boostAtt.getBoost(); // make sure within a single seg we always collect - // terms in order -- nocommit dangerous, if codec - // uses its own termComp? should we nuke that - // ability? terms must always be "unsigned byte[] - // order"? + // terms in order assert compareToLastTerm(bytes); //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord); @@ -160,11 +157,7 @@ final Q q = getTopLevelQuery(); final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]); ArrayUtil.mergeSort(scoreTerms, scoreTermSortByTermComp); - // nocommit -- how come I see 2 seekExacts for the same - // term in a row in the terms dict, around here...? - // could this be if we are using a non-intersect() capable terms impl (e.g. preflex) - // that its the off-by-one regarding 'start term' ? are we seeking backwards!!!!!! for (final ScoreTerm st : scoreTerms) { final Term term = new Term(query.field, st.bytes); assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq() + " term=" + term; Index: lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java (revision 1157907) +++ lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java (working copy) @@ -39,10 +39,10 @@ * After calling the constructor the enumeration is already pointing to the term, * if it exists. */ - public SingleTermsEnum(TermsEnum tenum, Term singleTerm) throws IOException { + public SingleTermsEnum(TermsEnum tenum, BytesRef termText) throws IOException { super(tenum); - singleRef = singleTerm.bytes(); - setInitialSeekTerm(singleRef); + singleRef = termText; + setInitialSeekTerm(termText); } @Override Index: lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (revision 1157907) +++ lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (working copy) @@ -154,6 +154,9 @@ } } + // for assert + private boolean automataBuilt; + /** initialize levenshtein DFAs up to maxDistance, if possible */ private List initAutomata(int maxDistance) { final List runAutomata = dfaAtt.automata(); @@ -172,8 +175,11 @@ UnicodeUtil.newString(termText, 0, realPrefixLength)); a = BasicOperations.concatenate(prefix, a); } - runAutomata.add(new CompiledAutomaton(a, true)); + runAutomata.add(new CompiledAutomaton(a, true, true)); } + + assert !automataBuilt; + automataBuilt = true; } return runAutomata; } Index: lucene/src/java/org/apache/lucene/search/AutomatonQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/AutomatonQuery.java (revision 1157907) +++ lucene/src/java/org/apache/lucene/search/AutomatonQuery.java (working copy) @@ -51,20 +51,10 @@ public class AutomatonQuery extends MultiTermQuery { /** the automaton to match index terms against */ protected final Automaton automaton; + protected final CompiledAutomaton compiled; /** term containing the field, and possibly some pattern structure */ protected final Term term; - /** - * abstraction for returning a termsenum: - * in the ctor the query computes one of these, the actual - * implementation depends upon the automaton's structure. - */ - private abstract class TermsEnumFactory { - protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException; - } - - private final TermsEnumFactory factory; - /** * Create a new AutomatonQuery from an {@link Automaton}. * @@ -77,72 +67,12 @@ super(term.field()); this.term = term; this.automaton = automaton; - - if (BasicOperations.isEmpty(automaton)) { - // matches nothing - factory = new TermsEnumFactory() { - @Override - protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { - return TermsEnum.EMPTY; - } - }; - } else if (BasicOperations.isTotal(automaton)) { - // matches all possible strings - factory = new TermsEnumFactory() { - @Override - protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { - return terms.iterator(); - } - }; - } else { - final String singleton; - final String commonPrefix; - - if (automaton.getSingleton() == null) { - commonPrefix = SpecialOperations.getCommonPrefix(automaton); - if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) { - singleton = commonPrefix; - } else { - singleton = null; - } - } else { - commonPrefix = null; - singleton = automaton.getSingleton(); - } - - if (singleton != null) { - // matches a fixed string in singleton or expanded representation - factory = new TermsEnumFactory() { - @Override - protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { - return new SingleTermsEnum(terms.iterator(), new Term(field, singleton)); - } - }; - } else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate( - BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) { - // matches a constant prefix - factory = new TermsEnumFactory() { - @Override - protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { - return new PrefixTermsEnum(terms.iterator(), new Term(field, commonPrefix)); - } - }; - } else { - final CompiledAutomaton compiled = - new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton)); - factory = new TermsEnumFactory() { - @Override - protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { - return terms.intersect(compiled, null); - } - }; - } - } + this.compiled = new CompiledAutomaton(automaton); } @Override protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { - return factory.getTermsEnum(terms, atts); + return compiled.getTermsEnum(terms); } @Override Index: lucene/src/java/org/apache/lucene/index/Terms.java =================================================================== --- lucene/src/java/org/apache/lucene/index/Terms.java (revision 1157907) +++ lucene/src/java/org/apache/lucene/index/Terms.java (working copy) @@ -55,6 +55,9 @@ // TODO: eventually we could support seekCeil/Exact on // the returned enum, instead of only being able to seek // at the start + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } if (startTerm == null) { return new AutomatonTermsEnum(iterator(), compiled); } else { Index: lucene/src/java/org/apache/lucene/index/AutomatonTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/AutomatonTermsEnum.java (revision 1157907) +++ lucene/src/java/org/apache/lucene/index/AutomatonTermsEnum.java (working copy) @@ -83,6 +83,7 @@ super(tenum); this.finite = compiled.finite; this.runAutomaton = compiled.runAutomaton; + assert this.runAutomaton != null; this.commonSuffixRef = compiled.commonSuffixRef; this.allTransitions = compiled.sortedTransitions; Index: lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java (revision 1157907) +++ lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java (working copy) @@ -270,6 +270,9 @@ try { return b.utf8ToString() + " " + b; } catch (Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: return b.toString(); } } @@ -482,9 +485,9 @@ @Override public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { - // nocommit -- must check suffix ref - // nocommit -- if A is infinte maybe fallback to ATE? - // need to test... + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } return new IntersectEnum(compiled, startTerm); } @@ -1045,17 +1048,40 @@ continue nextTerm; } - // nocommit -- in the case where common suffix - // len is > currentFrame.suffix we should check - // the suffix of the currentFrame.prefix too! - final int lenToCheck = Math.min(currentFrame.suffix, compiledAutomaton.commonSuffixRef.length); - final byte[] b1 = currentFrame.suffixBytes; - int pos1 = currentFrame.startBytePos + currentFrame.suffix - lenToCheck; - final byte[] b2 = compiledAutomaton.commonSuffixRef.bytes; - int pos2 = compiledAutomaton.commonSuffixRef.offset + compiledAutomaton.commonSuffixRef.length - lenToCheck; - final int pos2End = pos2 + lenToCheck; - while(pos2 < pos2End) { - if (b1[pos1++] != b2[pos2++]) { + final byte[] suffixBytes = currentFrame.suffixBytes; + final byte[] commonSuffixBytes = compiledAutomaton.commonSuffixRef.bytes; + + final int lenInPrefix = compiledAutomaton.commonSuffixRef.length - currentFrame.suffix; + final int suffixLenToCheck; + assert compiledAutomaton.commonSuffixRef.offset == 0; + int suffixBytesPos; + int commonSuffixBytesPos = 0; + + if (lenInPrefix > 0) { + // A prefix of the common suffix overlaps with + // the suffix of the block prefix so we first + // test whether the prefix part matches: + final byte[] termBytes = term.bytes; + int termBytesPos = currentFrame.prefix - lenInPrefix; + assert termBytesPos >= 0; + final int termBytesPosEnd = currentFrame.prefix; + while (termBytesPos < termBytesPosEnd) { + if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { + if (DEBUG) { + System.out.println(" skip: common suffix mismatch (in prefix)"); + } + continue nextTerm; + } + } + suffixBytesPos = currentFrame.startBytePos; + } else { + suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - compiledAutomaton.commonSuffixRef.length; + } + + // Test overlapping suffix part: + final int commonSuffixBytesPosEnd = compiledAutomaton.commonSuffixRef.length; + while (commonSuffixBytesPos < commonSuffixBytesPosEnd) { + if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { if (DEBUG) { System.out.println(" skip: common suffix mismatch"); } @@ -1064,6 +1090,12 @@ } } + // TODO: maybe we should do the same linear test + // that AutomatonTermsEnum does, so that if we + // reach a part of the automaton where .* is + // "temporarily" accepted, we just blindly .next() + // until the limit + // See if the term prefix matches the automaton: int state = currentFrame.state; for (int idx=0;idx slice = pending.subList(pending.size()-count, pending.size()); int lastLabel = -1; Index: lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java (revision 1157907) +++ lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java (working copy) @@ -17,10 +17,17 @@ * limitations under the License. */ +import java.io.IOException; import java.util.ArrayList; import java.util.List; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.codecs.BlockTreeTermsWriter; +import org.apache.lucene.search.PrefixTermsEnum; +import org.apache.lucene.search.SingleTermsEnum; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; /** @@ -31,29 +38,96 @@ * @lucene.experimental */ public class CompiledAutomaton { + public enum AUTOMATON_TYPE {NONE, ALL, SINGLE, PREFIX, NORMAL}; + public final AUTOMATON_TYPE type; + + // For PREFIX, this is the prefix term; for SINGLE this is + // the singleton term: + public final BytesRef term; + + // NOTE: the next 4 members are only non-null if type == + // NORMAL: public final ByteRunAutomaton runAutomaton; // TODO: would be nice if these sortedTransitions had "int // to;" instead of "State to;" somehow: public final Transition[][] sortedTransitions; public final BytesRef commonSuffixRef; - public final boolean finite; + public final Boolean finite; - // nocommit -- move pulling of a TermsEnum into here, so - // that we can optimize for cases where a simpler enume - // (prefix enum, all terms, no terms, etc.) can be used - - public CompiledAutomaton(Automaton automaton, boolean finite) { - Automaton utf8 = new UTF32ToUTF8().convert(automaton); - runAutomaton = new ByteRunAutomaton(utf8, true); - sortedTransitions = utf8.getSortedTransitions(); - this.finite = finite; - if (finite) { + public CompiledAutomaton(Automaton automaton) { + this(automaton, null, false); + } + + public CompiledAutomaton(Automaton automaton, Boolean finite, boolean forceCompile) { + if (!forceCompile && BasicOperations.isEmpty(automaton)) { + // matches nothing + type = AUTOMATON_TYPE.NONE; + term = null; commonSuffixRef = null; + runAutomaton = null; + sortedTransitions = null; + this.finite = null; + } else if (!forceCompile && BasicOperations.isTotal(automaton)) { + // matches all possible strings + type = AUTOMATON_TYPE.ALL; + term = null; + commonSuffixRef = null; + runAutomaton = null; + sortedTransitions = null; + this.finite = null; } else { - commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8); + final String commonPrefix; + final String singleton; + if (automaton.getSingleton() == null) { + commonPrefix = SpecialOperations.getCommonPrefix(automaton); + if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) { + singleton = commonPrefix; + } else { + singleton = null; + } + } else { + commonPrefix = null; + singleton = automaton.getSingleton(); + } + + if (!forceCompile && singleton != null) { + // matches a fixed string in singleton or expanded + // representation + type = AUTOMATON_TYPE.SINGLE; + term = new BytesRef(singleton); + commonSuffixRef = null; + runAutomaton = null; + sortedTransitions = null; + this.finite = null; + } else if (!forceCompile && BasicOperations.sameLanguage(automaton, BasicOperations.concatenate( + BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) { + // matches a constant prefix + type = AUTOMATON_TYPE.PREFIX; + term = new BytesRef(commonPrefix); + commonSuffixRef = null; + runAutomaton = null; + sortedTransitions = null; + this.finite = null; + } else { + type = AUTOMATON_TYPE.NORMAL; + term = null; + if (finite == null) { + this.finite = SpecialOperations.isFinite(automaton); + } else { + this.finite = finite; + } + Automaton utf8 = new UTF32ToUTF8().convert(automaton); + if (this.finite) { + commonSuffixRef = null; + } else { + commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8); + } + runAutomaton = new ByteRunAutomaton(utf8, true); + sortedTransitions = utf8.getSortedTransitions(); + } } } - + private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; private BytesRef addTail(int state, BytesRef term, int idx, int leadLabel) { @@ -109,6 +183,30 @@ } } + // TODO: should this take startTerm too? This way + // Terms.intersect could forward to this method if type != + // NORMAL: + public TermsEnum getTermsEnum(Terms terms) throws IOException { + switch(type) { + case NONE: + return TermsEnum.EMPTY; + case ALL: + return terms.iterator(); + case SINGLE: + return new SingleTermsEnum(terms.iterator(), term); + case PREFIX: + // TODO: this is very likely faster than .intersect, + // but we should test and maybe cutover + return new PrefixTermsEnum(terms.iterator(), term); + case NORMAL: + return terms.intersect(this, null); + } + + // not reachable but compiler disagrees: + assert false; + return null; + } + /** Finds largest term accepted by this Automaton, that's * <= the provided input term. The result is placed in * output; it's fine for output and input to point to Index: lucene/src/java/org/apache/lucene/util/fst/Util.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/Util.java (revision 1157907) +++ lucene/src/java/org/apache/lucene/util/fst/Util.java (working copy) @@ -318,7 +318,6 @@ } if (!fst.targetHasArcs(arc) && arc.isFinal() && arc.nextFinalOutput != NO_OUTPUT) { - // nocommit -- this is broken again? // Tricky special case: sometimes, due to // pruning, the builder can [sillily] produce // an FST with an arc into the final end state