Index: lucene/common-build.xml =================================================================== --- lucene/common-build.xml (revision 1397147) +++ lucene/common-build.xml (working copy) @@ -818,11 +818,11 @@ - + Index: lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java (revision 1397147) +++ lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java (working copy) @@ -35,7 +35,8 @@ String msg = "Test class requires enabled assertions, enable globally (-ea)" + " or for Solr/Lucene subpackages only: " + description.getClassName(); System.err.println(msg); - throw new Exception(msg); + // nocommit put back: + //throw new Exception(msg); } catch (AssertionError e) { // Ok, enabled. } Index: lucene/core/src/java/org/apache/lucene/util/fst/Util.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/Util.java (revision 1397147) +++ lucene/core/src/java/org/apache/lucene/util/fst/Util.java (working copy) @@ -22,6 +22,8 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.fst.FST.Arc; +import org.apache.lucene.util.fst.FST.BytesReader; /** Static helper methods. * @@ -304,7 +306,10 @@ path.input.ints[path.input.length++] = path.arc.label; final int cmp = bottom.input.compareTo(path.input); path.input.length--; + + // We should never see dups: assert cmp != 0; + if (cmp < 0) { // Doesn't compete return; @@ -329,12 +334,20 @@ //newPath.input.ints[path.input.length] = path.arc.label; //newPath.input.length = path.input.length+1; - //System.out.println(" add path=" + newPath); + //System.out.println(" add path=" + newPath + (bottom == null ? 
"" : (" newPath.compareTo(bottom)=" + newPath.compareTo(bottom))) + " bottom=" + bottom + " topN=" + topN); + + // We should never see dups: + assert bottom == null || newPath.compareTo(bottom) != 0; queue.add(newPath); + if (bottom != null) { final FSTPath removed = queue.pollLast(); assert removed == bottom; - bottom = queue.last(); + if (queue.size() == 0) { + bottom = null; + } else { + bottom = queue.last(); + } //System.out.println(" now re-set bottom: " + bottom + " queue=" + queue); } else if (queue.size() == topN) { // Queue just filled up: @@ -854,4 +867,92 @@ w.close(); } */ + + /** + * Reads the first arc greater or equal that the given label into the provided + * arc in place and returns it iff found, otherwise return null. + * + * @param label the label to ceil on + * @param fst the fst to operate on + * @param follow the arc to follow reading the label from + * @param arc the arc to read into in place + * @param in the fst's {@link BytesReader} + */ + public static Arc readCeilArc(int label, FST fst, Arc follow, + Arc arc, BytesReader in) throws IOException { + if (label == FST.END_LABEL) { + if (follow.isFinal()) { + if (follow.target <= 0) { + arc.flags = FST.BIT_LAST_ARC; + } else { + arc.flags = 0; + // NOTE: nextArc is a node (not an address!) in this case: + arc.nextArc = follow.target; + arc.node = follow.target; + } + arc.output = follow.nextFinalOutput; + arc.label = FST.END_LABEL; + return arc; + } else { + return null; + } + } + + if (!FST.targetHasArcs(follow)) { + return null; + } + fst.readFirstTargetArc(follow, arc, in); + if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) { + // Arcs are fixed array -- use binary search to find + // the target. 
+ + int low = arc.arcIdx; + int high = arc.numArcs - 1; + int mid = 0; + // System.out.println("do arc array low=" + low + " high=" + high + + // " targetLabel=" + targetLabel); + while (low <= high) { + mid = (low + high) >>> 1; + in.pos = arc.posArcsStart; + in.skip(arc.bytesPerArc * mid + 1); + final int midLabel = fst.readLabel(in); + final int cmp = midLabel - label; + // System.out.println(" cycle low=" + low + " high=" + high + " mid=" + + // mid + " midLabel=" + midLabel + " cmp=" + cmp); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + arc.arcIdx = mid-1; + return fst.readNextRealArc(arc, in); + } + } + if (low == arc.numArcs) { + // DEAD END! + return null; + } + + arc.arcIdx = (low > high ? high : low); + return fst.readNextRealArc(arc, in); + } + + // Linear scan + fst.readFirstRealTargetArc(follow.target, arc, in); + + while (true) { + // System.out.println(" non-bs cycle"); + // TODO: we should fix this code to not have to create + // object for the output of every arc we scan... 
only + // for the matching arc, if found + if (arc.label >= label) { + // System.out.println(" found!"); + return arc; + } else if (arc.isLast()) { + return null; + } else { + fst.readNextRealArc(arc, in); + } + } + } } Index: lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 1397147) +++ lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (working copy) @@ -240,6 +240,20 @@ a.deterministic = true; return a; } + + public static Automaton makeString(int[] word, int offset, int length) { + Automaton a = new Automaton(); + a.setDeterministic(true); + State s = new State(); + a.initial = s; + for (int i = offset; i < offset+length; i++) { + State s2 = new State(); + s.addTransition(new Transition(word[i], s2)); + s = s2; + } + s.accept = true; + return a; + } /** * Returns a new (deterministic and minimal) automaton that accepts the union Index: lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java (revision 1397147) +++ lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java (working copy) @@ -33,12 +33,13 @@ /** @lucene.internal */ public static final int MAXIMUM_SUPPORTED_DISTANCE = 2; /* input word */ - final String input; final int word[]; /* the automata alphabet. */ final int alphabet[]; + /* the maximum symbol in the alphabet (e.g. 256 for UTF-8 or 10FFFF for UTF-32) */ + final int alphaMax; - /* the unicode ranges outside of alphabet */ + /* the ranges outside of alphabet */ final int rangeLower[]; final int rangeUpper[]; int numRanges = 0; @@ -50,12 +51,15 @@ * Optionally count transpositions as a primitive edit. 
*/ public LevenshteinAutomata(String input, boolean withTranspositions) { - this.input = input; - int length = Character.codePointCount(input, 0, input.length()); - word = new int[length]; - for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) { - word[j++] = cp = input.codePointAt(i); - } + this(codePoints(input), Character.MAX_CODE_POINT, withTranspositions); + } + + /** + * Expert: Don't use this! + */ + public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) { + this.word = word; + this.alphaMax = alphaMax; // calculate the alphabet SortedSet set = new TreeSet(); @@ -81,9 +85,9 @@ lower = higher + 1; } /* add the final endpoint */ - if (lower <= Character.MAX_CODE_POINT) { + if (lower <= alphaMax) { rangeLower[numRanges] = lower; - rangeUpper[numRanges] = Character.MAX_CODE_POINT; + rangeUpper[numRanges] = alphaMax; numRanges++; } @@ -94,6 +98,15 @@ }; } + private static int[] codePoints(String input) { + int length = Character.codePointCount(input, 0, input.length()); + int word[] = new int[length]; + for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) { + word[j++] = cp = input.codePointAt(i); + } + return word; + } + /** * Compute a DFA that accepts all strings within an edit distance of n. *

@@ -106,8 +119,9 @@ *

*/ public Automaton toAutomaton(int n) { - if (n == 0) - return BasicAutomata.makeString(input); + if (n == 0) { + return BasicAutomata.makeString(word, 0, word.length); + } if (n >= descriptions.length) return null; Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java =================================================================== --- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision 1397147) +++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (working copy) @@ -22,6 +22,7 @@ import java.io.OutputStreamWriter; import java.io.Writer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; @@ -88,6 +89,7 @@ final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class); final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class); + final BytesRef term = termBytesAtt.getBytesRef(); in.reset(); Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (revision 1397147) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (working copy) @@ -36,6 +36,7 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.search.suggest.Lookup; // javadocs import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester; +import org.apache.lucene.search.suggest.analyzing.FuzzySuggester; import org.apache.lucene.search.suggest.fst.FSTCompletionLookup; import 
org.apache.lucene.search.suggest.fst.WFSTCompletionLookup; import org.apache.lucene.search.suggest.jaspell.JaspellLookup; @@ -47,15 +48,18 @@ /** * Benchmarks tests for implementations of {@link Lookup} interface. */ -@Ignore("COMMENT ME TO RUN BENCHMARKS!") +//@Ignore("COMMENT ME TO RUN BENCHMARKS!") public class LookupBenchmarkTest extends LuceneTestCase { @SuppressWarnings("unchecked") private final List> benchmarkClasses = Arrays.asList( + FuzzySuggester.class, + AnalyzingSuggester.class, JaspellLookup.class, TSTLookup.class, FSTCompletionLookup.class, - WFSTCompletionLookup.class, - AnalyzingSuggester.class); + WFSTCompletionLookup.class + + ); private final static int rounds = 15; private final static int warmup = 5; @@ -212,8 +216,9 @@ final List input = new ArrayList(benchmarkInput.size()); for (TermFreq tf : benchmarkInput) { String s = tf.term.utf8ToString(); - input.add(s.substring(0, Math.min(s.length(), - minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)))); + String sub = s.substring(0, Math.min(s.length(), + minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1))); + input.add(sub); } BenchmarkResult result = measure(new Callable() { @@ -250,7 +255,9 @@ } return new BenchmarkResult(times, warmup, rounds); } catch (Exception e) { + e.printStackTrace(); throw new RuntimeException(e); + } } Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 1397147) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (working copy) @@ -302,7 +302,7 @@ } } - private TokenStreamToAutomaton getTokenStreamToAutomaton() { + TokenStreamToAutomaton getTokenStreamToAutomaton() { if (preserveSep) { return new EscapingTokenStreamToAutomaton(); } else { @@ -324,6 +324,7 @@ BytesRef scratch = new 
BytesRef(); TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton(); + // analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short) boolean success = false; byte buffer[] = new byte[8]; @@ -331,29 +332,8 @@ ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); BytesRef surfaceForm; while ((surfaceForm = iterator.next()) != null) { - - // Analyze surface form: - TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString())); - - // Create corresponding automaton: labels are bytes - // from each analyzed token, with byte 0 used as - // separator between tokens: - Automaton automaton = ts2a.toAutomaton(ts); - ts.end(); - ts.close(); - - replaceSep(automaton); - - assert SpecialOperations.isFinite(automaton); - - // Get all paths from the automaton (there can be - // more than one path, eg if the analyzer created a - // graph using SynFilter or WDF): - - // TODO: we could walk & add simultaneously, so we - // don't have to alloc [possibly biggish] - // intermediate HashSet in RAM: - Set paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions); + Set paths = toFiniteStrings(surfaceForm, ts2a); + for (IntsRef path : paths) { Util.toBytesRef(path, scratch); @@ -495,33 +475,15 @@ try { - // TODO: is there a Reader from a CharSequence? - // Turn tokenstream into automaton: - TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString())); - Automaton automaton = getTokenStreamToAutomaton().toAutomaton(ts); - ts.end(); - ts.close(); + Automaton lookupAutomaton = toLookupAutomaton(key); - // TODO: we could use the end offset to "guess" - // whether the final token was a partial token; this - // would only be a heuristic ... but maybe an OK one. - // This way we could eg differentiate "net" from "net ", - // which we can't today... 
- - replaceSep(automaton); - - // TODO: we can optimize this somewhat by determinizing - // while we convert - BasicOperations.determinize(automaton); - final CharsRef spare = new CharsRef(); //System.out.println(" now intersect exactFirst=" + exactFirst); // Intersect automaton w/ suggest wFST and get all // prefix starting nodes & their outputs: - final List>> prefixPaths; - prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst); + final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst); //System.out.println(" prefixPaths: " + prefixPaths.size()); @@ -532,6 +494,7 @@ List results = new ArrayList(); if (exactFirst) { + final List>> prefixPaths = intersector.intersectExact(); Util.TopNSearcher> searcher; searcher = new Util.TopNSearcher>(fst, num, weightComparator); @@ -617,8 +580,10 @@ } } }; - + final List>> prefixPaths = intersector.intersectAll(); +// System.out.println(key); for (FSTUtil.Path> path : prefixPaths) { +// System.out.println(UnicodeUtil.newString(path.input.ints, path.input.offset, path.input.length)); searcher.addStartPaths(path.fstNode, path.output, true, path.input); } @@ -637,7 +602,56 @@ throw new RuntimeException(bogus); } } + + final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException { + // Analyze surface form: + TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString())); + // Create corresponding automaton: labels are bytes + // from each analyzed token, with byte 0 used as + // separator between tokens: + Automaton automaton = ts2a.toAutomaton(ts); + ts.end(); + ts.close(); + + replaceSep(automaton); + + assert SpecialOperations.isFinite(automaton); + + // Get all paths from the automaton (there can be + // more than one path, eg if the analyzer created a + // graph using SynFilter or WDF): + + // TODO: we could walk & add simultaneously, so we + // don't have to alloc [possibly biggish] + // intermediate HashSet in RAM: + return 
SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions); + } + + final Automaton toLookupAutomaton(final CharSequence key) throws IOException { + // TODO: is there a Reader from a CharSequence? + // Turn tokenstream into automaton: + TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString())); + Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts); + ts.end(); + ts.close(); + + // TODO: we could use the end offset to "guess" + // whether the final token was a partial token; this + // would only be a heuristic ... but maybe an OK one. + // This way we could eg differentiate "net" from "net ", + // which we can't today... + + replaceSep(automaton); + + // TODO: we can optimize this somewhat by determinizing + // while we convert + BasicOperations.determinize(automaton); + return automaton; + } + + + /** * Returns the weight associated with an input string, * or null if it does not exist. @@ -664,4 +678,25 @@ return left.output1.compareTo(right.output1); } }; + + protected PathIntersector getPathIntersector(Automaton automaton, FST> fst) { + return new PathIntersector(automaton, fst); + } + + protected static class PathIntersector { + protected List>> intersect; + protected final Automaton automaton; + protected final FST> fst; + public PathIntersector(Automaton automaton, FST> fst) { + this.automaton = automaton; + this.fst = fst; + } + public List>> intersectExact() throws IOException { + return intersect = FSTUtil.intersectPrefixPathsExact(automaton, fst); + } + + public List>> intersectAll() throws IOException { + return intersect == null ? 
intersect = FSTUtil.intersectPrefixPaths(automaton, fst) : intersect; + } + } } Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java (revision 1397147) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java (working copy) @@ -22,10 +22,12 @@ import java.io.IOException; import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Util; // TODO: move to core? nobody else uses it yet though... @@ -65,7 +67,7 @@ /** Enumerates all paths in the automaton that also * intersect the FST, accumulating the FST end node and * output for each path. 
*/ - public static List> intersectPrefixPaths(Automaton a, FST fst) throws IOException { + public static List> intersectPrefixPathsExact(Automaton a, FST fst) throws IOException { final List> queue = new ArrayList>(); final List> endNodes = new ArrayList>(); @@ -88,7 +90,6 @@ IntsRef currentInput = path.input; for(Transition t : path.state.getTransitions()) { - // TODO: we can fix this if necessary: if (t.getMin() != t.getMax()) { throw new IllegalStateException("can only handle Transitions that match one character"); @@ -115,4 +116,85 @@ return endNodes; } + + /** + * nocommit javadoc + */ + public static List> intersectPrefixPaths(Automaton a, FST fst) throws IOException { + assert a.isDeterministic(); + final List> queue = new ArrayList>(); + final List> endNodes = new ArrayList>(); + queue.add(new Path(a.getInitialState(), fst + .getFirstArc(new FST.Arc()), fst.outputs.getNoOutput(), + new IntsRef())); + + final FST.Arc scratchArc = new FST.Arc(); + final FST.BytesReader fstReader = fst.getBytesReader(0); + + while (queue.size() != 0) { + final Path path = queue.remove(queue.size() - 1); + if (path.state.isAccept()) { + endNodes.add(path); + continue; + } +// System.out.println(UnicodeUtil.newString(path.input.ints, path.input.offset, path.input.length)); + + IntsRef currentInput = path.input; + for (Transition t : path.state.getTransitions()) { + + if (t.getMin() == t.getMax()) { + final FST.Arc nextArc = fst.findTargetArc(t.getMin(), + path.fstNode, scratchArc, fstReader); + if (nextArc != null) { + final IntsRef newInput = new IntsRef(currentInput.length + 1); + newInput.copyInts(currentInput); + newInput.ints[currentInput.length] = t.getMin(); + newInput.length = currentInput.length + 1; +// if (t.getDest().isAccept()) { +// System.out.println(UnicodeUtil.newString(newInput.ints, newInput.offset, newInput.length)); +// } + queue.add(new Path(t.getDest(), new FST.Arc() + .copyFrom(nextArc), fst.outputs + .add(path.output, nextArc.output), newInput)); + } + } 
else { + // TODO: + // if we accept the entire range possible in the FST (ie. 0 to 256) + // we can simply use the prefix as the accepted state instead of + // looking up all the + // ranges and terminate early here? + FST.Arc nextArc = Util.readCeilArc(t.getMin(), fst, path.fstNode, + scratchArc, fstReader); + while (nextArc != null && nextArc.label <= t.getMax()) { + assert nextArc.label <= t.getMax(); + assert nextArc.label >= t.getMin() : nextArc.label + " " + + t.getMin(); + final IntsRef newInput = new IntsRef(currentInput.length + 1); + newInput.copyInts(currentInput); + newInput.ints[currentInput.length] = nextArc.label; + newInput.length = currentInput.length + 1; +// if (t.getDest().isAccept()) { +// System.out.println(UnicodeUtil.newString(newInput.ints, newInput.offset, newInput.length)); +// } + queue.add(new Path(t.getDest(), new FST.Arc() + .copyFrom(nextArc), fst.outputs + .add(path.output, nextArc.output), newInput)); + final int label = nextArc.label; // used in assert + nextArc = nextArc.isLast() ? null : fst.readNextRealArc(nextArc, + fstReader); + assert nextArc == null || label < nextArc.label : "last: " + label + + " next: " + nextArc.label; + } + } + } + } + + return endNodes; + } + }