Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java	(revision 1336717)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java	(working copy)
@@ -19,6 +19,7 @@
 import java.io.BufferedReader;
 import java.io.InputStreamReader;
+import java.lang.reflect.Constructor;
 import java.net.URL;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
@@ -30,7 +31,11 @@
 import java.util.concurrent.Callable;
 
 import org.apache.lucene.util.*;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.analyzing.AnalyzingCompletionLookup;
 import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
 import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup;
 import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
@@ -42,14 +47,15 @@
 /**
  * Benchmarks tests for implementations of {@link Lookup} interface.
  */
-@Ignore("COMMENT ME TO RUN BENCHMARKS!")
+//@Ignore("COMMENT ME TO RUN BENCHMARKS!")
 public class LookupBenchmarkTest extends LuceneTestCase {
   @SuppressWarnings("unchecked")
   private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
       JaspellLookup.class, 
       TSTLookup.class,
       FSTCompletionLookup.class,
-      WFSTCompletionLookup.class);
+      WFSTCompletionLookup.class,
+      AnalyzingCompletionLookup.class);
 
   private final static int rounds = 15;
   private final static int warmup = 5;
@@ -144,7 +150,13 @@
    * Create {@link Lookup} instance and populate it.
    */
   private Lookup buildLookup(Class<? extends Lookup> cls, TermFreq[] input) throws Exception {
-    Lookup lookup = cls.newInstance();
+    Lookup lookup = null;
+    try {
+      lookup = cls.newInstance();
+    } catch (InstantiationException e) {
+      Constructor<? extends Lookup> ctor = cls.getConstructor(Analyzer.class);
+      lookup = ctor.newInstance(new MockAnalyzer(random, MockTokenizer.KEYWORD, false));
+    }
     lookup.build(new TermFreqArrayIterator(input));
     return lookup;
  }
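The reflection fallback above lets the benchmark cover AnalyzingCompletionLookup, whose constructors all take an Analyzer. For orientation, this is the intended call pattern; a minimal sketch reusing the benchmark's MockAnalyzer arguments, not part of the patch itself:

    TermFreq[] keys = new TermFreq[] {
      new TermFreq("bar", 10),
      new TermFreq("barbara", 6)
    };
    Lookup suggester = new AnalyzingCompletionLookup(new MockAnalyzer(random, MockTokenizer.KEYWORD, false));
    suggester.build(new TermFreqArrayIterator(keys));
    // top-2 completions of "bar"; with the default exactFirst=true the exact
    // match "bar" is returned before "barbara" regardless of weight:
    List<LookupResult> results = suggester.lookup("bar", false, 2);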
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java	(revision 0)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java	(revision 0)
@@ -0,0 +1,178 @@
+package org.apache.lucene.search.suggest.analyzing;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.search.suggest.Lookup.LookupResult;
+import org.apache.lucene.search.suggest.TermFreq;
+import org.apache.lucene.search.suggest.TermFreqArrayIterator;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+public class AnalyzingCompletionTest extends LuceneTestCase {
+  
+  /** this is basically the WFST test ported to KeywordAnalyzer, so it acts the same */
+  public void testKeyword() throws Exception {
+    TermFreq keys[] = new TermFreq[] {
+        new TermFreq("foo", 50),
+        new TermFreq("bar", 10),
+        new TermFreq("barbar", 12),
+        new TermFreq("barbara", 6)
+    };
+    
+    AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
+    suggester.build(new TermFreqArrayIterator(keys));
+    
+    // top N of 2, but only foo is available
+    List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("f", random()), false, 2);
+    assertEquals(1, results.size());
+    assertEquals("foo", results.get(0).key.toString());
+    assertEquals(50, results.get(0).value, 0.01F);
+    
+    // top N of 1 for 'bar': we return this even though barbar is higher
+    results = suggester.lookup(_TestUtil.stringToCharSequence("bar", random()), false, 1);
+    assertEquals(1, results.size());
+    assertEquals("bar", results.get(0).key.toString());
+    assertEquals(10, results.get(0).value, 0.01F);
+    
+    // top N of 2 for 'b'
+    results = suggester.lookup(_TestUtil.stringToCharSequence("b", random()), false, 2);
+    assertEquals(2, results.size());
+    assertEquals("barbar", results.get(0).key.toString());
+    assertEquals(12, results.get(0).value, 0.01F);
+    assertEquals("bar", results.get(1).key.toString());
+    assertEquals(10, results.get(1).value, 0.01F);
+    
+    // top N of 3 for 'ba'
+    results = suggester.lookup(_TestUtil.stringToCharSequence("ba", random()), false, 3);
+    assertEquals(3, results.size());
+    assertEquals("barbar", results.get(0).key.toString());
+    assertEquals(12, results.get(0).value, 0.01F);
+    assertEquals("bar", results.get(1).key.toString());
+    assertEquals(10, results.get(1).value, 0.01F);
+    assertEquals("barbara", results.get(2).key.toString());
+    assertEquals(6, results.get(2).value, 0.01F);
+  }
+  
+  // TODO: more tests
+  /**
+   * basic "standard analyzer" test with stopword removal
+   */
+  public void testStandard() throws Exception {
+    TermFreq keys[] = new TermFreq[] {
+        new TermFreq("the ghost of christmas past", 50),
+    };
+    
+    Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+    AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(standard);
+    suggester.build(new TermFreqArrayIterator(keys));
+    
+    List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
+    assertEquals(1, results.size());
+    assertEquals("the ghost of christmas past", results.get(0).key.toString());
+    assertEquals(50, results.get(0).value, 0.01F);
+    
+    // omit the 'the' since it's a stopword; it's suggested anyway
+    results = suggester.lookup(_TestUtil.stringToCharSequence("ghost of chris", random()), false, 1);
+    assertEquals(1, results.size());
+    assertEquals("the ghost of christmas past", results.get(0).key.toString());
+    assertEquals(50, results.get(0).value, 0.01F);
+  }
+  
+  public void testRandom() throws Exception {
+    int numWords = atLeast(1000);
+    
+    final TreeMap<String,Long> slowCompletor = new TreeMap<String,Long>();
+    final TreeSet<String> allPrefixes = new TreeSet<String>();
+    
+    TermFreq[] keys = new TermFreq[numWords];
+    
+    for (int i = 0; i < numWords; i++) {
+      String s;
+      while (true) {
+        // TODO: would be nice to fix this slowCompletor/comparator to
+        // use full range, but we might lose some coverage too...
+        s = _TestUtil.randomSimpleString(random());
+        if (!slowCompletor.containsKey(s)) {
+          break;
+        }
+      }
+      
+      for (int j = 1; j < s.length(); j++) {
+        allPrefixes.add(s.substring(0, j));
+      }
+      // we can probably do Integer.MAX_VALUE here, but why worry.
+      int weight = random().nextInt(1<<24);
+      slowCompletor.put(s, (long)weight);
+      keys[i] = new TermFreq(s, weight);
+    }
+    
+    AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false), false);
+    suggester.build(new TermFreqArrayIterator(keys));
+    
+    for (String prefix : allPrefixes) {
+      
+      final int topN = _TestUtil.nextInt(random(), 1, 10);
+      List<LookupResult> r = suggester.lookup(_TestUtil.stringToCharSequence(prefix, random()), false, topN);
+      
+      // 2. go through the whole treemap (slowCompletor) and check it's actually the best suggestion
+      final List<LookupResult> matches = new ArrayList<LookupResult>();
+      
+      // TODO: could be faster... but it's slowCompletor for a reason
+      for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
+        if (e.getKey().startsWith(prefix)) {
+          matches.add(new LookupResult(e.getKey(), e.getValue().longValue()));
+        }
+      }
+      
+      assertTrue(matches.size() > 0);
+      Collections.sort(matches, new Comparator<LookupResult>() {
+        public int compare(LookupResult left, LookupResult right) {
+          int cmp = Float.compare(right.value, left.value);
+          if (cmp == 0) {
+            return left.compareTo(right);
+          } else {
+            return cmp;
+          }
+        }
+      });
+      if (matches.size() > topN) {
+        matches.subList(topN, matches.size()).clear();
+      }
+      
+      assertEquals(matches.size(), r.size());
+      
+      for(int hit=0;hit<r.size();hit++) {
+        assertEquals(matches.get(hit).key.toString(), r.get(hit).key.toString());
+        assertEquals(matches.get(hit).value, r.get(hit).value, 0f);
+      }
+    }
+  }
+}
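A note on the weight cap in testRandom: weights are drawn below 1<<24 because the brute-force comparator routes the long values through Float.compare, and a float mantissa only represents integers exactly up to 2^24. A standalone check of that boundary (plain Java, not part of the patch):

    int max = 1 << 24;                                      // 16777216
    System.out.println((float) (max - 1) == (float) max);   // false: values below 2^24 stay distinct
    System.out.println((float) max == (float) (max + 1));   // true: adjacent values collide at 2^24,
                                                            // which would turn non-ties into ties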
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java	(revision 0)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java	(revision 0)
+package org.apache.lucene.search.suggest.analyzing;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.search.spell.TermFreqIterator;
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.fst.Sort;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.SpecialOperations;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FST.Arc;
+import org.apache.lucene.util.fst.FST.BytesReader;
+import org.apache.lucene.util.fst.PairOutputs;
+import org.apache.lucene.util.fst.PairOutputs.Pair;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;
+import org.apache.lucene.util.fst.Util.MinResult;
+
+/**
+ * Suggester that first analyzes the surface form, adds the
+ * analyzed form to a weighted FST, and then at lookup time
+ * analyzes the key the same way and walks the <code>n</code> shortest paths to retrieve top-ranked
+ * suggestions.
+ *
+ * NOTE: Although the {@link TermFreqIterator} API specifies
+ * floating point weights, input weights should be whole numbers.
+ * Input weights will be cast to a Java integer, and any
+ * negative, infinite, or NaN values will be rejected.
+ * 
+ * @see Util#shortestPaths(FST, FST.Arc, Comparator, int)
+ * @lucene.experimental
+ */
+public class AnalyzingCompletionLookup extends Lookup {
+  
+  /**
+   * FST:
+   *  input is the analyzed form, with a null byte between terms
+   *  weights are encoded as costs: (Integer.MAX_VALUE-weight)
+   *  surface is the original, unanalyzed form.
+   */
+  private FST<Pair<Long,BytesRef>> fst = null;
+  
+  /**
+   * Analyzer that will be used for analyzing suggestions
+   */
+  private final Analyzer analyzer;
+  
+  /**
+   * True if exact match suggestions should always be returned first.
+   */
+  private final boolean exactFirst;
+  
+  /**
+   * Calls {@link #AnalyzingCompletionLookup(Analyzer,boolean) AnalyzingCompletionLookup(analyzer, true)}
+   */
+  public AnalyzingCompletionLookup(Analyzer analyzer) {
+    this(analyzer, true);
+  }
+
+  /**
+   * Creates a new suggester.
+   * 
+   * @param analyzer Analyzer that will be used for analyzing suggestions.
+   * @param exactFirst <code>true</code> if suggestions that match the 
+   *        prefix exactly should always be returned first, regardless
+   *        of score. This has no performance impact, but could result
+   *        in low-quality suggestions.
+   */
+  public AnalyzingCompletionLookup(Analyzer analyzer, boolean exactFirst) {
+    this.analyzer = analyzer;
+    this.exactFirst = exactFirst;
+  }
+  
+  @Override
+  public void build(TermFreqIterator iterator) throws IOException {
+    String prefix = getClass().getSimpleName();
+    File directory = Sort.defaultTempDir();
+    File tempInput = File.createTempFile(prefix, ".input", directory);
+    File tempSorted = File.createTempFile(prefix, ".sorted", directory);
+    
+    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
+    Sort.ByteSequencesReader reader = null;
+    BytesRef scratch = new BytesRef();
+    
+    BytesRef separator = new BytesRef(new byte[] { (byte)0 });
+    
+    // encoding:
+    // analyzed sequence + two 0 bytes + weight(int) + surface + analyzedLength(short)
+    boolean success = false;
+    byte buffer[] = new byte[8];
+    try {
+      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
+      BytesRef spare;
+      while ((spare = iterator.next()) != null) {
+        
+        TokenStream ts = analyzer.tokenStream("", new StringReader(spare.utf8ToString()));
+        Automaton automaton = TokenStreamToAutomaton.toAutomaton(ts);
+        ts.end();
+        ts.close();
+        assert SpecialOperations.isFinite(automaton);
+        // nocommit: we should probably not wire this param to -1 but have a reasonable limit?!
+        Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, -1);
+        for (IntsRef path : paths) {
+          scratch.length = 0;
+          // nocommit: terrible.
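+          // (the 0x00 separator is prepended to each analyzed path here;
+          //  lookupPrefix mirrors this by consuming one separator byte
+          //  before each token's bytes at query time)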
+          BytesRef nocommitScratch = new BytesRef();
+          Util.toBytesRef(path, nocommitScratch);
+          scratch.append(separator);
+          scratch.append(nocommitScratch);
+          
+          // length of the analyzed text (FST input)
+          short analyzedLength = (short) scratch.length;
+          // compute the required length:
+          // analyzed sequence + 2 (separator bytes) + weight (4) + surface + analyzedLength (2)
+          int requiredLength = analyzedLength + 2 + 4 + spare.length + 2;
+          
+          buffer = ArrayUtil.grow(buffer, requiredLength);
+          
+          output.reset(buffer);
+          output.writeBytes(scratch.bytes, scratch.offset, scratch.length);
+          output.writeByte((byte)0); // separator: not used, just for sort order
+          output.writeByte((byte)0); // separator: not used, just for sort order
+          output.writeInt((int)encodeWeight(iterator.weight()));
+          output.writeBytes(spare.bytes, spare.offset, spare.length);
+          output.writeShort(analyzedLength);
+          writer.write(buffer, 0, output.getPosition());
+        }
+      }
+      writer.close();
+      new Sort().sort(tempInput, tempSorted);
+      reader = new Sort.ByteSequencesReader(tempSorted);
+      
+      PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton());
+      Builder<Pair<Long,BytesRef>> builder = new Builder<Pair<Long,BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
+      
+      BytesRef previous = null;
+      BytesRef analyzed = new BytesRef();
+      BytesRef surface = new BytesRef();
+      IntsRef scratchInts = new IntsRef();
+      ByteArrayDataInput input = new ByteArrayDataInput();
+      while (reader.read(scratch)) {
+        input.reset(scratch.bytes, scratch.offset, scratch.length);
+        input.setPosition(input.length()-2);
+        short analyzedLength = input.readShort();
+        
+        analyzed.bytes = scratch.bytes;
+        analyzed.offset = scratch.offset;
+        analyzed.length = analyzedLength;
+        
+        input.setPosition(analyzedLength + 2); // analyzed sequence + two separator bytes
+        long cost = input.readInt();
+        
+        surface.bytes = scratch.bytes;
+        surface.offset = input.getPosition();
+        surface.length = input.length() - input.getPosition() - 2;
+        
+        if (previous == null) {
+          previous = new BytesRef();
+        } else if (analyzed.equals(previous)) {
+          continue; // nocommit: "extend" duplicates with useless increasing bytes (it won't matter)
+        }
+        Util.toIntsRef(analyzed, scratchInts);
+        // nocommit
+        // nocommit: why must i deep copy?!
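+        // (deep copy is required: surface points into scratch's byte[],
+        //  which the next reader.read(scratch) reuses, and the Builder
+        //  holds onto its outputs until finish())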
+        builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
+        previous.copyBytes(analyzed);
+      }
+      fst = builder.finish();
+      success = true;
+    } finally {
+      if (success) {
+        IOUtils.close(reader, writer);
+      } else {
+        IOUtils.closeWhileHandlingException(reader, writer);
+      }
+      
+      tempInput.delete();
+      tempSorted.delete();
+    }
+  }
+
+  @Override
+  public boolean store(OutputStream output) throws IOException {
+    try {
+      fst.save(new OutputStreamDataOutput(output));
+    } finally {
+      IOUtils.close(output);
+    }
+    return true;
+  }
+
+  @Override
+  public boolean load(InputStream input) throws IOException {
+    try {
+      this.fst = new FST<Pair<Long,BytesRef>>(new InputStreamDataInput(input), new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
+    } finally {
+      IOUtils.close(input);
+    }
+    return true;
+  }
+
+  @Override
+  public List<LookupResult> lookup(CharSequence key, boolean onlyMorePopular, int num) {
+    assert num > 0;
+    Arc<Pair<Long,BytesRef>> arc = new Arc<Pair<Long,BytesRef>>();
+    
+    // match the prefix portion exactly
+    Pair<Long,BytesRef> prefixOutput = null;
+    try {
+      prefixOutput = lookupPrefix(key, arc);
+    } catch (IOException bogus) { throw new RuntimeException(bogus); }
+    
+    if (prefixOutput == null) {
+      return Collections.emptyList();
+    }
+    
+    BytesRef prefix = prefixOutput.output2;
+    int prefixLength = prefix.length;
+    
+    List<LookupResult> results = new ArrayList<LookupResult>(num);
+    CharsRef spare = new CharsRef();
+    if (exactFirst && arc.isFinal()) {
+      prefix.append(arc.nextFinalOutput.output2);
+      spare.grow(prefix.length);
+      UnicodeUtil.UTF8toUTF16(prefix, spare);
+      results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput.output1 + arc.nextFinalOutput.output1)));
+      if (--num == 0) {
+        return results; // that was quick
+      }
+    }
+
+    // complete top-N
+    MinResult<Pair<Long,BytesRef>> completions[] = null;
+    try {
+      completions = Util.shortestPaths(fst, arc, weightComparator, num);
+    } catch (IOException bogus) { throw new RuntimeException(bogus); }
+    
+    for (MinResult<Pair<Long,BytesRef>> completion : completions) {
+      prefix.length = prefixLength;
+      // append suffix
+      prefix.append(completion.output.output2);
+      spare.grow(prefix.length);
+      UnicodeUtil.UTF8toUTF16(prefix, spare);
+      results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput.output1 + completion.output.output1)));
+    }
+    return results;
+  }
+  
+  private Pair<Long,BytesRef> lookupPrefix(CharSequence prefix, Arc<Pair<Long,BytesRef>> arc) throws /*Bogus*/IOException {
+    Pair<Long,BytesRef> output = fst.outputs.getNoOutput();
+    BytesReader bytesReader = fst.getBytesReader(0);
+    
+    fst.getFirstArc(arc);
+    
+    // nocommit: isn't there a charsequencereader somewhere?
+    TokenStream ts = analyzer.tokenStream("", new StringReader(prefix.toString()));
+    
+    // nocommit: respect posincs
+    TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
+    BytesRef scratch = termAtt.getBytesRef();
+    ts.reset();
+    
+    while (ts.incrementToken()) {
+      // nocommit: bogus
+      // instead here, we should build an Automaton out of the tokenstream,
+      // and intersect that with the FST, producing N outputs, and complete from those?
+      if (posIncAtt.getPositionIncrement() == 0) {
+        continue;
+      }
+      // separator byte
+      // nocommit: it should be a separator per posInc,
+      // and separators should be optional (e.g. japanese completion)
+      if (fst.findTargetArc(0, arc, arc, bytesReader) == null) {
+        // TODO: should we fully consume?
+        ts.close();
+        return null;
+      } else {
+        output = fst.outputs.add(output, arc.output);
+      }
+      
+      termAtt.fillBytesRef();
+      byte[] bytes = scratch.bytes;
+      int pos = scratch.offset;
+      int end = pos + scratch.length;
+      while (pos < end) {
+        if (fst.findTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null) {
+          // TODO: should we fully consume?
+          ts.close();
+          return null;
+        } else {
+          output = fst.outputs.add(output, arc.output);
+        }
+      }
+    }
+    
+    ts.end();
+    ts.close();
+    return output;
+  }
+  
+  /**
+   * Returns the weight associated with an input string,
+   * or null if it does not exist.
+   */
+  public Object get(CharSequence key) {
+    // TODO: analyze, or just nuke this method!?
+    Arc<Pair<Long,BytesRef>> arc = new Arc<Pair<Long,BytesRef>>();
+    Pair<Long,BytesRef> result = null;
+    try {
+      result = lookupPrefix(key, arc);
+    } catch (IOException bogus) { throw new RuntimeException(bogus); }
+    if (result == null || !arc.isFinal()) {
+      return null;
+    } else {
+      return Integer.valueOf(decodeWeight(result.output1 + arc.nextFinalOutput.output1));
+    }
+  }
+  
+  /** cost -> weight */
+  private static int decodeWeight(long encoded) {
+    return (int)(Integer.MAX_VALUE - encoded);
+  }
+  
+  /** weight -> cost */
+  private static int encodeWeight(long value) {
+    if (value < 0 || value > Integer.MAX_VALUE) {
+      throw new UnsupportedOperationException("cannot encode value: " + value);
+    }
+    return Integer.MAX_VALUE - (int)value;
+  }
+  
+  static final Comparator<Pair<Long,BytesRef>> weightComparator = new Comparator<Pair<Long,BytesRef>>() {
+    public int compare(Pair<Long,BytesRef> left, Pair<Long,BytesRef> right) {
+      return left.output1.compareTo(right.output1);
+    }  
+  };
+}
Index: lucene/core/src/test/org/apache/lucene/util/automaton/TestSpecialOperations.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/util/automaton/TestSpecialOperations.java	(revision 1336717)
+++ lucene/core/src/test/org/apache/lucene/util/automaton/TestSpecialOperations.java	(working copy)
@@ -1,6 +1,11 @@
 package org.apache.lucene.util.automaton;
 
+import java.util.Set;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.fst.Util;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -31,4 +36,20 @@
       assertEquals(AutomatonTestUtil.isFiniteSlow(a), SpecialOperations.isFinite(b));
     }
   }
+  
+  /**
+   * Basic test for getFiniteStrings
+   */
+  public void testFiniteStrings() {
+    Automaton a = BasicOperations.union(BasicAutomata.makeString("dog"), BasicAutomata.makeString("duck"));
+    MinimizationOperations.minimize(a);
+    Set<IntsRef> strings = SpecialOperations.getFiniteStrings(a, -1);
+    assertEquals(2, strings.size());
+    IntsRef dog = new IntsRef();
+    Util.toIntsRef(new BytesRef("dog"), dog);
+    assertTrue(strings.contains(dog));
+    IntsRef duck = new IntsRef();
+    Util.toIntsRef(new BytesRef("duck"), duck);
+    assertTrue(strings.contains(duck));
+  }
 }
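One wrinkle worth noting about getFiniteStrings, tested above (the method bodies appear in the SpecialOperations hunk at the end of this patch): an infinite language yields null regardless of limit, because the depth-first walk aborts as soon as a state repeats on the current path. A hedged sketch against the existing BasicAutomata/BasicOperations factories:

    Automaton any = BasicAutomata.makeAnyString();   // cyclic: accepts every string
    assertNull(SpecialOperations.getFiniteStrings(any, -1));

    // a finite language also returns null once it exceeds a non-negative limit:
    Automaton two = BasicOperations.union(BasicAutomata.makeString("dog"), BasicAutomata.makeString("duck"));
    assertNull(SpecialOperations.getFiniteStrings(two, 1));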
Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java	(revision 0)
+++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java	(revision 0)
@@ -0,0 +1,134 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.Transition;
+
+/** Consumes a TokenStream and creates an {@link Automaton}. */
+public class TokenStreamToAutomaton {
+
+  // nocommit: what bytes to steal!
+
+  // We create a transition w/ this label when posInc is 1:
+  public static final int POS_SEP = 0;
+
+  // nocommit move to oal.util.automaton?
+  // nocommit: toFST? then we can translate atts into FST weights
+
+  /** Pulls the graph (including {@link
+   *  PositionLengthAttribute}) from the provided {@link
+   *  TokenStream}, and creates the corresponding
+   *  automaton where arcs are bytes from each term. */
+  public static Automaton toAutomaton(TokenStream in) throws IOException {
+    final Automaton a = new Automaton();
+
+    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
+    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
+    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
+    final BytesRef term = termBytesAtt.getBytesRef();
+
+    in.reset();
+
+    // Only temporarily holds states ahead of our current
+    // position:
+    // nocommit maybe linked list...?
+    final Map<Integer,State> posToState = new HashMap<Integer,State>();
+
+    State currentFromState = null;
+    int pos = -1;
+    int lastEndPos = -1;
+    while (in.incrementToken()) {
+      int posInc = posIncAtt.getPositionIncrement();
+      if (currentFromState == null && posInc == 0) {
+        // TODO: hmm are TS's still allowed to do this...?
+        posInc = 1;
+      }
+
+      if (posInc > 0) {
+        // New node:
+        pos += posInc;
+        final State nextFromState;
+        final State lastEndState = posToState.get(pos);
+        if (lastEndState == null) {
+          // nocommit invalid assert!! if a syn matched
+          // over what is now a hole this assert falsely
+          // trips... make test!!
+          assert currentFromState == null;
+          nextFromState = a.getInitialState();
+        } else {
+          nextFromState = new State();
+          posToState.remove(pos);
+          // nocommit if posInc > 1 what to do...? multiple SEP?
+          lastEndState.addTransition(new Transition(POS_SEP, nextFromState));
+        }
+        currentFromState = nextFromState;
+      }
+
+      // nocommit: make test for this:
+
+      // nocommit does posLengthAtt make it possible to
+      // create broken graph? ie what if posInc skips over
+      // the node created by a previous posLengthAtt!?
+      // hrm. actually: we must handle this case! it means
+      // eg a syn matched a stop word but then stop word was
+      // deleted...
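+      // (endPos is the graph node this token arrives at: a token with
+      //  positionLength > 1, e.g. a multi-word synonym, connects to a
+      //  state further ahead than pos+1)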
+      final int endPos = pos + posLengthAtt.getPositionLength();
+
+      termBytesAtt.fillBytesRef();
+      State endState = posToState.get(endPos);
+      if (endState == null) {
+        endState = new State();
+        posToState.put(endPos, endState);
+      }
+
+      State lastState = currentFromState;
+
+      for(int byteIDX=0;byteIDX<term.length;byteIDX++) {
+        final State nextState = byteIDX == term.length-1 ? endState : new State();
+        lastState.addTransition(new Transition(term.bytes[term.offset + byteIDX] & 0xff, nextState));
+        lastState = nextState;
+      }
+
+      lastEndPos = endPos;
+    }
+
+    in.end();
+
+    // any token end state that was never continued from is a final state:
+    for (State finalState : posToState.values()) {
+      finalState.setAccept(true);
+    }
+
+    return a;
+  }
+}
Index: lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java	(revision 1336717)
+++ lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java	(working copy)
+  /**
+   * Returns the set of accepted strings, assuming that at most
+   * <code>limit</code> strings are accepted. If more than limit
+   * strings are accepted, null is returned. If limit<0, then
+   * the limit is infinite.
+   */
+  // nocommit: probably not efficient
+  public static Set<IntsRef> getFiniteStrings(Automaton a, int limit) {
+    HashSet<IntsRef> strings = new HashSet<IntsRef>();
+    if (a.isSingleton()) {
+      if (limit != 0) {
+        // nocommit: yuck
+        IntsRef ref = new IntsRef();
+        Util.toIntsRef(new BytesRef(a.singleton), ref);
+        strings.add(ref);
+      } else {
+        return null;
+      }
+    } else if (!getFiniteStrings(a.initial, new HashSet<State>(), strings, new IntsRef(), limit))
+      return null;
+    return strings;
+  }
+  
+  /**
+   * Returns the strings that can be produced from the given state, or
+   * false if more than <code>limit</code> strings are found. 
+   * limit<0 means "infinite".
+   */
+  // nocommit: probably not efficient
+  private static boolean getFiniteStrings(State s, HashSet<State> pathstates, 
+      HashSet<IntsRef> strings, IntsRef path, int limit) {
+    pathstates.add(s);
+    for (Transition t : s.getTransitions()) {
+      if (pathstates.contains(t.to))
+        return false;
+      for (int n = t.min; n <= t.max; n++) {
+        path.grow(path.length+1);
+        path.ints[path.length] = n;
+        path.length++;
+        if (t.to.accept) {
+          strings.add(IntsRef.deepCopyOf(path));
+          if (limit >= 0 && strings.size() > limit)
+            return false;
+        }
+        if (!getFiniteStrings(t.to, pathstates, strings, path, limit))
+          return false;
+        path.length--;
+      }
+    }
+    pathstates.remove(s);
+    return true;
+  }
 }
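End to end, these two core additions are what build() leans on: compile the TokenStream to an automaton, then enumerate every accepted byte path, with tokens joined by POS_SEP. Note also that the singleton branch of getFiniteStrings accepts any non-zero limit, so a negative limit really does mean infinite there, matching the javadoc. A minimal sketch of the pipeline outside the suggester (the empty field name mirrors build(); the analyzer choice is arbitrary):

    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    TokenStream ts = analyzer.tokenStream("", new StringReader("ghost of christmas"));
    Automaton automaton = TokenStreamToAutomaton.toAutomaton(ts);
    ts.end();
    ts.close();

    // every finite byte path through the token graph:
    Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, -1);
    for (IntsRef path : paths) {
      BytesRef bytes = new BytesRef();
      Util.toBytesRef(path, bytes);  // "ghost" 0x00 "of" 0x00 "christmas" for this single-path stream
    }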