Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (revision 1385122) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (working copy) @@ -19,6 +19,7 @@ import java.io.BufferedReader; import java.io.InputStreamReader; +import java.lang.reflect.Constructor; import java.net.URL; import java.nio.charset.Charset; import java.util.ArrayList; @@ -30,7 +31,11 @@ import java.util.concurrent.Callable; import org.apache.lucene.util.*; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.analyzing.AnalyzingCompletionLookup; import org.apache.lucene.search.suggest.fst.FSTCompletionLookup; import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup; import org.apache.lucene.search.suggest.jaspell.JaspellLookup; @@ -49,7 +54,8 @@ JaspellLookup.class, TSTLookup.class, FSTCompletionLookup.class, - WFSTCompletionLookup.class); + WFSTCompletionLookup.class, + AnalyzingCompletionLookup.class); private final static int rounds = 15; private final static int warmup = 5; @@ -144,7 +150,13 @@ * Create {@link Lookup} instance and populate it. */ private Lookup buildLookup(Class cls, TermFreq[] input) throws Exception { - Lookup lookup = cls.newInstance(); + Lookup lookup = null; + try { + lookup = cls.newInstance(); + } catch (InstantiationException e) { + Constructor ctor = cls.getConstructor(Analyzer.class); + lookup = ctor.newInstance(new MockAnalyzer(random, MockTokenizer.KEYWORD, false)); + } lookup.build(new TermFreqArrayIterator(input)); return lookup; } Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java (revision 1385122) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java (working copy) @@ -44,7 +44,13 @@ assertEquals(1, results.size()); assertEquals("foo", results.get(0).key.toString()); assertEquals(50, results.get(0).value, 0.01F); - + + // make sure we don't get a dup exact suggestion: + results = suggester.lookup(_TestUtil.stringToCharSequence("foo", random), true, 2); + assertEquals(1, results.size()); + assertEquals("foo", results.get(0).key.toString()); + assertEquals(50, results.get(0).value, 0.01F); + // top N of 1 for 'bar': we return this even though barbar is higher results = suggester.lookup(_TestUtil.stringToCharSequence("bar", random), false, 1); assertEquals(1, results.size()); Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java (revision 0) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java (working copy) @@ -0,0 +1,283 @@ +package org.apache.lucene.search.suggest.analyzing; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.TreeSet; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CannedTokenStream; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.search.suggest.Lookup.LookupResult; +import org.apache.lucene.search.suggest.TermFreq; +import org.apache.lucene.search.suggest.TermFreqArrayIterator; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +public class AnalyzingCompletionTest extends LuceneTestCase { + + /** this is basically the WFST test ported to KeywordAnalyzer. 
so it acts the same */ + public void testKeyword() throws Exception { + TermFreq keys[] = new TermFreq[] { + new TermFreq("foo", 50), + new TermFreq("bar", 10), + new TermFreq("barbar", 12), + new TermFreq("barbara", 6) + }; + + AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); + suggester.build(new TermFreqArrayIterator(keys)); + + // top N of 2, but only foo is available + List results = suggester.lookup(_TestUtil.stringToCharSequence("f", random()), false, 2); + assertEquals(1, results.size()); + assertEquals("foo", results.get(0).key.toString()); + assertEquals(50, results.get(0).value, 0.01F); + + // top N of 1 for 'bar': we return this even though barbar is higher + results = suggester.lookup(_TestUtil.stringToCharSequence("bar", random()), false, 1); + assertEquals(1, results.size()); + assertEquals("bar", results.get(0).key.toString()); + assertEquals(10, results.get(0).value, 0.01F); + + // top N Of 2 for 'b' + results = suggester.lookup(_TestUtil.stringToCharSequence("b", random()), false, 2); + assertEquals(2, results.size()); + assertEquals("barbar", results.get(0).key.toString()); + assertEquals(12, results.get(0).value, 0.01F); + assertEquals("bar", results.get(1).key.toString()); + assertEquals(10, results.get(1).value, 0.01F); + + // top N of 3 for 'ba' + results = suggester.lookup(_TestUtil.stringToCharSequence("ba", random()), false, 3); + assertEquals(3, results.size()); + assertEquals("barbar", results.get(0).key.toString()); + assertEquals(12, results.get(0).value, 0.01F); + assertEquals("bar", results.get(1).key.toString()); + assertEquals(10, results.get(1).value, 0.01F); + assertEquals("barbara", results.get(2).key.toString()); + assertEquals(6, results.get(2).value, 0.01F); + } + + // TODO: more tests + /** + * basic "standardanalyzer" test with stopword removal + */ + public void testStandard() throws Exception { + TermFreq keys[] = new TermFreq[] { + new TermFreq("the ghost of christmas past", 50), + }; + + Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false); + AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(standard); + suggester.build(new TermFreqArrayIterator(keys)); + + List results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1); + assertEquals(1, results.size()); + assertEquals("the ghost of christmas past", results.get(0).key.toString()); + assertEquals(50, results.get(0).value, 0.01F); + + // omit the 'the' since its a stopword, its suggested anyway + results = suggester.lookup(_TestUtil.stringToCharSequence("ghost of chris", random()), false, 1); + assertEquals(1, results.size()); + assertEquals("the ghost of christmas past", results.get(0).key.toString()); + assertEquals(50, results.get(0).value, 0.01F); + + // omit the 'the' and 'of' since they are stopwords, its suggested anyway + results = suggester.lookup(_TestUtil.stringToCharSequence("ghost chris", random()), false, 1); + assertEquals(1, results.size()); + assertEquals("the ghost of christmas past", results.get(0).key.toString()); + assertEquals(50, results.get(0).value, 0.01F); + } + + public void testInputPathRequired() throws Exception { + TermFreq keys[] = new TermFreq[] { + new TermFreq("ab xc", 50), + new TermFreq("ba xd", 50), + }; + + // SynonymMap.Builder b = new SynonymMap.Builder(false); + // b.add(new CharsRef("ab"), new CharsRef("ba"), true); + // final SynonymMap map = 
b.build(); + + // The Analyzer below mimics the functionality of the SynonymAnalyzer + // using the above map, so that the suggest module does not need a dependency on the + // synonym module + + final Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); + + // TokenStream stream = new SynonymFilter(tokenizer, map, true); + // return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream)); + return new TokenStreamComponents(tokenizer) { + int tokenStreamCounter = 0; + final TokenStream[] tokenStreams = new TokenStream[]{ new CannedTokenStream( + new Token[] { + token("ab",1,1), + token("ba",0,1), + token("xc",1,1) + }), + + new CannedTokenStream( + new Token[] { + token("ba",1,1), + token("xd",1,1) + }), + + new CannedTokenStream( + new Token[] { + token("ab",1,1), + token("ba",0,1), + token("x",1,1) + }) + }; + + @Override + public TokenStream getTokenStream() { + TokenStream result = tokenStreams[tokenStreamCounter]; + tokenStreamCounter++; + return result; + } + + @Override + protected void setReader(final Reader reader) throws IOException { + } + }; + } + }; + + AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(analyzer); + suggester.build(new TermFreqArrayIterator(keys)); + List results = suggester.lookup("ab x", false, 1); + assertTrue(results.size() == 1); + } + + private static Token token(String term, int posInc, int posLength) { + final Token t = new Token(term, 0, 0); + t.setPositionIncrement(posInc); + t.setPositionLength(posLength); + return t; + } + + + private void printTokens(final Analyzer analyzer, String input) throws IOException { + System.out.println("Tokens for " + input); + TokenStream ts = analyzer.tokenStream("", new StringReader(input)); + ts.reset(); + final TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class); + final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); + final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class); + + while(ts.incrementToken()) { + termBytesAtt.fillBytesRef(); + System.out.println(String.format("%s,%s,%s", termBytesAtt.getBytesRef().utf8ToString(), posIncAtt.getPositionIncrement(), posLengthAtt.getPositionLength())); + } + ts.end(); + ts.close(); + } + + + public void testRandom() throws Exception { + int numWords = atLeast(1000); + + final TreeMap slowCompletor = new TreeMap(); + final TreeSet allPrefixes = new TreeSet(); + + TermFreq[] keys = new TermFreq[numWords]; + + for (int i = 0; i < numWords; i++) { + String s; + while (true) { + // TODO: would be nice to fix this slowCompletor/comparator to + // use full range, but we might lose some coverage too... + s = _TestUtil.randomSimpleString(random()); + if (!slowCompletor.containsKey(s)) { + break; + } + } + + for (int j = 1; j < s.length(); j++) { + allPrefixes.add(s.substring(0, j)); + } + // we can probably do Integer.MAX_VALUE here, but why worry. 
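+      // (1<<24 keeps each weight a small positive int, comfortably inside the range encodeWeight accepts)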
+      int weight = random().nextInt(1<<24);
+      slowCompletor.put(s, (long)weight);
+      keys[i] = new TermFreq(s, weight);
+    }
+
+    AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false), false);
+    suggester.build(new TermFreqArrayIterator(keys));
+
+    for (String prefix : allPrefixes) {
+
+      final int topN = _TestUtil.nextInt(random(), 1, 10);
+      List<LookupResult> r = suggester.lookup(_TestUtil.stringToCharSequence(prefix, random()), false, topN);
+
+      // 2. go thru whole treemap (slowCompletor) and check it's actually the best suggestion
+      final List<LookupResult> matches = new ArrayList<LookupResult>();
+
+      // TODO: could be faster... but it's slowCompletor for a reason
+      for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
+        if (e.getKey().startsWith(prefix)) {
+          matches.add(new LookupResult(e.getKey(), e.getValue().longValue()));
+        }
+      }
+
+      assertTrue(matches.size() > 0);
+      Collections.sort(matches, new Comparator<LookupResult>() {
+          public int compare(LookupResult left, LookupResult right) {
+            int cmp = Float.compare(right.value, left.value);
+            if (cmp == 0) {
+              return left.compareTo(right);
+            } else {
+              return cmp;
+            }
+          }
+        });
+      if (matches.size() > topN) {
+        matches.subList(topN, matches.size()).clear();
+      }
+
+      assertEquals(matches.size(), r.size());
+
+      for(int hit=0;hit<r.size();hit++) {
+        assertEquals(matches.get(hit).key.toString(), r.get(hit).key.toString());
+        assertEquals(matches.get(hit).value, r.get(hit).value, 0f);
+      }
+    }
+  }
+}
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java (revision 1385122)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java (working copy)
     MinResult<Long> completions[] = null;
     try {
-      completions = Util.shortestPaths(fst, arc, weightComparator, num);
-    } catch (IOException bogus) { throw new RuntimeException(bogus); }
+      completions = Util.shortestPaths(fst, arc, prefixOutput, weightComparator, num, !exactFirst);
+    } catch (IOException bogus) {
+      throw new RuntimeException(bogus);
+    }

     BytesRef suffix = new BytesRef(8);
     for (MinResult<Long> completion : completions) {
@@ -184,7 +186,7 @@
       scratch.append(suffix);
       spare.grow(scratch.length);
       UnicodeUtil.UTF8toUTF16(scratch, spare);
-      results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput + completion.output)));
+      results.add(new LookupResult(spare.toString(), decodeWeight(completion.output)));
     }
     return results;
   }
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java (revision 0)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java (working copy)
@@ -0,0 +1,110 @@
+package org.apache.lucene.search.suggest.analyzing;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +import java.util.ArrayList; +import java.util.List; +import java.io.IOException; + +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.fst.FST; + +// TODO: move to core? nobody else uses it yet though... + +public class FSTUtil { + + /** Holds a pair (automaton, fst) of states and accumulated output in the intersected machine. */ + public static final class Path { + + /** Node in the automaton where path ends: */ + public final State state; + + /** Node in the FST where path ends: */ + public final FST.Arc fstNode; + + /** Output of the path so far: */ + T output; + + /** Input of the path so far: */ + public final IntsRef input; + + public Path(State state, FST.Arc fstNode, T output, IntsRef input) { + this.state = state; + this.fstNode = fstNode; + this.output = output; + this.input = input; + } + } + + /** Enumerates all paths in the automaton that also + * intersect the FST, accumulating the FST end node and + * output for each path. */ + public static List> intersectPrefixPaths(Automaton a, FST fst) throws IOException { + final List> queue = new ArrayList>(); + final List> endNodes = new ArrayList>(); + + queue.add(new Path(a.getInitialState(), + fst.getFirstArc(new FST.Arc()), + fst.outputs.getNoOutput(), + new IntsRef())); + + final FST.Arc scratchArc = new FST.Arc(); + final FST.BytesReader fstReader = fst.getBytesReader(0); + + //System.out.println("fst/a intersect"); + + while (queue.size() != 0) { + final Path path = queue.remove(queue.size()-1); + //System.out.println(" cycle path=" + path); + if (path.state.isAccept()) { + endNodes.add(path); + } + + IntsRef currentInput = path.input; + for(Transition t : path.state.getTransitions()) { + + // TODO: we can fix this if necessary: + if (t.getMin() != t.getMax()) { + throw new IllegalStateException("can only handle Transitions that match one character"); + } + + //System.out.println(" t=" + (char) t.getMin()); + + final FST.Arc nextArc = fst.findTargetArc(t.getMin(), path.fstNode, scratchArc, fstReader); + if (nextArc != null) { + //System.out.println(" fst matches"); + // Path continues: + IntsRef newInput = new IntsRef(currentInput.length + 1); + newInput.copyInts(currentInput); + newInput.ints[currentInput.length] = t.getMin(); + newInput.length = currentInput.length + 1; + + queue.add(new Path(t.getDest(), + new FST.Arc().copyFrom(nextArc), + fst.outputs.add(path.output, nextArc.output), + newInput)); + } + } + } + + return endNodes; + } +} \ No newline at end of file Property changes on: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java (revision 0) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java (working copy) @@ -0,0 +1,380 @@ +package org.apache.lucene.search.suggest.analyzing; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.fst.Sort; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.SpecialOperations; +import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FST.Arc; +import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.PairOutputs; +import org.apache.lucene.util.fst.PairOutputs.Pair; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.fst.Util.MinResult; + +/** + * Suggester that first analyzes the surface form, adds the + * analyzed from to a weighted FST, and then does the same + * thing at lookup time. This means lookup is based on the + * analyzed form while suggestions are still the surface + * form(s). + + *
+ * This can result in powerful suggester functionality. For
+ * example, if you use an analyzer removing stop words,
+ * then the partial text "ghost chr..." could see the
+ * suggestion "The Ghost of Christmas Past". If
+ * SynonymFilter is used to map wifi and wireless network to
+ * hotspot then the partial text "wirele..." could suggest
+ * "wifi router". Token normalization like stemmers, accent
+ * removal, etc., would allow suggestions to ignore such
+ * variations.
+ *
+ * NOTE: Although the {@link TermFreqIterator} API specifies + * floating point weights, input weights should be whole numbers. + * Input weights will be cast to a java integer, and any + * negative, infinite, or NaN values will be rejected. + * + * @lucene.experimental + */ +public class AnalyzingCompletionLookup extends Lookup { + + /** + * FST: + * input is the analyzed form, with a null byte between terms + * weights are encoded as costs: (Integer.MAX_VALUE-weight) + * surface is the original, unanalyzed form. + */ + private FST> fst = null; + + /** + * Analyzer that will be used for analyzing suggestions + */ + private final Analyzer analyzer; + + /** + * True if exact match suggestions should always be returned first. + */ + private final boolean exactFirst; + + /** + * Calls {@link #AnalyzingCompletionLookup(Analyzer,boolean) AnalyzingCompletionLookup(analyzer, true)} + */ + public AnalyzingCompletionLookup(Analyzer analyzer) { + this(analyzer, true); + } + + /** + * Creates a new suggester. + * + * @param analyzer Analyzer that will be used for analyzing suggestions. + * @param exactFirst true if suggestions that match the + * prefix exactly should always be returned first, regardless + * of score. This has no performance impact, but could result + * in low-quality suggestions. + */ + public AnalyzingCompletionLookup(Analyzer analyzer, boolean exactFirst) { + this.analyzer = analyzer; + this.exactFirst = exactFirst; + } + + @Override + public void build(TermFreqIterator iterator) throws IOException { + String prefix = getClass().getSimpleName(); + File directory = Sort.defaultTempDir(); + File tempInput = File.createTempFile(prefix, ".input", directory); + File tempSorted = File.createTempFile(prefix, ".sorted", directory); + + Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); + Sort.ByteSequencesReader reader = null; + BytesRef scratch = new BytesRef(); + + assert TokenStreamToAutomaton.POS_SEP < Byte.MAX_VALUE; + + BytesRef separator = new BytesRef(new byte[] { (byte)TokenStreamToAutomaton.POS_SEP }); + + // analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short) + boolean success = false; + byte buffer[] = new byte[8]; + try { + ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); + BytesRef surfaceForm; + while ((surfaceForm = iterator.next()) != null) { + + // Analyze surface form: + TokenStream ts = analyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString())); + + // Create corresponding automaton: labels are bytes + // from each analyzed token, with byte 0 used as + // separator between tokens: + Automaton automaton = TokenStreamToAutomaton.toAutomaton(ts); + ts.end(); + ts.close(); + assert SpecialOperations.isFinite(automaton); + + // Get all paths from the automaton (there can be + // more than one path, eg if the analyzer created a + // graph using SynFilter or WDF): + + // nocommit: we should probably not wire this param to -1 but have a reasonable limit?! 
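+        // (a limit of -1 means no limit on the number of paths; a graph-producing
+        // analyzer, e.g. one using SynFilter, can multiply the number of paths)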
+ Set paths = SpecialOperations.getFiniteStrings(automaton, -1); + for (IntsRef path : paths) { + + Util.toBytesRef(path, scratch); + + // length of the analyzed text (FST input) + short analyzedLength = (short) scratch.length; + // compute the required length: + // analyzed sequence + 12 (separator) + weight (4) + surface + analyzedLength (short) + int requiredLength = analyzedLength + 2 + 4 + surfaceForm.length + 2; + + buffer = ArrayUtil.grow(buffer, requiredLength); + + output.reset(buffer); + output.writeBytes(scratch.bytes, scratch.offset, scratch.length); + output.writeByte((byte)0); // separator: not used, just for sort order + output.writeByte((byte)0); // separator: not used, just for sort order + output.writeInt(encodeWeight(iterator.weight())); + output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length); + output.writeShort(analyzedLength); + writer.write(buffer, 0, output.getPosition()); + } + } + writer.close(); + + // Sort all input/output pairs (required by FST.Builder): + new Sort().sort(tempInput, tempSorted); + reader = new Sort.ByteSequencesReader(tempSorted); + + PairOutputs outputs = new PairOutputs(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()); + Builder> builder = new Builder>(FST.INPUT_TYPE.BYTE1, outputs); + + // Build FST: + BytesRef previous = null; + BytesRef analyzed = new BytesRef(); + BytesRef surface = new BytesRef(); + IntsRef scratchInts = new IntsRef(); + ByteArrayDataInput input = new ByteArrayDataInput(); + while (reader.read(scratch)) { + input.reset(scratch.bytes, scratch.offset, scratch.length); + input.setPosition(input.length()-2); + short analyzedLength = input.readShort(); + + analyzed.bytes = scratch.bytes; + analyzed.offset = scratch.offset; + analyzed.length = analyzedLength; + + input.setPosition(analyzedLength + 2); // analyzed sequence + separator + long cost = input.readInt(); + + surface.bytes = scratch.bytes; + surface.offset = input.getPosition(); + surface.length = input.length() - input.getPosition() - 2; + + if (previous == null) { + previous = new BytesRef(); + } else if (analyzed.equals(previous)) { + // nocommit: "extend" duplicates with useless + // increasing bytes (it wont matter) ... or we + // could use multiple outputs for a single input? + // this would be more efficient? + continue; + } + Util.toIntsRef(analyzed, scratchInts); + // nocommit (why?) + builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface))); + previous.copyBytes(analyzed); + } + fst = builder.finish(); + + //Util.dotToFile(fst, "/tmp/suggest.dot"); + + success = true; + } finally { + if (success) { + IOUtils.close(reader, writer); + } else { + IOUtils.closeWhileHandlingException(reader, writer); + } + + tempInput.delete(); + tempSorted.delete(); + } + } + + @Override + public boolean store(OutputStream output) throws IOException { + try { + fst.save(new OutputStreamDataOutput(output)); + } finally { + IOUtils.close(output); + } + return true; + } + + @Override + public boolean load(InputStream input) throws IOException { + try { + this.fst = new FST>(new InputStreamDataInput(input), new PairOutputs(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton())); + } finally { + IOUtils.close(input); + } + return true; + } + + @Override + public List lookup(CharSequence key, boolean onlyMorePopular, int num) { + assert num > 0; + Arc> arc = new Arc>(); + + //System.out.println("lookup"); + + // TODO: is there a Reader from a CharSequence? 
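+    // (for now we materialize the CharSequence via toString() and wrap it in a StringReader)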
+ // Turn tokenstream into automaton: + Automaton automaton; + try { + TokenStream ts = analyzer.tokenStream("", new StringReader(key.toString())); + automaton = TokenStreamToAutomaton.toAutomaton(ts); + ts.end(); + ts.close(); + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + + // TODO: we can optimize this somewhat by determinizing + // while we convert + automaton = Automaton.minimize(automaton); + + List results = new ArrayList(num); + CharsRef spare = new CharsRef(); + + //System.out.println(" now intersect exactFirst=" + exactFirst); + + // Intersect automaton w/ suggest wFST and get all + // prefix starting nodes & their outputs: + final List>> prefixPaths; + try { + prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst); + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + + // nocommit maybe nuke exactFirst...? but... it's useful? + if (exactFirst) { + for (FSTUtil.Path> path : prefixPaths) { + if (path.fstNode.isFinal()) { + BytesRef prefix = BytesRef.deepCopyOf(path.output.output2); + prefix.append(path.fstNode.nextFinalOutput.output2); + spare.grow(prefix.length); + UnicodeUtil.UTF8toUTF16(prefix, spare); + results.add(new LookupResult(spare.toString(), decodeWeight(path.output.output1 + path.fstNode.nextFinalOutput.output1))); + if (--num == 0) { + // nocommit hmm should we order all "exact" + // matches by their .output1s, then return those + // top n...? + return results; // that was quick + } + } + } + } + + Util.TopNSearcher> searcher = new Util.TopNSearcher>(fst, num, weightComparator); + for (FSTUtil.Path> path : prefixPaths) { + try { + searcher.addStartPaths(path.fstNode, path.output, !exactFirst, path.input); + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + MinResult> completions[] = null; + try { + completions = searcher.search(); + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + + for (MinResult> completion : completions) { + spare.grow(completion.output.output2.length); + UnicodeUtil.UTF8toUTF16(completion.output.output2, spare); + results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1))); + } + + return results; + } + + /** + * Returns the weight associated with an input string, + * or null if it does not exist. 
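+   * <p>(Currently unsupported: this method always throws {@link UnsupportedOperationException}.)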
+ */ + public Object get(CharSequence key) { + throw new UnsupportedOperationException(); + } + + /** cost -> weight */ + private static int decodeWeight(long encoded) { + return (int)(Integer.MAX_VALUE - encoded); + } + + /** weight -> cost */ + private static int encodeWeight(long value) { + if (value < 0 || value > Integer.MAX_VALUE) { + throw new UnsupportedOperationException("cannot encode value: " + value); + } + return Integer.MAX_VALUE - (int)value; + } + + static final Comparator> weightComparator = new Comparator> () { + public int compare(Pair left, Pair right) { + return left.output1.compareTo(right.output1); + } + }; +} Property changes on: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/test-framework/src/java/org/apache/lucene/util/RollingBuffer.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/util/RollingBuffer.java (revision 1385122) +++ lucene/test-framework/src/java/org/apache/lucene/util/RollingBuffer.java (working copy) @@ -1,133 +0,0 @@ -package org.apache.lucene.util; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// TODO: probably move this to core at some point (eg, -// cutover kuromoji, synfilter, LookaheadTokenFilter) - -/** Acts like forever growing T[], but internally uses a - * circular buffer to reuse instances of T. - * - * @lucene.internal */ -public abstract class RollingBuffer { - - /** - * Implement to reset an instance - */ - public static interface Resettable { - public void reset(); - } - - @SuppressWarnings("unchecked") private T[] buffer = (T[]) new RollingBuffer.Resettable[8]; - - // Next array index to write to: - private int nextWrite; - - // Next position to write: - private int nextPos; - - // How many valid Position are held in the - // array: - private int count; - - public RollingBuffer() { - for(int idx=0;idx 0) { - if (nextWrite == -1) { - nextWrite = buffer.length - 1; - } - buffer[nextWrite--].reset(); - count--; - } - nextWrite = 0; - nextPos = 0; - count = 0; - } - - // For assert: - private boolean inBounds(int pos) { - return pos < nextPos && pos >= nextPos - count; - } - - private int getIndex(int pos) { - int index = nextWrite - (nextPos - pos); - if (index < 0) { - index += buffer.length; - } - return index; - } - - /** Get T instance for this absolute position; - * this is allowed to be arbitrarily far "in the - * future" but cannot be before the last freeBefore. 
*/ - public T get(int pos) { - //System.out.println("RA.get pos=" + pos + " nextPos=" + nextPos + " nextWrite=" + nextWrite + " count=" + count); - while (pos >= nextPos) { - if (count == buffer.length) { - @SuppressWarnings("unchecked") T[] newBuffer = (T[]) new Resettable[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - //System.out.println(" grow length=" + newBuffer.length); - System.arraycopy(buffer, nextWrite, newBuffer, 0, buffer.length-nextWrite); - System.arraycopy(buffer, 0, newBuffer, buffer.length-nextWrite, nextWrite); - for(int i=buffer.length;i index=" + index); - //assert buffer[index].pos == pos; - return buffer[index]; - } - - public void freeBefore(int pos) { - final int toFree = count - (nextPos - pos); - assert toFree >= 0; - assert toFree <= count: "toFree=" + toFree + " count=" + count; - int index = nextWrite - count; - if (index < 0) { - index += buffer.length; - } - for(int i=0;i strings = SpecialOperations.getFiniteStrings(a, -1); + assertEquals(2, strings.size()); + IntsRef dog = new IntsRef(); + Util.toIntsRef(new BytesRef("dog"), dog); + assertTrue(strings.contains(dog)); + IntsRef duck = new IntsRef(); + Util.toIntsRef(new BytesRef("duck"), duck); + assertTrue(strings.contains(duck)); + } } Index: lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java =================================================================== --- lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (revision 1385122) +++ lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (working copy) @@ -2011,9 +2011,11 @@ //w.close(); Util.MinResult[] r = Util.shortestPaths(fst, - fst.getFirstArc(new FST.Arc()), - minLongComparator, - 3); + fst.getFirstArc(new FST.Arc()), + outputs.getNoOutput(), + minLongComparator, + 3, + true); assertEquals(3, r.length); assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), r[0].input); @@ -2053,9 +2055,11 @@ //w.close(); Util.MinResult>[] r = Util.shortestPaths(fst, - fst.getFirstArc(new FST.Arc>()), - minPairWeightComparator, - 3); + fst.getFirstArc(new FST.Arc>()), + outputs.getNoOutput(), + minPairWeightComparator, + 3, + true); assertEquals(3, r.length); assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), r[0].input); @@ -2127,7 +2131,7 @@ final int topN = _TestUtil.nextInt(random, 1, 10); - Util.MinResult[] r = Util.shortestPaths(fst, arc, minLongComparator, topN); + Util.MinResult[] r = Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minLongComparator, topN, true); // 2. go thru whole treemap (slowCompletor) and check its actually the best suggestion final List> matches = new ArrayList>(); @@ -2231,7 +2235,7 @@ final int topN = _TestUtil.nextInt(random, 1, 10); - Util.MinResult>[] r = Util.shortestPaths(fst, arc, minPairWeightComparator, topN); + Util.MinResult>[] r = Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minPairWeightComparator, topN, true); // 2. go thru whole treemap (slowCompletor) and check its actually the best suggestion final List>> matches = new ArrayList>>(); Index: lucene/core/src/java/org/apache/lucene/util/RollingBuffer.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/RollingBuffer.java (working copy) +++ lucene/core/src/java/org/apache/lucene/util/RollingBuffer.java (working copy) @@ -112,6 +112,12 @@ return buffer[index]; } + /** Returns the maximum position looked up, or -1 if no + * position has been looked up sinc reset/init. 
*/ + public int getMaxPos() { + return nextPos-1; + } + public void freeBefore(int pos) { final int toFree = count - (nextPos - pos); assert toFree >= 0; Index: lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 1385122) +++ lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java (working copy) @@ -35,6 +35,8 @@ import java.util.Set; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.fst.Util; /** * Special automata operations. @@ -209,4 +211,56 @@ a.clearNumberedStates(); return accept; } + + /** + * Returns the set of accepted strings, assuming that at most + * limit strings are accepted. If more than limit + * strings are accepted, null is returned. If limit<0, then + * the limit is infinite. + */ + public static Set getFiniteStrings(Automaton a, int limit) { + HashSet strings = new HashSet(); + if (a.isSingleton()) { + if (limit > 0) { + strings.add(Util.toUTF32(a.singleton, new IntsRef())); + } else { + return null; + } + } else if (!getFiniteStrings(a.initial, new HashSet(), strings, new IntsRef(), limit)) { + return null; + } + return strings; + } + + /** + * Returns the strings that can be produced from the given state, or + * false if more than limit strings are found. + * limit<0 means "infinite". + */ + private static boolean getFiniteStrings(State s, HashSet pathstates, + HashSet strings, IntsRef path, int limit) { + pathstates.add(s); + for (Transition t : s.getTransitions()) { + if (pathstates.contains(t.to)) { + return false; + } + for (int n = t.min; n <= t.max; n++) { + path.grow(path.length+1); + path.ints[path.length] = n; + path.length++; + if (t.to.accept) { + strings.add(IntsRef.deepCopyOf(path)); + if (limit >= 0 && strings.size() > limit) { + return false; + } + } + if (!getFiniteStrings(t.to, pathstates, strings, path, limit)) { + return false; + } + path.length--; + } + } + pathstates.remove(s); + return true; + } } Index: lucene/core/src/java/org/apache/lucene/util/fst/Util.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/Util.java (revision 1385122) +++ lucene/core/src/java/org/apache/lucene/util/fst/Util.java (working copy) @@ -233,13 +233,14 @@ private static class FSTPath implements Comparable> { public FST.Arc arc; public T cost; - public final IntsRef input = new IntsRef(); + public final IntsRef input; final Comparator comparator; - public FSTPath(T cost, FST.Arc arc, Comparator comparator) { + public FSTPath(T cost, FST.Arc arc, Comparator comparator, IntsRef input) { this.arc = new FST.Arc().copyFrom(arc); this.cost = cost; this.comparator = comparator; + this.input = input; } @Override @@ -258,11 +259,15 @@ } } - private static class TopNSearcher { + /** Utility class to find top N shortest paths from start + * point(s). 
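+   *  Seed the search with one or more calls to {@link #addStartPaths}, then run it with {@link #search}.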
*/ + public static class TopNSearcher { private final FST fst; - private final FST.Arc fromNode; + private final FST.BytesReader bytesReader; private final int topN; + + private final FST.Arc scratchArc = new FST.Arc(); final Comparator comparator; @@ -271,11 +276,13 @@ TreeSet> queue = null; - public TopNSearcher(FST fst, FST.Arc fromNode, int topN, Comparator comparator) { + public TopNSearcher(FST fst, int topN, Comparator comparator) { this.fst = fst; + this.bytesReader = fst.getBytesReader(0); this.topN = topN; - this.fromNode = fromNode; this.comparator = comparator; + + queue = new TreeSet>(); } // If back plus this arc is competitive then add to queue: @@ -308,12 +315,19 @@ // Queue isn't full yet, so any path we hit competes: } - final FSTPath newPath = new FSTPath(cost, path.arc, comparator); + // copy over the current input to the new input + // and add the arc.label to the end + IntsRef newInput = new IntsRef(path.input.length+1); + System.arraycopy(path.input.ints, 0, newInput.ints, 0, path.input.length); + newInput.ints[path.input.length] = path.arc.label; + newInput.length = path.input.length+1; + final FSTPath newPath = new FSTPath(cost, path.arc, comparator, newInput); - newPath.input.grow(path.input.length+1); - System.arraycopy(path.input.ints, 0, newPath.input.ints, 0, path.input.length); - newPath.input.ints[path.input.length] = path.arc.label; - newPath.input.length = path.input.length+1; + // this is pointless right? we do it above already: + //newPath.input.grow(path.input.length+1); + //System.arraycopy(path.input.ints, 0, newPath.input.ints, 0, path.input.length); + //newPath.input.ints[path.input.length] = path.arc.label; + //newPath.input.length = path.input.length+1; //System.out.println(" add path=" + newPath); queue.add(newPath); @@ -329,9 +343,42 @@ } } + /** Adds all leaving arcs, including 'finished' arc, if + * the node is final, from this node into the queue. 
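+   *  If allowEmptyString is false, arcs with label {@link FST#END_LABEL} are skipped, so an already-completed path is not added.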
*/ + public void addStartPaths(FST.Arc node, T startOutput, boolean allowEmptyString, IntsRef input) throws IOException { + + T minArcCost = null; + FST.Arc minArc = null; + + // De-dup NO_OUTPUT since it must be a singleton: + if (startOutput.equals(fst.outputs.getNoOutput())) { + startOutput = fst.outputs.getNoOutput(); + } + + FSTPath path = new FSTPath(startOutput, node, comparator, input); + fst.readFirstTargetArc(node, path.arc, bytesReader); + + //System.out.println("add start paths"); + + // Bootstrap: find the min starting arc + while (true) { + if (allowEmptyString || path.arc.label != FST.END_LABEL) { + T arcScore = path.arc.output; + if (minArcCost == null || comparator.compare(arcScore, minArcCost) < 0) { + minArcCost = arcScore; + minArc = scratchArc.copyFrom(path.arc); + //System.out.println(" **"); + } + addIfCompetitive(path); + } + if (path.arc.isLast()) { + break; + } + fst.readNextArc(path.arc, bytesReader); + } + } + public MinResult[] search() throws IOException { - //System.out.println(" search topN=" + topN); - final FST.Arc scratchArc = new FST.Arc(); final List> results = new ArrayList>(); @@ -352,69 +399,21 @@ FSTPath path; if (queue == null) { + // Ran out of paths + break; + } - if (results.size() != 0) { - // Ran out of paths - break; - } + // Remove top path since we are now going to + // pursue it: + path = queue.pollFirst(); - // First pass (top path): start from original fromNode - if (topN > 1) { - queue = new TreeSet>(); - } + if (path == null) { + // There were less than topN paths available: + break; + } - T minArcCost = null; - FST.Arc minArc = null; + //System.out.println(" remove init path=" + path); - path = new FSTPath(NO_OUTPUT, fromNode, comparator); - fst.readFirstTargetArc(fromNode, path.arc, fstReader); - - // Bootstrap: find the min starting arc - while (true) { - T arcScore = path.arc.output; - //System.out.println(" arc=" + (char) path.arc.label + " cost=" + arcScore); - if (minArcCost == null || comparator.compare(arcScore, minArcCost) < 0) { - minArcCost = arcScore; - minArc = scratchArc.copyFrom(path.arc); - //System.out.println(" **"); - } - if (queue != null) { - addIfCompetitive(path); - } - if (path.arc.isLast()) { - break; - } - fst.readNextArc(path.arc, fstReader); - } - - assert minArc != null; - - if (queue != null) { - // Remove top path since we are now going to - // pursue it: - path = queue.pollFirst(); - //System.out.println(" remove init path=" + path); - assert path.arc.label == minArc.label; - if (bottom != null && queue.size() == topN-1) { - bottom = queue.last(); - //System.out.println(" set init bottom: " + bottom); - } - } else { - path.arc.copyFrom(minArc); - path.input.grow(1); - path.input.ints[0] = minArc.label; - path.input.length = 1; - path.cost = minArc.output; - } - - } else { - path = queue.pollFirst(); - if (path == null) { - // There were less than topN paths available: - break; - } - } - if (path.arc.label == FST.END_LABEL) { //System.out.println(" empty string! cost=" + path.cost); // Empty string! @@ -521,14 +520,19 @@ } /** Starting from node, find the top N min cost - * completions to a final node. + * completions to a final node. * *
NOTE: you must share the outputs when you build the * FST (pass doShare=true to {@link * PositiveIntOutputs#getSingleton}). */ + public static MinResult[] shortestPaths(FST fst, FST.Arc fromNode, T startOutput, Comparator comparator, int topN, + boolean allowEmptyString) throws IOException { + TopNSearcher searcher = new TopNSearcher(fst, topN, comparator); - public static MinResult[] shortestPaths(FST fst, FST.Arc fromNode, Comparator comparator, int topN) throws IOException { - return new TopNSearcher(fst, fromNode, topN, comparator).search(); + // since this search is initialized with a single start node + // it is okay to start with an empty input path here + searcher.addStartPaths(fromNode, startOutput, allowEmptyString, new IntsRef()); + return searcher.search(); } /** @@ -837,4 +841,14 @@ scratch.length = input.length; return scratch; } + + // Uncomment for debugging: + + /* + public static void dotToFile(FST fst, String filePath) throws IOException { + Writer w = new OutputStreamWriter(new FileOutputStream(filePath)); + toDot(fst, w, true, true); + w.close(); + } + */ } Index: lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java (revision 1385122) +++ lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java (working copy) @@ -118,7 +118,7 @@ private boolean valid(Long o) { assert o != null; - assert o == NO_OUTPUT || o > 0; + assert o == NO_OUTPUT || o > 0: "o=" + o; return true; } Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java =================================================================== --- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (working copy) @@ -0,0 +1,206 @@ +package org.apache.lucene.analysis; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RollingBuffer; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Transition; + +// TODO: maybe also toFST? 
then we can translate atts into FST outputs/weights
+// nocommit need option to not add pos sep...?
+
+/** Consumes a TokenStream and creates an {@link Automaton}
+ *  where the transition labels are UTF8 bytes from the {@link
+ *  TermToBytesRefAttribute}.  Between tokens we insert
+ *  POS_SEP and for holes we insert HOLE. */
+public class TokenStreamToAutomaton {
+
+  private static class Position implements RollingBuffer.Resettable {
+    // Any tokens that ended at our position arrive to this state:
+    State arriving;
+
+    // Any tokens that start at our position leave from this state:
+    State leaving;
+
+    @Override
+    public void reset() {
+      arriving = null;
+      leaving = null;
+    }
+  }
+
+  private static class Positions extends RollingBuffer<Position> {
+    @Override
+    protected Position newInstance() {
+      return new Position();
+    }
+  }
+
+  /** We create a transition between two adjacent tokens. */
+  // nocommit should we use 256? ie, outside of the utf8
+  // byte range...
+  public static final int POS_SEP = 0;
+
+  /** We add this arc to represent a hole. */
+  // nocommit should we use 257? ie, outside of the utf8
+  // byte range...
+  public static final int HOLE = 1;
+
+  /** Pulls the graph (including {@link
+   *  PositionLengthAttribute}) from the provided {@link
+   *  TokenStream}, and creates the corresponding
+   *  automaton where arcs are bytes from each term. */
+  public static Automaton toAutomaton(TokenStream in) throws IOException {
+    final Automaton a = new Automaton();
+
+    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
+    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
+    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
+    final BytesRef term = termBytesAtt.getBytesRef();
+
+    in.reset();
+
+    // Only temporarily holds states ahead of our current
+    // position:
+
+    final RollingBuffer<Position> positions = new Positions();
+
+    int pos = -1;
+    Position posData = null;
+
+    while (in.incrementToken()) {
+      int posInc = posIncAtt.getPositionIncrement();
+      if (pos == -1 && posInc == 0) {
+        // TODO: hmm are TS's still allowed to do this...?
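+        // be lenient and treat a leading posInc of 0 as if it were 1: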
+        posInc = 1;
+      }
+
+      if (posInc > 0) {
+
+        // New node:
+        pos += posInc;
+
+        posData = positions.get(pos);
+        assert posData.leaving == null;
+
+        if (posData.arriving == null) {
+          // No token ever arrived to this position
+          if (pos == 0) {
+            // OK: this is the first token
+            posData.leaving = a.getInitialState();
+          } else {
+            // This means there's a hole (eg, StopFilter
+            // does this):
+            posData.leaving = new State();
+            addHoles(a.getInitialState(), positions, pos);
+          }
+        } else {
+          posData.leaving = new State();
+          posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
+          if (posInc > 1) {
+            // A token spanned over a hole; add holes
+            // "under" it:
+            addHoles(a.getInitialState(), positions, pos);
+          }
+        }
+        positions.freeBefore(pos);
+      }
+
+      final int endPos = pos + posLengthAtt.getPositionLength();
+
+      termBytesAtt.fillBytesRef();
+      final Position endPosData = positions.get(endPos);
+      if (endPosData.arriving == null) {
+        endPosData.arriving = new State();
+      }
+
+      State state = posData.leaving;
+      for(int byteIDX=0;byteIDX<term.length;byteIDX++) {
+        final State nextState = byteIDX == term.length-1 ? endPosData.arriving : new State();
+        state.addTransition(new Transition(term.bytes[term.offset + byteIDX] & 0xff, nextState));
+        state = nextState;
+      }
+    }
+
+    // Mark any remaining arriving states as accept states:
+    pos++;
+    while (pos <= positions.getMaxPos()) {
+      posData = positions.get(pos);
+      if (posData.arriving != null) {
+        posData.arriving.setAccept(true);
+      }
+      pos++;
+    }
+
+    return a;
+  }
+
+  private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {
+    Position posData = positions.get(pos);
+    Position prevPosData = positions.get(pos-1);
+
+    while(posData.arriving == null || prevPosData.leaving == null) {
+      if (posData.arriving == null) {
+        posData.arriving = new State();
+        posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
+      }
+      if (prevPosData.leaving == null) {
+        if (pos == 1) {
+          prevPosData.leaving = startState;
+        } else {
+          prevPosData.leaving = new State();
+        }
+        if (prevPosData.arriving != null) {
+          prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving));
+        }
+      }
+      prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving));
+      pos--;
+      if (pos <= 0) {
+        break;
+      }
+      posData = prevPosData;
+      prevPosData = positions.get(pos-1);
+    }
+  }
+}
Property changes on: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
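
Usage sketch (not part of the patch): a minimal example of how the new suggester is wired together, using only the API added above. The analyzer variable is an arbitrary placeholder; any Analyzer works, including one that removes stop words, as exercised by AnalyzingCompletionTest.testStandard().

  TermFreq[] keys = new TermFreq[] {
    new TermFreq("the ghost of christmas past", 50),
  };

  // One-arg constructor defaults to exactFirst=true:
  AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(analyzer);
  suggester.build(new TermFreqArrayIterator(keys));

  // The lookup key is analyzed too, so the stop words may be omitted:
  List<LookupResult> results = suggester.lookup("ghost of chris", false, 1);
  for (LookupResult result : results) {
    System.out.println(result.key + " / " + result.value);
  }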