Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java (revision 0)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java (working copy)
@@ -0,0 +1,153 @@
+package org.apache.lucene.search.suggest.analyzing;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.search.spell.TermFreqIterator;
+import org.apache.lucene.search.suggest.Lookup.LookupResult;
+import org.apache.lucene.search.suggest.TermFreq;
+import org.apache.lucene.search.suggest.TermFreqArrayIterator;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LineFileDocs;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+import org.junit.Ignore;
+
+public class TestFreeTextSuggester extends LuceneTestCase {
+
+  public void testBasic() throws Exception {
+    Iterable<TermFreq> keys = shuffle(
+        new TermFreq("foo bar baz blah boo foo bar foo bee", 50)
+    );
+
+    FreeTextSuggester sug = new FreeTextSuggester(new MockAnalyzer(random()));
+    sug.build(new TermFreqArrayIterator(keys));
+
+    List<LookupResult> results = sug.lookup("foo b", true, 10);
+    assertEquals(2, results.size());
+    assertEquals("foo bar", results.get(0).key);
+    assertEquals(2, results.get(0).value);
+    assertEquals("foo bee", results.get(1).key);
+    assertEquals(1, results.get(1).value);
+
+    results = sug.lookup("foo ", true, 10);
+    assertEquals(2, results.size());
+    assertEquals("foo bar", results.get(0).key);
+    assertEquals(2, results.get(0).value);
+    assertEquals("foo bee", results.get(1).key);
+    assertEquals(1, results.get(1).value);
+
+    // Try again after save/load:
+    File tmpDir = _TestUtil.getTempDir("FreeTextSuggesterTest");
+    tmpDir.mkdir();
+
+    File path = new File(tmpDir, "suggester");
+
+    OutputStream os = new FileOutputStream(path);
+    sug.store(os);
+    os.close();
+
+    InputStream is = new FileInputStream(path);
+    sug = new FreeTextSuggester(new MockAnalyzer(random()));
+    sug.load(is);
+    is.close();
+
+    results = sug.lookup("foo b", true, 10);
+    assertEquals(2, results.size());
+    assertEquals("foo bar", results.get(0).key);
+    assertEquals(2, results.get(0).value);
+    assertEquals("foo bee", results.get(1).key);
+    assertEquals(1, results.get(1).value);
+
+    results = sug.lookup("foo ", true, 10);
+    assertEquals(2, results.size());
+    assertEquals("foo bar", results.get(0).key);
+    assertEquals(2, results.get(0).value);
+    assertEquals("foo bee", results.get(1).key);
+    assertEquals(1, results.get(1).value);
+  }
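+
+  // For reference: "foo bar baz blah boo foo bar foo bee" yields the
+  // bigrams foo bar, bar baz, baz blah, blah boo, boo foo, foo bar,
+  // bar foo, foo bee -- so "foo bar" occurs twice and "foo bee" once,
+  // which is where the expected values 2 and 1 above come from.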
+
+  @Ignore
+  public void testWiki() throws Exception {
+    final LineFileDocs lfd = new LineFileDocs(null, "/lucenedata/enwiki/enwiki-20120502-lines-1k.txt", false);
+    // Skip header:
+    lfd.nextDoc();
+    FreeTextSuggester sug = new FreeTextSuggester(new MockAnalyzer(random()));
+    sug.build(new TermFreqIterator() {
+
+        private int count;
+
+        @Override
+        public long weight() {
+          return 1;
+        }
+
+        @Override
+        public Comparator<BytesRef> getComparator() {
+          return BytesRef.getUTF8SortedAsUnicodeComparator();
+        }
+
+        @Override
+        public BytesRef next() {
+          Document doc;
+          try {
+            doc = lfd.nextDoc();
+          } catch (IOException ioe) {
+            throw new RuntimeException(ioe);
+          }
+          if (doc == null) {
+            return null;
+          }
+          if (count++ == 10000) {
+            return null;
+          }
+          return new BytesRef(doc.get("body"));
+        }
+      });
+    System.out.println(sug.sizeInBytes() + " bytes");
+
+    List<LookupResult> results = sug.lookup("general r", true, 10);
+    System.out.println("results:");
+    for(LookupResult result : results) {
+      System.out.println("  " + result);
+    }
+  }
+
+  @SafeVarargs
+  public final <T> Iterable<T> shuffle(T... values) {
+    final List<T> asList = new ArrayList<T>(values.length);
+    for (T value : values) {
+      asList.add(value);
+    }
+    Collections.shuffle(asList, random());
+    return asList;
+  }
+}

Property changes on: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java (revision 0)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java (working copy)
@@ -0,0 +1,612 @@
+package org.apache.lucene.search.suggest.analyzing;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// TODO
+//  - how come "history of " fails to find completions in
+//    aol?
+//  - maybe shingle never produces x _ y? or even _ y?
+//  - grr it's the "no pos inc after end" problem!
+//  - even minimal stemmer is awful: his -> hi; does
+//    stemmer take exception list?
+//  - test w/ syns
+//  - why does "music of " not differ from "music of"? it
+//    should be "music _" vs "music _ "? oh, no ... no pos
+//    inc set on end()?
+//  - can I combine all into a single FST so we can "share"?
+//  - pruning of low-freq ngrams?
+//  - optional fuzz?
+//  - shingle does not generate edge tokens (e.g. "_ foo")
+//  - make gram counting more ... efficient?
+//  - what to do w/ the incoming "weights"!? i could
+//    e.g. use that to further increment the ngram counts
+//  - how to pick the "best" surface form...
+//  - expose control over separator char for shingles
+//  - maybe instead of also storing surface form we could
+//    ... only present the analyzed form? this would
+//    require that analysis is "light" and that any
+//    injected syns are "presentable"
+//  - lingpipe tutorials http://alias-i.com/lingpipe/demos/tutorial/read-me.html
+//  - better name? Predictive? FreeText? OpenText?
+//    LMSuggester? LanguageModel?
+//  - exact first?
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.AnalyzerWrapper;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.search.spell.TermFreqIterator;
+import org.apache.lucene.search.spell.TermFreqPayloadIterator;
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.Sort;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.SpecialOperations;
+import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST.Arc;
+import org.apache.lucene.util.fst.FST.BytesReader;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.Outputs;
+import org.apache.lucene.util.fst.PairOutputs.Pair;
+import org.apache.lucene.util.fst.PairOutputs;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util.MinResult;
+import org.apache.lucene.util.fst.Util;
+
+/**
+ * NOTE: requires sizable temp disk space
+ * NOTE: resulting FSTs require sizable RAM
+ *
+ * @lucene.experimental
+ */
+public class FreeTextSuggester extends Lookup {
+
+  public final static String CODEC_NAME = "freetextsuggest";
+  public final static int VERSION_START = 0;
+  public final static int VERSION_CURRENT = VERSION_START;
+
+  /** Holds 1gram, 2gram, 3gram models as a single FST. */
+  //private FST<Pair<Long,BytesRef>> fst;
+  private FST<Long> fst;
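+
+  // All gram orders live in the one FST: keys are the analyzed,
+  // space-separated grams ("foo", "foo bar", "foo bar baz"), so a
+  // key's order is implied by its separator count; each key's output
+  // is its occurrence count, encoded via encodeWeight below.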
+
+  /**
+   * Analyzer that will be used for analyzing suggestions at
+   * index time.
+   */
+  private final Analyzer indexAnalyzer;
+
+  private long totTokens;
+
+  /**
+   * Analyzer that will be used for analyzing suggestions at
+   * query time.
+   */
+  private final Analyzer queryAnalyzer;
+
+  // 2 = bigram, 3 = trigram
+  private final int grams;
+
+  public static final int DEFAULT_GRAM = 2;
+
+  public FreeTextSuggester(Analyzer analyzer) {
+    this(analyzer, analyzer, DEFAULT_GRAM);
+  }
+
+  public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
+    this(indexAnalyzer, queryAnalyzer, DEFAULT_GRAM);
+  }
+
+  public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int grams) {
+    this.indexAnalyzer = addShingles(indexAnalyzer);
+    this.queryAnalyzer = addShingles(queryAnalyzer);
+    this.grams = grams;
+  }
+
+  /** Returns byte size of the underlying FST. */
+  public long sizeInBytes() {
+    if (fst == null) {
+      return 0;
+    }
+    return fst.sizeInBytes();
+  }
+
+  private static class AnalyzingComparator implements Comparator<BytesRef> {
+
+    private final ByteArrayDataInput readerA = new ByteArrayDataInput();
+    private final ByteArrayDataInput readerB = new ByteArrayDataInput();
+    private final BytesRef scratchA = new BytesRef();
+    private final BytesRef scratchB = new BytesRef();
+
+    @Override
+    public int compare(BytesRef a, BytesRef b) {
+      readerA.reset(a.bytes, a.offset, a.length);
+      readerB.reset(b.bytes, b.offset, b.length);
+
+      // By token:
+      scratchA.length = readerA.readShort();
+      scratchA.bytes = a.bytes;
+      scratchA.offset = readerA.getPosition();
+
+      scratchB.bytes = b.bytes;
+      scratchB.length = readerB.readShort();
+      scratchB.offset = readerB.getPosition();
+
+      int cmp = scratchA.compareTo(scratchB);
+      if (cmp != 0) {
+        return cmp;
+      }
+      readerA.skipBytes(scratchA.length);
+      readerB.skipBytes(scratchB.length);
+
+      // By length (smaller surface forms sorted first):
+      cmp = a.length - b.length;
+      if (cmp != 0) {
+        return cmp;
+      }
+
+      // By surface form:
+      scratchA.offset = readerA.getPosition();
+      scratchA.length = a.length - scratchA.offset;
+      scratchB.offset = readerB.getPosition();
+      scratchB.length = b.length - scratchB.offset;
+
+      return scratchA.compareTo(scratchB);
+    }
+  }
+
+  private Analyzer addShingles(final Analyzer other) {
+    // Tack on ShingleFilter to the end, to generate token ngrams:
+    return new AnalyzerWrapper() {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return other;
+      }
+
+      @Override
+      protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
+        // nocommit should we change the separator char...
+        ShingleFilter shingles = new ShingleFilter(components.getTokenStream(), 2, grams);
+        return new TokenStreamComponents(components.getTokenizer(), shingles);
+      }
+    };
+  }
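+
+  // Illustration: with grams=3 the wrapped analyzer turns "please
+  // divide this" into the stream: please, "please divide", "please
+  // divide this", divide, "divide this", this -- unigrams plus 2- and
+  // 3-shingles joined by the ' ' separator.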
+
+  @Override
+  public void build(TermFreqIterator iterator) throws IOException {
+    String prefix = getClass().getSimpleName();
+    File directory = Sort.defaultTempDir();
+    File tempInput = File.createTempFile(prefix, ".input", directory);
+    File tempSorted = File.createTempFile(prefix, ".sorted", directory);
+
+    if (iterator instanceof TermFreqPayloadIterator) {
+      throw new IllegalArgumentException("payloads are not supported");
+    }
+
+    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
+    Sort.ByteSequencesReader reader = null;
+    BytesRef scratch = new BytesRef();
+    byte buffer[] = new byte[8];
+
+    ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
+
+    totTokens = 0;
+    boolean success = false;
+    try {
+      BytesRef surfaceForm;
+
+      while ((surfaceForm = iterator.next()) != null) {
+
+        TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString());
+        TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
+        BytesRef token = termBytesAtt.getBytesRef();
+
+        String surfaceFormString = surfaceForm.utf8ToString();
+
+        ts.reset();
+        while (ts.incrementToken()) {
+          termBytesAtt.fillBytesRef();
+          totTokens++;
+          String surfaceToken = surfaceFormString.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
+          UnicodeUtil.UTF16toUTF8(surfaceToken, 0, surfaceToken.length(), scratch);
+          int requiredLength = token.length + 2 + scratch.length;
+
+          // nocommit maybe ShingleFilter could set token
+          // type by gram count?
+          // nocommit throw exc if empty string token tries
+          // to appear!!
+
+          buffer = ArrayUtil.grow(buffer, requiredLength);
+          // nocommit fail here if the token has the SEP char!
+
+          output.reset(buffer);
+          output.writeShort((short) token.length);
+          output.writeBytes(token.bytes, token.offset, token.length);
+          output.writeBytes(scratch.bytes, scratch.offset, scratch.length);
+
+          // nocommit we could be quite a bit more efficient by
+          // buffering up the "common" ngrams in ram,
+          // periodically flushing the "long tail" to the
+          // writer ... you can then configure how much RAM
+          // to use...
+
+          assert output.getPosition() == requiredLength: output.getPosition() + " vs " + requiredLength;
+          writer.write(buffer, 0, output.getPosition());
+        }
+
+        ts.end();
+        ts.close();
+      }
+      writer.close();
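+
+      // Each record written above is laid out as:
+      //   short   - length of the analyzed gram, in bytes
+      //   byte[]  - the analyzed gram (shingle), using ' ' as separator
+      //   byte[]  - the surface form covered by the gram's offsets
+      // AnalyzingComparator decodes exactly this layout to sort by
+      // gram, then by total length, then by surface form.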
+
+      // Sort all input/output pairs (required by FST.Builder):
+      new Sort(new AnalyzingComparator()).sort(tempInput, tempSorted);
+
+      // Free disk space:
+      tempInput.delete();
+
+      reader = new Sort.ByteSequencesReader(tempSorted);
+
+      //PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
+      //Builder<Pair<Long,BytesRef>> builder = new Builder<Pair<Long,BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
+      Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
+      Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
+
+      IntsRef scratchInts = new IntsRef();
+      BytesRef lastToken = new BytesRef();
+      BytesRef token = new BytesRef();
+
+      BytesRef surface = new BytesRef();
+      ByteArrayDataInput input = new ByteArrayDataInput();
+
+      // Count of the "current" ngram:
+      long dupCount = 0;
+      while (reader.read(scratch)) {
+        input.reset(scratch.bytes, scratch.offset, scratch.length);
+
+        token.length = input.readShort();
+        token.grow(token.length);
+        input.readBytes(token.bytes, 0, token.length);
+
+        if (!token.bytesEquals(lastToken)) {
+          //if (lastGramCount == 2 && dupCount > 10) {
+          //System.out.println("add " + lastToken.utf8ToString() + " count=" + dupCount);
+          //}
+          //builder.add(Util.toIntsRef(lastToken, scratchInts), outputs.newPair(encodeWeight(dupCount), surface));
+          builder.add(Util.toIntsRef(lastToken, scratchInts), encodeWeight(dupCount));
+          dupCount = 1;
+          lastToken.copyBytes(token);
+
+          // nocommit we pick the surface form that sorts
+          // first to "represent" this token ... is there a
+          // better/explicit way?
+          surface.length = scratch.length - input.getPosition();
+          surface.grow(surface.length);
+          input.readBytes(surface.bytes, 0, surface.length);
+        } else {
+          dupCount++;
+        }
+      }
+
+      if (dupCount > 0) {
+        //builder.add(Util.toIntsRef(lastToken, scratchInts), outputs.newPair(encodeWeight(dupCount), surface));
+        builder.add(Util.toIntsRef(lastToken, scratchInts), encodeWeight(dupCount));
+      }
+
+      // nocommit must null check the fst:
+      fst = builder.finish();
+
+      //Util.dotToFile(fst, "/tmp/suggest.dot");
+
+      success = true;
+    } finally {
+      if (success) {
+        IOUtils.close(reader, writer);
+      } else {
+        IOUtils.closeWhileHandlingException(reader, writer);
+      }
+
+      tempInput.delete();
+      tempSorted.delete();
+    }
+  }
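+
+  // Note: weights are stored as costs so that the FST's min-cost
+  // top-N search surfaces the most frequent grams first: a gram seen
+  // count times is stored as totTokens - count (see
+  // encodeWeight/decodeWeight below).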
+
+  @Override
+  public boolean store(OutputStream output) throws IOException {
+    DataOutput out = new OutputStreamDataOutput(output);
+    CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
+    out.writeVLong(totTokens);
+    // nocommit should we save grams?
+    // nocommit must null check the fst:
+    fst.save(out);
+    System.out.println("sizeInBytes=" + fst.sizeInBytes());
+    return true;
+  }
+
+  @Override
+  public boolean load(InputStream input) throws IOException {
+    DataInput in = new InputStreamDataInput(input);
+    CodecUtil.checkHeader(in, CODEC_NAME, VERSION_START, VERSION_START);
+    totTokens = in.readVLong();
+
+    // nocommit should we load grams & verify it matches ctor?
+
+    //fst = new FST<Pair<Long,BytesRef>>(in,
+    //    new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
+    fst = new FST<Long>(in, PositiveIntOutputs.getSingleton());
+
+    return true;
+  }
+
+  @Override
+  public List<LookupResult> lookup(final CharSequence key, boolean onlyMorePopular, int num) {
+    try {
+      return _lookup(key, onlyMorePopular, num);
+    } catch (IOException ioe) {
+      // bogus:
+      throw new RuntimeException(ioe);
+    }
+  }
+
+  private List<LookupResult> _lookup(final CharSequence key, boolean onlyMorePopular, int num) throws IOException {
+    List<BytesRef> tokens = new ArrayList<BytesRef>();
+    TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
+    TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
+    ts.reset();
+
+    BytesRef[] lastTokens = new BytesRef[grams];
+
+    // Run full analysis, but only save last 1gram, last
+    // 2gram, etc.:
+    BytesRef token = termBytesAtt.getBytesRef();
+    int maxEndOffset = -1;
+    while(ts.incrementToken()) {
+      termBytesAtt.fillBytesRef();
+      // nocommit maybe ShingleFilter could set token
+      // type by gram count?
+      int gramCount = 0;
+      for(int i=0;i<token.length;i++) {
+        if (token.bytes[token.offset+i] == ' ') {
+          gramCount++;
+        }
+      }
+      lastTokens[gramCount] = BytesRef.deepCopyOf(token);
+      maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
+    }
+    ts.end();
+
+    boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset;
+    ts.close();
+
+    if (lastTokenEnded) {
+      // If user hit space after the last token, then
+      // "upgrade" all tokens.  This way "foo " will suggest
+      // all bigrams starting w/ foo:
+      for(int i=grams-1;i>0;i--) {
+        token = lastTokens[i-1];
+        if (token == null) {
+          continue;
+        }
+        token.grow(token.length+1);
+        // nocommit don't hardwire to space:
+        token.bytes[token.length] = ' ';
+        token.length++;
+        lastTokens[i] = token;
+      }
+      lastTokens[0] = null;
+    }
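+
+    // Illustration: with grams=3 and query "foo bar ", analysis leaves
+    // lastTokens = {"bar", "foo bar", null}; the upgrade above shifts
+    // that to {null, "bar ", "foo bar "}, so the trigram model can
+    // complete "foo bar _" and the bigram model falls back to "bar _".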
+
+    //Arc<Pair<Long,BytesRef>> arc = new Arc<Pair<Long,BytesRef>>();
+    Arc<Long> arc = new Arc<Long>();
+
+    // nocommit we need smoothing/backoff here, such that we
+    // "merge" suggestions across all the models, rather
+    // than just returning if 3gram found only 1 prediction
+
+    BytesReader bytesReader = fst.getBytesReader();
+
+    // Try highest order models first, and if they return
+    // results, return that; else, fallback:
+    for(int gram=grams-1;gram>=0;gram--) {
+      //System.out.println("try " + (gram+1) + "gram:");
+      token = lastTokens[gram];
+      if (token == null) {
+        // Input didn't have enough tokens:
+        //System.out.println("  skip: not enough input");
+        continue;
+      }
+
+      // nocommit add fuzz here?
+      // match the prefix portion exactly
+      //Pair<Long,BytesRef> prefixOutput = null;
+      Long prefixOutput = null;
+      try {
+        prefixOutput = lookupPrefix(fst, bytesReader, token, arc);
+      } catch (IOException bogus) {
+        throw new RuntimeException(bogus);
+      }
+
+      if (prefixOutput == null) {
+        // This model never saw this prefix, e.g. the
+        // trigram model never saw context "purple mushroom"
+        continue;
+      }
+
+      List<LookupResult> results = new ArrayList<LookupResult>(num);
+      CharsRef spare = new CharsRef();
+      /* TODO:
+      if (exactFirst && arc.isFinal()) {
+        spare.grow(scratch.length);
+        UnicodeUtil.UTF8toUTF16(scratch, spare);
+        results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput + arc.nextFinalOutput)));
+        if (--num == 0) {
+          return results; // that was quick
+        }
+      }
+      */
+
+      // complete top-N
+      //MinResult<Pair<Long,BytesRef>> completions[] = null;
+      MinResult<Long> completions[] = null;
+      try {
+
+        // Because we store multiple models in one FST
+        // (1gram, 2gram, 3gram), we must restrict the
+        // search so that it only considers the current
+        // model.  For the highest order model, this is not
+        // necessary since all completions in the FST
+        // must be from this model, but for lower order
+        // models we have to filter out the higher order
+        // ones:
+
+        // nocommit must turn off pruning for lower order models!
+        //Util.TopNSearcher<Pair<Long,BytesRef>> searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, num, num, weightComparator) {
+        Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num, weightComparator) {
+            @Override
+            //protected void addIfCompetitive(Util.FSTPath<Pair<Long,BytesRef>> path) {
+            protected void addIfCompetitive(Util.FSTPath<Long> path) {
+
+              // nocommit don't hardwire SEP byte to space!!:
+              if (path.arc.label != ' ') {
+                super.addIfCompetitive(path);
+              } else {
+                //System.out.println("prevent path");
+              }
+            }
+          };
+
+        // since this search is initialized with a single start node
+        // it is okay to start with an empty input path here
+        searcher.addStartPaths(arc, prefixOutput, true, new IntsRef());
+
+        completions = searcher.search();
+      } catch (IOException bogus) {
+        throw new RuntimeException(bogus);
+      }
+
+      int prefixLength = token.length;
+
+      BytesRef suffix = new BytesRef(8);
+      //for (MinResult<Pair<Long,BytesRef>> completion : completions) {
+      for (MinResult<Long> completion : completions) {
+        token.length = prefixLength;
+        // append suffix
+        Util.toBytesRef(completion.input, suffix);
+        token.append(suffix);
+        spare.grow(token.length);
+        UnicodeUtil.UTF8toUTF16(token, spare);
+        results.add(new LookupResult(spare.toString(), (int) decodeWeight(completion.output)));
+      }
+      return results;
+    }
+
+    return Collections.emptyList();
+  }
+
+  /** weight -> cost */
+  private long encodeWeight(long ngramCount) {
+    return totTokens - ngramCount;
+  }
+
+  /** cost -> weight */
+  //private long decodeWeight(Pair<Long,BytesRef> output) {
+  private long decodeWeight(Long output) {
+    return (int)(totTokens - output);
+  }
+
+  // NOTE: copied from WFSTCompletionLookup & tweaked
+  //private Pair<Long,BytesRef> lookupPrefix(FST<Pair<Long,BytesRef>> fst, FST.BytesReader bytesReader,
+  //                                         BytesRef scratch, Arc<Pair<Long,BytesRef>> arc) throws /*Bogus*/IOException {
+  private Long lookupPrefix(FST<Long> fst, FST.BytesReader bytesReader,
+                            BytesRef scratch, Arc<Long> arc) throws /*Bogus*/IOException {
+
+    //Pair<Long,BytesRef> output = fst.outputs.getNoOutput();
+    Long output = fst.outputs.getNoOutput();
+
+    fst.getFirstArc(arc);
+
+    byte[] bytes = scratch.bytes;
+    int pos = scratch.offset;
+    int end = pos + scratch.length;
+    while (pos < end) {
+      if (fst.findTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null) {
+        return null;
+      } else {
+        output = fst.outputs.add(output, arc.output);
+      }
+    }
+
+    return output;
+  }
+
+  /*
+  static final Comparator<Pair<Long,BytesRef>> weightComparator = new Comparator<Pair<Long,BytesRef>>() {
+      @Override
+      public int compare(Pair<Long,BytesRef> left, Pair<Long,BytesRef> right) {
+        return left.output1.compareTo(right.output1);
+      }
+  };
+  */
+  static final Comparator<Long> weightComparator = new Comparator<Long>() {
+      @Override
+      public int compare(Long left, Long right) {
+        return left.compareTo(right);
+      }
+  };
+
+  /**
+   * Returns the weight associated with an input string,
+   * or null if it does not exist.
+   */
+  public Object get(CharSequence key) {
+    throw new UnsupportedOperationException();
+  }
+}

Property changes on: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/core/src/java/org/apache/lucene/util/fst/Util.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/fst/Util.java (revision 1523427)
+++ lucene/core/src/java/org/apache/lucene/util/fst/Util.java (working copy)
@@ -238,11 +238,13 @@
     }
   }
 
-  private static class FSTPath<T> {
+  /** Represents a path in TopNSearcher. */
+  public static class FSTPath<T> {
     public FST.Arc<T> arc;
     public T cost;
     public final IntsRef input;
 
+    /** Sole constructor */
     public FSTPath(T cost, FST.Arc<T> arc, IntsRef input) {
       this.arc = new FST.Arc<T>().copyFrom(arc);
       this.cost = cost;
@@ -300,7 +302,7 @@
     }
 
     // If back plus this arc is competitive then add to queue:
-    private void addIfCompetitive(FSTPath<T> path) {
+    protected void addIfCompetitive(FSTPath<T> path) {
       assert queue != null;
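
A minimal standalone driver for trying the patch out, separate from the patch itself. This is only a sketch: it assumes a 4.x checkout with the patch applied, and the analyzer, Version constant (WhitespaceAnalyzer, Version.LUCENE_45) and demo phrases are placeholders, not part of the patch:

import java.util.List;

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.TermFreq;
import org.apache.lucene.search.suggest.TermFreqArrayIterator;
import org.apache.lucene.search.suggest.analyzing.FreeTextSuggester;
import org.apache.lucene.util.Version;

public class FreeTextSuggesterDemo {
  public static void main(String[] args) throws Exception {
    // Build a trigram model (grams=3); FreeTextSuggester wraps the
    // analyzer with a ShingleFilter itself, so a plain whitespace
    // analyzer is enough here:
    FreeTextSuggester sug = new FreeTextSuggester(
        new WhitespaceAnalyzer(Version.LUCENE_45),
        new WhitespaceAnalyzer(Version.LUCENE_45), 3);

    sug.build(new TermFreqArrayIterator(new TermFreq[] {
        new TermFreq("wheel of fortune", 1),
        new TermFreq("wheel of time", 1),
    }));

    // The trailing space asks for the word following "of":
    for (LookupResult result : sug.lookup("wheel of ", true, 5)) {
      System.out.println(result.key + " (weight " + result.value + ")");
    }
  }
}

Run against the patched suggest module, this should print both "wheel of fortune" and "wheel of time", since the trigram model completes "wheel of _".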