Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (revision 1404830) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (working copy) @@ -810,10 +810,10 @@ Collections.shuffle(keys, random()); suggester.build(new TermFreqArrayIterator(keys)); - assertEquals("[foo bar baz/50, foo bar/40]", suggester.lookup("foobar", false, 5).toString()); - assertEquals("[foo bar baz/50]", suggester.lookup("foobarbaz", false, 5).toString()); - assertEquals("[barbaz/60, barbazfoo/10]", suggester.lookup("bar baz", false, 5).toString()); - assertEquals("[barbazfoo/10]", suggester.lookup("bar baz foo", false, 5).toString()); + assertEquals("[[foo bar] baz/50, [foo bar]/40]", suggester.lookup("foobar", false, 5).toString()); + assertEquals("[[foo bar baz]/50]", suggester.lookup("foobarbaz", false, 5).toString()); + assertEquals("[[barbaz]/60, [barbaz]foo/10]", suggester.lookup("bar baz", false, 5).toString()); + assertEquals("[[barbazfoo]/10]", suggester.lookup("bar baz foo", false, 5).toString()); } private static String addRandomEdit(String string, int prefixLength) { @@ -1125,4 +1125,15 @@ } return ref; } + + public void testPrefixLength() throws Exception { + Analyzer a = new MockAnalyzer(random()); + AnalyzingSuggester suggester = new FuzzySuggester(a); + + suggester.build(new TermFreqArrayIterator(new TermFreq[] { + new TermFreq("abcfoo", 6), + new TermFreq("abcbar", 5), + })); + assertEquals("[[abc]foo/6, [abc]bar/5]", suggester.lookup("acb", false, 2).toString()); + } } Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java =================================================================== --- 
lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (revision 1404830) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (working copy) @@ -806,7 +806,7 @@ new TermFreq("a c b", 1), })); - List results = suggester.lookup("a", false, 4); + suggester.lookup("a", false, 4); } public void testExactFirstMissingResult() throws Exception { @@ -983,4 +983,15 @@ assertEquals("b", results.get(1).key); assertEquals(5, results.get(1).value); } + + public void testPrefixLength() throws Exception { + Analyzer a = new MockAnalyzer(random()); + AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1); + + suggester.build(new TermFreqArrayIterator(new TermFreq[] { + new TermFreq("abcfoo", 6), + new TermFreq("abcbar", 5), + })); + assertEquals("[[abc]foo/6, [abc]bar/5]", suggester.lookup("abc", false, 2).toString()); + } } Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java (revision 1404830) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java (working copy) @@ -39,8 +39,15 @@ public static final class LookupResult implements Comparable { /** the key's text */ public final CharSequence key; + /** the key's weight */ public final long value; + + /** Set by some suggesters to indicate the length of + * the key that "roughly" corresponds to the user's + * input. The remainder of the key was derived by + * completion. It is -1 when no prefix length was supplied. */ + public final int prefixLength; /** * Create a new result from a key+weight pair. @@ -48,11 +55,26 @@ public LookupResult(CharSequence key, long value) { this.key = key; this.value = value; + prefixLength = -1; } + + /** + * Create a new result from key, weight and prefixLength. 
+ */ + public LookupResult(CharSequence key, long value, int prefixLength) { + this.key = key; + this.value = value; + this.prefixLength = prefixLength; + } @Override public String toString() { - return key + "/" + value; + if (prefixLength >= 0 && prefixLength <= key.length()) { + String stringKey = key.toString(); + return "[" + stringKey.substring(0, prefixLength) + "]" + stringKey.substring(prefixLength, key.length()) + "/" + value; + } else { /* prefixLength unset (-1) or outside the key: print the plain key */ + return key + "/" + value; + } } /** Compare alphabetically. */ Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java (revision 1404830) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java (working copy) @@ -167,7 +167,7 @@ if (exactFirst && arc.isFinal()) { spare.grow(scratch.length); UnicodeUtil.UTF8toUTF16(scratch, spare); - results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput + arc.nextFinalOutput))); + results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput + arc.nextFinalOutput), key.length())); if (--num == 0) { return results; // that was quick } @@ -189,7 +189,7 @@ scratch.append(suffix); spare.grow(scratch.length); UnicodeUtil.UTF8toUTF16(scratch, spare); - results.add(new LookupResult(spare.toString(), decodeWeight(completion.output))); + results.add(new LookupResult(spare.toString(), decodeWeight(completion.output), key.length())); } return results; } Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 1404830) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (working copy) @@ -31,7 +31,6 @@ import 
org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStreamToAutomaton; -import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.fst.Sort; @@ -561,7 +560,11 @@ if (utf8Key.bytesEquals(completion.output.output2)) { spare.grow(completion.output.output2.length); UnicodeUtil.UTF8toUTF16(completion.output.output2, spare); - results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1))); + String input = spare.toString(); + + int prefixLength = UnicodeUtil.codePointCount(completion.startOutput.output2); + + results.add(new LookupResult(input, decodeWeight(completion.output.output1), prefixLength)); break; } } @@ -618,8 +621,12 @@ for(MinResult> completion : completions) { spare.grow(completion.output.output2.length); UnicodeUtil.UTF8toUTF16(completion.output.output2, spare); - LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1)); + String input = spare.toString(); + int prefixLength = UnicodeUtil.codePointCount(completion.startOutput.output2); + + LookupResult result = new LookupResult(input, decodeWeight(completion.output.output1), prefixLength); + // TODO: for fuzzy case would be nice to return // how many edits were required Index: lucene/core/src/java/org/apache/lucene/util/fst/Util.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/Util.java (revision 1404830) +++ lucene/core/src/java/org/apache/lucene/util/fst/Util.java (working copy) @@ -236,11 +236,13 @@ public FST.Arc arc; public T cost; public final IntsRef input; + public final T startOutput; - public FSTPath(T cost, FST.Arc arc, IntsRef input) { + public FSTPath(T cost, FST.Arc arc, IntsRef input, T startOutput) { this.arc = new 
FST.Arc().copyFrom(arc); this.cost = cost; this.input = input; + this.startOutput = startOutput; } @Override @@ -333,7 +335,7 @@ System.arraycopy(path.input.ints, 0, newInput.ints, 0, path.input.length); newInput.ints[path.input.length] = path.arc.label; newInput.length = path.input.length+1; - final FSTPath newPath = new FSTPath(cost, path.arc, newInput); + final FSTPath newPath = new FSTPath(cost, path.arc, newInput, path.startOutput); queue.add(newPath); @@ -351,7 +353,7 @@ startOutput = fst.outputs.getNoOutput(); } - FSTPath path = new FSTPath(startOutput, node, input); + FSTPath path = new FSTPath(startOutput, node, input, startOutput); fst.readFirstTargetArc(node, path.arc, bytesReader); //System.out.println("add start paths"); @@ -409,7 +411,7 @@ //System.out.println(" empty string! cost=" + path.cost); // Empty string! path.input.length--; - results.add(new MinResult(path.input, path.cost)); + results.add(new MinResult(path.input, path.cost, path.startOutput)); continue; } @@ -472,7 +474,7 @@ //System.out.println(" done!: " + path); T finalOutput = fst.outputs.add(path.cost, path.arc.output); if (acceptResult(path.input, finalOutput)) { - results.add(new MinResult(path.input, finalOutput)); + results.add(new MinResult(path.input, finalOutput, path.startOutput)); } else { rejectCount++; assert rejectCount + topN <= maxQueueDepth: "maxQueueDepth (" + maxQueueDepth + ") is too small for topN (" + topN + "): rejected " + rejectCount + " paths"; @@ -502,9 +504,12 @@ public final static class MinResult { public final IntsRef input; public final T output; - public MinResult(IntsRef input, T output) { + public final T startOutput; + + public MinResult(IntsRef input, T output, T startOutput) { this.input = input; this.output = output; + this.startOutput = startOutput; } } Index: lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java =================================================================== --- 
lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java (revision 1404830) +++ lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java (working copy) @@ -415,9 +415,11 @@ }; - /** Returns the number of code points in this utf8 - * sequence. Behavior is undefined if the utf8 sequence - * is invalid.*/ + /** Returns the number of complete code points in this utf8 + * sequence. If the utf8 sequence ends part way through a + * code point then that code point is not counted. If + * the utf8 sequence is otherwise invalid then behavior + * is undefined. */ public static int codePointCount(BytesRef utf8) { int upto = utf8.offset; final int limit = utf8.offset + utf8.length; @@ -427,6 +429,10 @@ codePointCount++; upto += utf8CodeLength[bytes[upto]&0xFF]; } + if (upto > limit) { + // nocommit need test coverage: + codePointCount--; + } return codePointCount; }