Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java =================================================================== --- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision 1453943) +++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (working copy) @@ -23,6 +23,7 @@ import java.io.Writer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; @@ -37,7 +38,9 @@ /** Consumes a TokenStream and creates an {@link Automaton} * where the transition labels are UTF8 bytes from the {@link * TermToBytesRefAttribute}. Between tokens we insert - * POS_SEP and for holes we insert HOLE. */ + * POS_SEP and for holes we insert HOLE. + * + * @lucene.experimental */ public class TokenStreamToAutomaton { /** Sole constructor. */ @@ -89,6 +92,7 @@ final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class); final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class); + final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class); final BytesRef term = termBytesAtt.getBytesRef(); @@ -101,7 +105,7 @@ int pos = -1; Position posData = null; - + int maxOffset = 0; while (in.incrementToken()) { int posInc = posIncAtt.getPositionIncrement(); assert pos > -1 || posInc > 0; @@ -157,13 +161,26 @@ state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState)); state = nextState; } + + maxOffset = Math.max(maxOffset, offsetAtt.endOffset()); } + in.end(); + State endState = null; + if (offsetAtt.endOffset() > maxOffset) { + endState = new State(); + endState.setAccept(true); + } + pos++; while (pos <= positions.getMaxPos()) { posData = positions.get(pos); if (posData.arriving != null) { - posData.arriving.setAccept(true); + if (endState != null) { + posData.arriving.addTransition(new Transition(POS_SEP, endState)); + } else { + posData.arriving.setAccept(true); + } } pos++; } Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (revision 1453943) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (working copy) @@ -567,6 +567,7 @@ while(true) { key = ""; analyzedKey = ""; + boolean lastRemoved = false; for(int token=0;token < numTokens;token++) { String s; while (true) { @@ -582,10 +583,12 @@ } key += s; if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) { + lastRemoved = true; if (preserveSep && preserveHoles) { analyzedKey += SEP; } } else { + lastRemoved = false; analyzedKey += s; } break; @@ -595,6 +598,10 @@ analyzedKey = analyzedKey.replaceAll("(^|" + SEP + ")" + SEP + "$", ""); + if (preserveSep && lastRemoved) { + analyzedKey += SEP; + } + // Don't add same surface form more than once: if (!seen.contains(key)) { seen.add(key); @@ -642,6 +649,7 @@ // "Analyze" the key: String[] tokens = prefix.split(" "); StringBuilder builder = new StringBuilder(); + boolean lastRemoved = false; for(int i=0;i 0 && !builder.toString().endsWith(""+SEP)) { @@ -652,8 +660,10 @@ if (preserveSep && preserveHoles) { builder.append(SEP); } + lastRemoved = true; } else { builder.append(token); + lastRemoved = false; } } @@ -676,6 +686,10 @@ continue; } + if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) { + analyzedKey += SEP; + } + if (VERBOSE) { System.out.println(" analyzed: " + analyzedKey); } @@ -1060,4 +1074,15 @@ })); assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString()); } + + public void testEndingSpace() throws Exception { + Analyzer a = new MockAnalyzer(random()); + AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1); + suggester.build(new TermFreqArrayIterator(new TermFreq[] { + new TermFreq("i love lucy", 7), + new TermFreq("isla de muerta", 8), + })); + assertEquals("[isla de muerta/8, i love lucy/7]", suggester.lookup("i", false, 3).toString()); + assertEquals("[i love lucy/7]", suggester.lookup("i ", false, 3).toString()); + } } Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (revision 1453943) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (working copy) @@ -594,6 +594,7 @@ while(true) { key = ""; analyzedKey = ""; + boolean lastRemoved = false; for(int token=0;token < numTokens;token++) { String s; while (true) { @@ -612,8 +613,10 @@ if (preserveSep && preserveHoles) { analyzedKey += '\u0000'; } + lastRemoved = true; } else { analyzedKey += s; + lastRemoved = false; } break; } @@ -622,6 +625,10 @@ analyzedKey = analyzedKey.replaceAll("(^| )\u0000$", ""); + if (preserveSep && lastRemoved) { + analyzedKey += " "; + } + // Don't add same surface form more than once: if (!seen.contains(key)) { seen.add(key); @@ -669,6 +676,7 @@ // "Analyze" the key: String[] tokens = prefix.split(" "); StringBuilder builder = new StringBuilder(); + boolean lastRemoved = false; for(int i=0;i 0 && !builder.toString().endsWith(" ")) { @@ -679,8 +687,10 @@ if (preserveSep && preserveHoles) { builder.append("\u0000"); } + lastRemoved = true; } else { builder.append(token); + lastRemoved = false; } } @@ -704,6 +714,10 @@ continue; } + if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) { + analyzedKey += " "; + } + if (VERBOSE) { System.out.println(" analyzed: " + analyzedKey); } Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 1453943) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (working copy) @@ -736,7 +736,7 @@ // from each analyzed token, with byte 0 used as // separator between tokens: Automaton automaton = ts2a.toAutomaton(ts); - ts.end(); + //ts.end(); ts.close(); replaceSep(automaton); @@ -758,7 +758,7 @@ // Turn tokenstream into automaton: TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString())); Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts); - ts.end(); + //ts.end(); ts.close(); // TODO: we could use the end offset to "guess"