Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java	(revision 511bf50bbfe670375d519588cdd88e249c9e1441)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java	(revision )
@@ -159,7 +159,7 @@
   /** Represents the separation between tokens, if
    *  PRESERVE_SEP was specified */
-  private static final int SEP_LABEL = 0xff;
+  private static final int SEP_LABEL = '\u001F';
 
   /** Marks end of the analyzed input and start of dedup
    *  byte. */
@@ -306,44 +306,12 @@
     }
   }
 
-  /** Just escapes the 0xff byte (which we still use for SEP). */
-  private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
-
-    final BytesRef spare = new BytesRef();
-
-    @Override
-    protected BytesRef changeToken(BytesRef in) {
-      int upto = 0;
-      for(int i=0;i<in.length;i++) {
 
     List<LookupResult> results = new ArrayList<LookupResult>();
-    List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);
+    List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
 
     if (exactFirst) {
@@ -864,6 +840,7 @@
       ts.close();
 
       replaceSep(automaton);
+      automaton = convertAutomaton(automaton);
 
       assert SpecialOperations.isFinite(automaton);
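A note on the SEP_LABEL change above: the byte 0xff never occurs in valid UTF-8, which is why it was a safe byte-level separator (with escaping for binary terms), but once arcs can be Unicode code points, 0xff is the real letter 'ÿ'. Moving the separator to the control character U+001F makes it safe in both alphabets, at the cost that lookup keys may no longer contain U+001E or U+001F — the suggester now rejects such keys (see testIllegalLookupArgument further down). A minimal sketch of the resulting contract; the analyzer, version constant and key text are illustrative, not part of the patch:

    Analyzer a = new StandardAnalyzer(Version.LUCENE_43);  // any Analyzer works
    AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1);
    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
        new TermFreq("lucy", 7),
    }));
    try {
      suggester.lookup("a\u001F", false, 3);   // U+001F is now SEP_LABEL
      // unreachable: the key contains a reserved character
    } catch (IllegalArgumentException expected) {
      // keys containing U+001E or U+001F are rejected up front
    }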
Index: solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java	(revision 511bf50bbfe670375d519588cdd88e249c9e1441)
+++ solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java	(revision )
@@ -33,23 +33,33 @@
 public class FuzzyLookupFactory extends LookupFactory {
 
   /**
+   * If true, maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
+   * will be measured in Unicode code points (actual letters) instead of bytes.
+   */
+  public static final String UNICODE_AWARE = "unicodeAware";
+
+  /**
    * Maximum number of edits allowed, used by {@link LevenshteinAutomata#toAutomaton(int)}
+   * in bytes or Unicode code points (if {@link #UNICODE_AWARE} option is set to true).
    */
   public static final String MAX_EDITS = "maxEdits";
 
   /**
    * If transpositions are allowed, Fuzzy suggestions will be computed based on a primitive
    * edit operation. If it is false, it will be based on the classic Levenshtein algorithm.
+   * Transpositions are measured in bytes or Unicode code points (if {@link #UNICODE_AWARE} option is set to true).
    */
   public static final String TRANSPOSITIONS = "transpositions";
 
   /**
    * Length of common (non-fuzzy) prefix for the suggestions
+   * in bytes or Unicode code points (if {@link #UNICODE_AWARE} option is set to true).
    */
   public static final String NON_FUZZY_PREFIX = "nonFuzzyPrefix";
 
   /**
    * Minimum length of lookup key before any edits are allowed for the suggestions
+   * in bytes or Unicode code points (if {@link #UNICODE_AWARE} option is set to true).
    */
   public static final String MIN_FUZZY_LENGTH = "minFuzzyLength";
@@ -113,9 +123,13 @@
         ? Integer.parseInt(params.get(MIN_FUZZY_LENGTH).toString())
         :FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH;
 
+    boolean unicodeAware = (params.get(UNICODE_AWARE) != null)
+        ? Boolean.valueOf(params.get(UNICODE_AWARE).toString())
+        : FuzzySuggester.DEFAULT_UNICODE_AWARE;
+
     return new FuzzySuggester(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, maxEdits,
-                              transpositions, nonFuzzyPrefix, minFuzzyLength);
+                              transpositions, nonFuzzyPrefix, minFuzzyLength, unicodeAware);
   }
 
   @Override
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java	(revision 511bf50bbfe670375d519588cdd88e249c9e1441)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java	(revision )
@@ -48,7 +48,9 @@
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicOperations;
 import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.UTF32ToUTF8;
 import org.apache.lucene.util.fst.Util;
 
 public class FuzzySuggesterTest extends LuceneTestCase {
@@ -60,7 +62,9 @@
       keys.add(new TermFreq("boo" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
     }
     keys.add(new TermFreq("foo bar boo far", 12));
-    FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
+    MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
+    FuzzySuggester suggester = new FuzzySuggester(analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
+        0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH);
     suggester.build(new TermFreqArrayIterator(keys));
     int numIters = atLeast(10);
     for (int i = 0; i < numIters; i++) {
@@ -72,6 +76,27 @@
     }
   }
 
+  public void testNonLatinRandomEdits() throws IOException {
+    List<TermFreq> keys = new ArrayList<TermFreq>();
+    int numTerms = atLeast(100);
+    for (int i = 0; i < numTerms; i++) {
+      keys.add(new TermFreq("буу" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
+    }
+    keys.add(new TermFreq("фуу бар буу фар", 12));
+    MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
+    FuzzySuggester suggester = new FuzzySuggester(analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
+        0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, true);
+    suggester.build(new TermFreqArrayIterator(keys));
+    int numIters = atLeast(10);
+    for (int i = 0; i < numIters; i++) {
+      String addRandomEdit = addRandomEdit("фуу бар буу", 0);
+      List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
+      assertEquals(addRandomEdit, 1, results.size());
+      assertEquals("фуу бар буу фар", results.get(0).key.toString());
+      assertEquals(12, results.get(0).value, 0.01F);
+    }
+  }
+
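The Cyrillic test data above is the point of this new test: each Cyrillic letter occupies two bytes in UTF-8, so byte-level and letter-level edit counting diverge. A standalone sketch (plain JDK, not part of the patch) of the arithmetic that unicodeAware fixes:

    // "ф" is U+0444, encoded as two bytes (0xD1 0x84) in UTF-8:
    int byteLen = "ф".getBytes(java.nio.charset.StandardCharsets.UTF_8).length;  // 2
    int codePoints = "ф".codePointCount(0, "ф".length());                        // 1
    // Replacing one Cyrillic letter with another is one code-point edit,
    // but can be two byte-level edits -- already more than DEFAULT_MAX_EDITS (1).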
   /** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */
   public void testKeyword() throws Exception {
     TermFreq keys[] = new TermFreq[] {
@@ -580,12 +605,13 @@
     TermFreq[] keys = new TermFreq[numQueries];
 
     boolean preserveSep = random().nextBoolean();
+    boolean unicodeAware = random().nextBoolean();
 
     final int numStopChars = random().nextInt(10);
     final boolean preserveHoles = random().nextBoolean();
 
     if (VERBOSE) {
-      System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
+      System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " ; unicodeAware=" + unicodeAware + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
     }
 
     for (int i = 0; i < numQueries; i++) {
@@ -606,7 +632,7 @@
           if (token > 0) {
             key += " ";
           }
-          if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length()-1) != ' ') {
+          if (preserveSep && analyzedKey.length() > 0 && (unicodeAware ? analyzedKey.codePointBefore(analyzedKey.length()) != ' ' : analyzedKey.charAt(analyzedKey.length()-1) != ' ')) {
             analyzedKey += " ";
           }
           key += s;
@@ -659,7 +685,7 @@
     Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
     FuzzySuggester suggester = new FuzzySuggester(a, a,
-                                                  preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3);
+                                                  preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3, unicodeAware);
     suggester.build(new TermFreqArrayIterator(keys));
 
     for (String prefix : allPrefixes) {
@@ -728,7 +754,7 @@
       // us the "answer key" (ie maybe we have a bug in
       // suggester.toLevA ...) ... but testRandom2() fixes
       // this:
-      Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey));
+      Automaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
       assertTrue(automaton.isDeterministic());
       // TODO: could be faster... but its slowCompletor for a reason
       BytesRef spare = new BytesRef();
@@ -878,7 +904,8 @@
         // NOTE: can only use ascii here so that, in
         // UTF8 byte space it's still a single
         // insertion:
-        int x = random().nextInt(128);
+        // bytes 0x1e and 0x1f are reserved
+        int x = random().nextBoolean() ? random().nextInt(30) : 32 + random().nextInt(128 - 32);
         builder.append((char) x);
         for (int j = i; j < input.length; j++) {
           builder.append(input[j]);
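The two branches in the random-edit hunk above deliberately straddle the newly reserved code points: nextInt(30) yields 0..29 (everything below 0x1E) and 32 + nextInt(128 - 32) yields 32..127 (everything above 0x1F, still single-byte UTF-8). A quick self-contained check of that property (plain JDK, illustrative only):

    java.util.Random rnd = new java.util.Random();
    for (int trial = 0; trial < 100000; trial++) {
      int x = rnd.nextBoolean() ? rnd.nextInt(30) : 32 + rnd.nextInt(128 - 32);
      assert x != 0x1E && x != 0x1F;  // HOLE and SEP_LABEL are never generated
    }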
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java	(revision 511bf50bbfe670375d519588cdd88e249c9e1441)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java	(revision )
@@ -24,7 +24,6 @@
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.Reader;
-import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -48,8 +47,6 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.search.suggest.Lookup.LookupResult;
 import org.apache.lucene.search.suggest.TermFreq;
 import org.apache.lucene.search.suggest.TermFreqArrayIterator;
@@ -594,7 +591,7 @@
     }
   }
 
-  private static char SEP = '\uFFFF';
+  private static char SEP = '\u001F';
 
   public void testRandom() throws Exception {
@@ -822,70 +819,6 @@
     }
   }
 
-  public void testStolenBytes() throws Exception {
-
-    // First time w/ preserveSep, second time without:
-    for(int i=0;i<2;i++) {
-
-      final Analyzer analyzer = new Analyzer() {
-          @Override
-          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
-
-            // TokenStream stream = new SynonymFilter(tokenizer, map, true);
-            // return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
-            return new TokenStreamComponents(tokenizer) {
-              int tokenStreamCounter = 0;
-              final TokenStream[] tokenStreams = new TokenStream[] {
-                new CannedBinaryTokenStream(new BinaryToken[] {
-                    token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
-                  }),
-                new CannedTokenStream(new Token[] {
-                    token("a",1,1),
-                    token("a",1,1)
-                  }),
-                new CannedTokenStream(new Token[] {
-                    token("a",1,1),
-                    token("a",1,1)
-                  }),
-                new CannedBinaryTokenStream(new BinaryToken[] {
-                    token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
-                  })
-              };
-
-              @Override
-              public TokenStream getTokenStream() {
-                TokenStream result = tokenStreams[tokenStreamCounter];
-                tokenStreamCounter++;
-                return result;
-              }
-
-              @Override
-              protected void setReader(final Reader reader) throws IOException {
-              }
-            };
-          }
-        };
-
-      TermFreq keys[] = new TermFreq[] {
-        new TermFreq("a a", 50),
-        new TermFreq("a b", 50),
-      };
-
-      AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer, analyzer, AnalyzingSuggester.EXACT_FIRST | (i==0 ? AnalyzingSuggester.PRESERVE_SEP : 0), 256, -1);
-      suggester.build(new TermFreqArrayIterator(keys));
-      List<LookupResult> results = suggester.lookup("a a", false, 5);
-      assertEquals(1, results.size());
-      assertEquals("a b", results.get(0).key);
-      assertEquals(50, results.get(0).value);
-
-      results = suggester.lookup("a a", false, 5);
-      assertEquals(1, results.size());
-      assertEquals("a a", results.get(0).key);
-      assertEquals(50, results.get(0).value);
-    }
-  }
-
   public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
     Analyzer a = new MockAnalyzer(random());
     AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 2, -1);
@@ -1192,5 +1125,25 @@
     AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, 1);
     suggester.build(new TermFreqArrayIterator(new TermFreq[] {new TermFreq("a", 1)}));
     assertEquals("[a/1]", suggester.lookup("a", false, 1).toString());
+  }
+
+  public void testIllegalLookupArgument() throws Exception {
+    Analyzer a = new MockAnalyzer(random());
+    AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1);
+    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+      new TermFreq("а где Люси?", 7),
+    }));
+    try {
+      suggester.lookup("а\u001E", false, 3);
+      fail("should throw IllegalArgumentException");
+    } catch (IllegalArgumentException e) {
+      System.out.println(e.getMessage());
+    }
+    try {
+      suggester.lookup("а\u001F", false, 3);
+      fail("should throw IllegalArgumentException");
+    } catch (IllegalArgumentException e) {
+      System.out.println(e.getMessage());
+    }
+  }
 }
Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java	(revision 511bf50bbfe670375d519588cdd88e249c9e1441)
+++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java	(revision )
@@ -32,7 +32,8 @@
 // TODO: maybe also toFST?  then we can translate atts into FST outputs/weights
 
 /** Consumes a TokenStream and creates an {@link Automaton}
- *  where the transition labels are UTF8 bytes from the {@link
+ *  where the transition labels are UTF8 bytes (or Unicode
+ *  code points if unicodeArcs is true) from the {@link
 *  TermToBytesRefAttribute}.  Between tokens we insert
 *  POS_SEP and for holes we insert HOLE.
 *
 */
 public class TokenStreamToAutomaton {
 
   private boolean preservePositionIncrements;
+  private boolean unicodeArcs;
 
   /** Sole constructor. */
   public TokenStreamToAutomaton() {
@@ -51,6 +53,12 @@
     this.preservePositionIncrements = enablePositionIncrements;
   }
 
+  /** Whether to make transition labels Unicode code points instead of UTF8 bytes,
+   *  false by default */
+  public void setUnicodeArcs(boolean unicodeArcs) {
+    this.unicodeArcs = unicodeArcs;
+  }
+
   private static class Position implements RollingBuffer.Resettable {
     // Any tokens that ended at our position arrive to this state:
     State arriving;
@@ -80,15 +88,16 @@
   }
 
   /** We create transition between two adjacent tokens. */
-  public static final int POS_SEP = 256;
+  public static final int POS_SEP = 0x001f;
 
   /** We add this arc to represent a hole. */
-  public static final int HOLE = 257;
+  public static final int HOLE = 0x001e;
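With unicodeArcs, POS_SEP and HOLE must be real code points on the automaton's alphabet, so they can no longer sit at the out-of-byte-range labels 256/257; that is why they move into the C0 control range here. A hedged usage sketch of the new toggle — the analyzer, version constant and field name are illustrative, not part of the patch:

    TokenStream ts = new WhitespaceAnalyzer(Version.LUCENE_43)
        .tokenStream("field", new StringReader("фуу бар"));
    TokenStreamToAutomaton tsta = new TokenStreamToAutomaton();
    tsta.setUnicodeArcs(true);           // arc labels become Unicode code points
    Automaton a = tsta.toAutomaton(ts);  // POS_SEP (0x1F) joins the two tokens
    ts.close();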
   /** Pulls the graph (including {@link
    *  PositionLengthAttribute}) from the provided {@link
    *  TokenStream}, and creates the corresponding
-   *  automaton where arcs are bytes from each term. */
+   *  automaton where arcs are bytes (or Unicode code points
+   *  if unicodeArcs = true) from each term. */
   public Automaton toAutomaton(TokenStream in) throws IOException {
     final Automaton a = new Automaton();
     boolean deterministic = true;
@@ -156,16 +165,34 @@
         final int endPos = pos + posLengthAtt.getPositionLength();
 
         termBytesAtt.fillBytesRef();
-        final BytesRef term2 = changeToken(term);
+        final BytesRef termUTF8 = changeToken(term);
+        int[] termUnicode = null;
         final Position endPosData = positions.get(endPos);
         if (endPosData.arriving == null) {
           endPosData.arriving = new State();
         }
 
         State state = posData.leaving;
-        for(int byteIDX=0;byteIDX<term2.length;byteIDX++) {
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java	(revision 511bf50bbfe670375d519588cdd88e249c9e1441)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java	(revision )
@@ -15,16 +15,15 @@
  *  See the License for the specific language governing permissions and
  *  limitations under the License.
  */
-import java.io.FileOutputStream;
+
 import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamToAutomaton;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
@@ -33,6 +32,7 @@
 import org.apache.lucene.util.automaton.BasicOperations;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;
 import org.apache.lucene.util.automaton.SpecialOperations;
+import org.apache.lucene.util.automaton.UTF32ToUTF8;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PairOutputs.Pair;
 
@@ -54,6 +54,9 @@
 *  #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be
 *  edited.  We allow up to 1 (@link
 *  #DEFAULT_MAX_EDITS} edit.
+ *  If the {@link #unicodeAware} parameter in the constructor is set to true, maxEdits,
+ *  minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code
+ *  points (actual letters) instead of bytes.
 *
 * <p>
 * NOTE: This suggester does not boost
@@ -72,7 +75,13 @@
   private final boolean transpositions;
   private final int nonFuzzyPrefix;
   private final int minFuzzyLength;
+  private final boolean unicodeAware;
 
+  /** Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
+   *  parameters in Unicode code points (actual letters)
+   *  instead of bytes. */
+  public static final boolean DEFAULT_UNICODE_AWARE = false;
+
   /**
    * The default minimum length of the key passed to {@link
    * #lookup} before any edits are allowed.
@@ -114,10 +123,21 @@
   */
  public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
    this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,
-        DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);
+        DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE);
  }

  /**
+   * Creates a {@link FuzzySuggester} instance, defaulting to {@link #DEFAULT_UNICODE_AWARE}.
+   */
+  public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
+                        int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
+                        int maxEdits, boolean transpositions, int nonFuzzyPrefix,
+                        int minFuzzyLength) {
+    this(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, maxEdits,
+         transpositions, nonFuzzyPrefix, minFuzzyLength, DEFAULT_UNICODE_AWARE);
+  }
+
+  /**
   * Creates a {@link FuzzySuggester} instance.
   *
   * @param indexAnalyzer Analyzer that will be used for
@@ -138,11 +158,12 @@
   *        Levenshtein algorithm.
   * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}
   * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
+  * @param unicodeAware whether to measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix in Unicode code points (actual letters) instead of bytes
   */
  public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
                        int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                        int maxEdits, boolean transpositions, int nonFuzzyPrefix,
-                       int minFuzzyLength) {
+                       int minFuzzyLength, boolean unicodeAware) {
    super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
    if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
      throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
@@ -158,6 +179,7 @@
    this.transpositions = transpositions;
    this.nonFuzzyPrefix = nonFuzzyPrefix;
    this.minFuzzyLength = minFuzzyLength;
+    this.unicodeAware = unicodeAware;
  }

  @Override
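For reference, a hedged construction sketch using the widened constructor; the analyzer is a placeholder and the tuning values mirror the new tests rather than recommended settings:

    Analyzer a = new StandardAnalyzer(Version.LUCENE_43);
    FuzzySuggester suggester = new FuzzySuggester(a, a,
        FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP,  // options
        256,   // maxSurfaceFormsPerAnalyzedForm
        -1,    // maxGraphExpansions (unlimited)
        FuzzySuggester.DEFAULT_MAX_EDITS,
        FuzzySuggester.DEFAULT_TRANSPOSITIONS,
        FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX,
        FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH,
        true); // unicodeAware: count edits in code points, not UTF-8 bytes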
@@ -176,7 +198,7 @@
    // "compete") ... in which case I think the wFST needs
    // to be log weights or something ...
 
-    Automaton levA = toLevenshteinAutomata(lookupAutomaton);
+    Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
    /*
      Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
      w.write(levA.toDot());
@@ -186,6 +208,24 @@
    return FSTUtil.intersectPrefixPaths(levA, fst);
  }

+  @Override
+  protected Automaton convertAutomaton(Automaton a) {
+    if (unicodeAware) {
+      Automaton utf8automaton = new UTF32ToUTF8().convert(a);
+      BasicOperations.determinize(utf8automaton);
+      return utf8automaton;
+    } else {
+      return a;
+    }
+  }
+
+  @Override
+  TokenStreamToAutomaton getTokenStreamToAutomaton() {
+    final TokenStreamToAutomaton tsta = super.getTokenStreamToAutomaton();
+    tsta.setUnicodeArcs(unicodeAware);
+    return tsta;
+  }
+
  Automaton toLevenshteinAutomata(Automaton automaton) {
    final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
    Automaton subs[] = new Automaton[ref.size()];
@@ -203,7 +243,7 @@
      // to allow the trailing dedup bytes to be
      // edited... but then 0 byte is "in general" allowed
      // on input (but not in UTF8).
-      LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
+      LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
      Automaton levAutomaton = lev.toAutomaton(maxEdits);
      Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
      combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
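To make the new flow concrete: with unicodeAware set, the Levenshtein automaton is built over code points and only afterwards projected onto UTF-8 bytes so that it can be intersected with the byte-based FST. A standalone sketch of that projection, under the same Lucene automaton APIs the patch uses (the term and maxEdits value are illustrative):

    int[] term = new int[] {'ф', 'у', 'у'};  // code points of the analyzed form
    Automaton lev = new LevenshteinAutomata(term, Character.MAX_CODE_POINT, true)
        .toAutomaton(1);                     // accepts strings within one edit
    Automaton utf8 = new UTF32ToUTF8().convert(lev);  // code-point arcs -> byte arcs
    BasicOperations.determinize(utf8);       // as convertAutomaton does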