Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP <+>package org.apache.lucene.search.suggest.analyzing;\n\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport java.io.File;\nimport java.io.IOException;\nimport java.io.InputStream;\nimport java.io.OutputStream;\nimport java.io.StringReader;\nimport java.util.ArrayList;\nimport java.util.Collections;\nimport java.util.Comparator;\nimport java.util.HashSet;\nimport java.util.List;\nimport java.util.Set;\n\nimport org.apache.lucene.analysis.Analyzer;\nimport org.apache.lucene.analysis.TokenStream;\nimport org.apache.lucene.analysis.TokenStreamToAutomaton;\nimport org.apache.lucene.search.spell.TermFreqIterator;\nimport org.apache.lucene.search.spell.TermFreqPayloadIterator;\nimport org.apache.lucene.search.suggest.Lookup;\nimport org.apache.lucene.search.suggest.Sort;\nimport org.apache.lucene.store.ByteArrayDataInput;\nimport org.apache.lucene.store.ByteArrayDataOutput;\nimport org.apache.lucene.store.DataInput;\nimport org.apache.lucene.store.DataOutput;\nimport org.apache.lucene.store.InputStreamDataInput;\nimport org.apache.lucene.store.OutputStreamDataOutput;\nimport org.apache.lucene.util.ArrayUtil;\nimport org.apache.lucene.util.BytesRef;\nimport org.apache.lucene.util.CharsRef;\nimport org.apache.lucene.util.IOUtils;\nimport org.apache.lucene.util.IntsRef;\nimport org.apache.lucene.util.UnicodeUtil;\nimport org.apache.lucene.util.automaton.Automaton;\nimport org.apache.lucene.util.automaton.BasicOperations;\nimport org.apache.lucene.util.automaton.SpecialOperations;\nimport org.apache.lucene.util.automaton.State;\nimport org.apache.lucene.util.automaton.Transition;\nimport org.apache.lucene.util.fst.Builder;\nimport org.apache.lucene.util.fst.ByteSequenceOutputs;\nimport org.apache.lucene.util.fst.FST.BytesReader;\nimport org.apache.lucene.util.fst.FST;\nimport org.apache.lucene.util.fst.PairOutputs.Pair;\nimport org.apache.lucene.util.fst.PairOutputs;\nimport org.apache.lucene.util.fst.PositiveIntOutputs;\nimport org.apache.lucene.util.fst.Util.MinResult;\nimport org.apache.lucene.util.fst.Util;\n\n/**\n * Suggester that first analyzes the surface form, adds the\n * analyzed form to a weighted FST, and then does the same\n * thing at lookup time. This means lookup is based on the\n * analyzed form while suggestions are still the surface\n * form(s).\n *\n *
<p>
\n * This can result in powerful suggester functionality. For\n * example, if you use an analyzer removing stop words, \n * then the partial text \"ghost chr...\" could see the\n * suggestion \"The Ghost of Christmas Past\". Note that\n * position increments MUST NOT be preserved for this example\n * to work, so you should call\n * {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}.\n *\n *
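<p>\n * A minimal usage sketch of that configuration (illustration only; the stop-word analyzer is assumed, and TermFreq/TermFreqArrayIterator are the helper classes used by the tests in this patch):\n * <pre>{@code\n * Analyzer analyzer = ...; // some analyzer whose chain removes stop words\n * TermFreq[] keys = new TermFreq[] { new TermFreq(\"The Ghost of Christmas Past\", 50) };\n * AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer);\n * suggester.setPreservePositionIncrements(false); // needed for the stop-word example above\n * suggester.build(new TermFreqArrayIterator(keys));\n * List<LookupResult> results = suggester.lookup(\"ghost chr\", false, 5);\n * }</pre>\n *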
<p>
\n * If SynonymFilter is used to map wifi and wireless network to\n * hotspot then the partial text \"wirele...\" could suggest\n * \"wifi router\". Token normalization like stemmers, accent\n * removal, etc., would allow suggestions to ignore such\n * variations.\n *\n *
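<p>\n * As a sketch (the synonym-producing analyzer is assumed, not provided by this class; the options flags and constructor used are the ones declared below):\n * <pre>{@code\n * Analyzer synonyms = ...; // index/query analyzer whose chain includes SynonymFilter\n * AnalyzingSuggester suggester = new AnalyzingSuggester(synonyms, synonyms,\n *     AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP,\n *     256, -1); // -1: no limit on graph expansions from the synonym graph\n * suggester.build(new TermFreqArrayIterator(new TermFreq[] { new TermFreq(\"wifi router\", 10) }));\n * List<LookupResult> results = suggester.lookup(\"wirele\", false, 1); // per the example above\n * }</pre>\n *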
<p>
\n * When two matching suggestions have the same weight, they\n * are tie-broken by the analyzed form. If their analyzed\n * form is the same then the order is undefined.\n *\n *
<p>
\n * There are some limitations:\n *

\n * \n * @lucene.experimental\n */\npublic class AnalyzingSuggester extends Lookup {\n \n /**\n * FST: \n * input is the analyzed form, with a null byte between terms\n * weights are encoded as costs: (Integer.MAX_VALUE-weight)\n * surface is the original, unanalyzed form.\n */\n private FST> fst = null;\n \n /** \n * Analyzer that will be used for analyzing suggestions at\n * index time.\n */\n private final Analyzer indexAnalyzer;\n\n /** \n * Analyzer that will be used for analyzing suggestions at\n * query time.\n */\n private final Analyzer queryAnalyzer;\n \n /** \n * True if exact match suggestions should always be returned first.\n */\n private final boolean exactFirst;\n \n /** \n * True if separator between tokens should be preserved.\n */\n private final boolean preserveSep;\n\n /** Include this flag in the options parameter to {@link\n * #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)} to always\n * return the exact match first, regardless of score. This\n * has no performance impact but could result in\n * low-quality suggestions. */\n public static final int EXACT_FIRST = 1;\n\n /** Include this flag in the options parameter to {@link\n * #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)} to preserve\n * token separators when matching. */\n public static final int PRESERVE_SEP = 2;\n\n /** Represents the separation between tokens, if\n * PRESERVE_SEP was specified */\n private static final int SEP_LABEL = 0xff;\n\n /** Marks end of the analyzed input and start of dedup\n * byte. */\n private static final int END_BYTE = 0x0;\n\n /** Maximum number of dup surface forms (different surface\n * forms for the same analyzed form). */\n private final int maxSurfaceFormsPerAnalyzedForm;\n\n /** Maximum graph paths to index for a single analyzed\n * surface form. This only matters if your analyzer\n * makes lots of alternate paths (e.g. contains\n * SynonymFilter). */\n private final int maxGraphExpansions;\n\n /** Highest number of analyzed paths we saw for any single\n * input surface form. For analyzers that never create\n * graphs this will always be 1. */\n private int maxAnalyzedPathsForOneInput;\n\n private boolean hasPayloads;\n\n private static final int PAYLOAD_SEP = '\\u001f';\n\n /** Whether position holes should appear in the automaton. 
*/\n private boolean preservePositionIncrements;\n\n /**\n * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)\n * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |\n * PRESERVE_SEP, 256, -1)}\n */\n public AnalyzingSuggester(Analyzer analyzer) {\n this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1);\n }\n\n /**\n * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)\n * AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST |\n * PRESERVE_SEP, 256, -1)}\n */\n public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {\n this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1);\n }\n\n /**\n * Creates a new suggester.\n * \n * @param indexAnalyzer Analyzer that will be used for\n * analyzing suggestions while building the index.\n * @param queryAnalyzer Analyzer that will be used for\n * analyzing query text during lookup\n * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}\n * @param maxSurfaceFormsPerAnalyzedForm Maximum number of\n * surface forms to keep for a single analyzed form.\n * When there are too many surface forms we discard the\n * lowest weighted ones.\n * @param maxGraphExpansions Maximum number of graph paths\n * to expand from the analyzed form. Set this to -1 for\n * no limit.\n */\n public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions) {\n this.indexAnalyzer = indexAnalyzer;\n this.queryAnalyzer = queryAnalyzer;\n if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {\n throw new IllegalArgumentException(\"options should only contain EXACT_FIRST and PRESERVE_SEP; got \" + options);\n }\n this.exactFirst = (options & EXACT_FIRST) != 0;\n this.preserveSep = (options & PRESERVE_SEP) != 0;\n\n // NOTE: this is just an implementation limitation; if\n // somehow this is a problem we could fix it by using\n // more than one byte to disambiguate ... but 256 seems\n // like it should be way more then enough.\n if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {\n throw new IllegalArgumentException(\"maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: \" + maxSurfaceFormsPerAnalyzedForm + \")\");\n }\n this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;\n\n if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {\n throw new IllegalArgumentException(\"maxGraphExpansions must -1 (no limit) or > 0 (got: \" + maxGraphExpansions + \")\");\n }\n this.maxGraphExpansions = maxGraphExpansions;\n preservePositionIncrements = true;\n }\n\n /** Whether to take position holes (position increment > 1) into account when\n * building the automaton, true by default. */\n public void setPreservePositionIncrements(boolean preservePositionIncrements) {\n this.preservePositionIncrements = preservePositionIncrements;\n }\n\n /** Returns byte size of the underlying FST. */\n public long sizeInBytes() {\n return fst == null ? 
0 : fst.sizeInBytes();\n }\n\n private void copyDestTransitions(State from, State to, List transitions) {\n if (to.isAccept()) {\n from.setAccept(true);\n }\n for(Transition t : to.getTransitions()) {\n transitions.add(t);\n }\n }\n\n // Replaces SEP with epsilon or remaps them if\n // we were asked to preserve them:\n private void replaceSep(Automaton a) {\n\n State[] states = a.getNumberedStates();\n\n // Go in reverse topo sort so we know we only have to\n // make one pass:\n for(int stateNumber=states.length-1;stateNumber >=0;stateNumber--) {\n final State state = states[stateNumber];\n List newTransitions = new ArrayList();\n for(Transition t : state.getTransitions()) {\n assert t.getMin() == t.getMax();\n if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {\n if (preserveSep) {\n // Remap to SEP_LABEL:\n newTransitions.add(new Transition(SEP_LABEL, t.getDest()));\n } else {\n copyDestTransitions(state, t.getDest(), newTransitions);\n a.setDeterministic(false);\n }\n } else if (t.getMin() == TokenStreamToAutomaton.HOLE) {\n\n // Just remove the hole: there will then be two\n // SEP tokens next to each other, which will only\n // match another hole at search time. Note that\n // it will also match an empty-string token ... if\n // that's somehow a problem we can always map HOLE\n // to a dedicated byte (and escape it in the\n // input).\n copyDestTransitions(state, t.getDest(), newTransitions);\n a.setDeterministic(false);\n } else {\n newTransitions.add(t);\n }\n }\n state.setTransitions(newTransitions.toArray(new Transition[newTransitions.size()]));\n }\n }\n\n /** Just escapes the 0xff byte (which we still for SEP). */\n private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {\n\n final BytesRef spare = new BytesRef();\n\n @Override\n protected BytesRef changeToken(BytesRef in) {\n int upto = 0;\n for(int i=0;i {\n\n private final boolean hasPayloads;\n\n public AnalyzingComparator(boolean hasPayloads) {\n this.hasPayloads = hasPayloads;\n }\n\n private final ByteArrayDataInput readerA = new ByteArrayDataInput();\n private final ByteArrayDataInput readerB = new ByteArrayDataInput();\n private final BytesRef scratchA = new BytesRef();\n private final BytesRef scratchB = new BytesRef();\n\n @Override\n public int compare(BytesRef a, BytesRef b) {\n\n // First by analyzed form:\n readerA.reset(a.bytes, a.offset, a.length);\n scratchA.length = readerA.readShort();\n scratchA.bytes = a.bytes;\n scratchA.offset = readerA.getPosition();\n\n readerB.reset(b.bytes, b.offset, b.length);\n scratchB.bytes = b.bytes;\n scratchB.length = readerB.readShort();\n scratchB.offset = readerB.getPosition();\n\n int cmp = scratchA.compareTo(scratchB);\n if (cmp != 0) {\n return cmp;\n }\n\n // Next by cost:\n long aCost = readerA.readInt();\n long bCost = readerB.readInt();\n\n if (aCost < bCost) {\n return -1;\n } else if (aCost > bCost) {\n return 1;\n }\n\n // Finally by surface form:\n if (hasPayloads) {\n readerA.setPosition(readerA.getPosition() + scratchA.length);\n scratchA.length = readerA.readShort();\n scratchA.offset = readerA.getPosition();\n readerB.setPosition(readerB.getPosition() + scratchB.length);\n scratchB.length = readerB.readShort();\n scratchB.offset = readerB.getPosition();\n } else {\n scratchA.offset = readerA.getPosition();\n scratchA.length = a.length - scratchA.offset;\n scratchB.offset = readerB.getPosition();\n scratchB.length = b.length - scratchB.offset;\n }\n\n cmp = scratchA.compareTo(scratchB);\n if (cmp != 0) {\n return cmp;\n }\n\n 
return 0;\n }\n };\n\n @Override\n public void build(TermFreqIterator iterator) throws IOException {\n String prefix = getClass().getSimpleName();\n File directory = Sort.defaultTempDir();\n File tempInput = File.createTempFile(prefix, \".input\", directory);\n File tempSorted = File.createTempFile(prefix, \".sorted\", directory);\n\n TermFreqPayloadIterator payloads;\n if (iterator instanceof TermFreqPayloadIterator) {\n payloads = (TermFreqPayloadIterator) iterator;\n } else {\n payloads = null;\n }\n hasPayloads = payloads != null;\n\n Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);\n Sort.ByteSequencesReader reader = null;\n BytesRef scratch = new BytesRef();\n\n TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();\n\n boolean success = false;\n byte buffer[] = new byte[8];\n try {\n ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);\n BytesRef surfaceForm;\n\n while ((surfaceForm = iterator.next()) != null) {\n Set paths = toFiniteStrings(surfaceForm, ts2a);\n \n maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());\n\n for (IntsRef path : paths) {\n\n Util.toBytesRef(path, scratch);\n \n // length of the analyzed text (FST input)\n if (scratch.length > Short.MAX_VALUE-2) {\n throw new IllegalArgumentException(\"cannot handle analyzed forms > \" + (Short.MAX_VALUE-2) + \" in length (got \" + scratch.length + \")\");\n }\n short analyzedLength = (short) scratch.length;\n\n // compute the required length:\n // analyzed sequence + weight (4) + surface + analyzedLength (short)\n int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;\n\n BytesRef payload;\n\n if (hasPayloads) {\n if (surfaceForm.length > (Short.MAX_VALUE-2)) {\n throw new IllegalArgumentException(\"cannot handle surface form > \" + (Short.MAX_VALUE-2) + \" in length (got \" + surfaceForm.length + \")\");\n }\n payload = payloads.payload();\n // payload + surfaceLength (short)\n requiredLength += payload.length + 2;\n } else {\n payload = null;\n }\n \n buffer = ArrayUtil.grow(buffer, requiredLength);\n \n output.reset(buffer);\n\n output.writeShort(analyzedLength);\n\n output.writeBytes(scratch.bytes, scratch.offset, scratch.length);\n\n output.writeInt(encodeWeight(iterator.weight()));\n\n if (hasPayloads) {\n for(int i=0;i outputs = new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());\n Builder> builder = new Builder>(FST.INPUT_TYPE.BYTE1, outputs);\n\n // Build FST:\n BytesRef previousAnalyzed = null;\n BytesRef analyzed = new BytesRef();\n BytesRef surface = new BytesRef();\n IntsRef scratchInts = new IntsRef();\n ByteArrayDataInput input = new ByteArrayDataInput();\n\n // Used to remove duplicate surface forms (but we\n // still index the hightest-weight one). 
We clear\n // this when we see a new analyzed form, so it cannot\n // grow unbounded (at most 256 entries):\n Set seenSurfaceForms = new HashSet();\n\n int dedup = 0;\n while (reader.read(scratch)) {\n input.reset(scratch.bytes, scratch.offset, scratch.length);\n short analyzedLength = input.readShort();\n analyzed.grow(analyzedLength+2);\n input.readBytes(analyzed.bytes, 0, analyzedLength);\n analyzed.length = analyzedLength;\n\n long cost = input.readInt();\n\n surface.bytes = scratch.bytes;\n if (hasPayloads) {\n surface.length = input.readShort();\n surface.offset = input.getPosition();\n } else {\n surface.offset = input.getPosition();\n surface.length = scratch.length - surface.offset;\n }\n \n if (previousAnalyzed == null) {\n previousAnalyzed = new BytesRef();\n previousAnalyzed.copyBytes(analyzed);\n seenSurfaceForms.add(BytesRef.deepCopyOf(surface));\n } else if (analyzed.equals(previousAnalyzed)) {\n dedup++;\n if (dedup >= maxSurfaceFormsPerAnalyzedForm) {\n // More than maxSurfaceFormsPerAnalyzedForm\n // dups: skip the rest:\n continue;\n }\n if (seenSurfaceForms.contains(surface)) {\n continue;\n }\n seenSurfaceForms.add(BytesRef.deepCopyOf(surface));\n } else {\n dedup = 0;\n previousAnalyzed.copyBytes(analyzed);\n seenSurfaceForms.clear();\n seenSurfaceForms.add(BytesRef.deepCopyOf(surface));\n }\n\n // TODO: I think we can avoid the extra 2 bytes when\n // there is no dup (dedup==0), but we'd have to fix\n // the exactFirst logic ... which would be sort of\n // hairy because we'd need to special case the two\n // (dup/not dup)...\n\n // NOTE: must be byte 0 so we sort before whatever\n // is next\n analyzed.bytes[analyzed.offset+analyzed.length] = 0;\n analyzed.bytes[analyzed.offset+analyzed.length+1] = (byte) dedup;\n analyzed.length += 2;\n\n Util.toIntsRef(analyzed, scratchInts);\n //System.out.println(\"ADD: \" + scratchInts + \" -> \" + cost + \": \" + surface.utf8ToString());\n if (!hasPayloads) {\n builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface)));\n } else {\n int payloadOffset = input.getPosition() + surface.length;\n int payloadLength = scratch.length - payloadOffset;\n BytesRef br = new BytesRef(surface.length + 1 + payloadLength);\n System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);\n br.bytes[surface.length] = PAYLOAD_SEP;\n System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);\n br.length = br.bytes.length;\n builder.add(scratchInts, outputs.newPair(cost, br));\n }\n }\n fst = builder.finish();\n\n //Util.dotToFile(fst, \"/tmp/suggest.dot\");\n \n success = true;\n } finally {\n if (success) {\n IOUtils.close(reader, writer);\n } else {\n IOUtils.closeWhileHandlingException(reader, writer);\n }\n \n tempInput.delete();\n tempSorted.delete();\n }\n }\n\n @Override\n public boolean store(OutputStream output) throws IOException {\n DataOutput dataOut = new OutputStreamDataOutput(output);\n try {\n if (fst == null) {\n return false;\n }\n\n fst.save(dataOut);\n dataOut.writeVInt(maxAnalyzedPathsForOneInput);\n dataOut.writeByte((byte) (hasPayloads ? 
1 : 0));\n } finally {\n IOUtils.close(output);\n }\n return true;\n }\n\n @Override\n public boolean load(InputStream input) throws IOException {\n DataInput dataIn = new InputStreamDataInput(input);\n try {\n this.fst = new FST>(dataIn, new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));\n maxAnalyzedPathsForOneInput = dataIn.readVInt();\n hasPayloads = dataIn.readByte() == 1;\n } finally {\n IOUtils.close(input);\n }\n return true;\n }\n\n private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRef spare) {\n LookupResult result;\n if (hasPayloads) {\n int sepIndex = -1;\n for(int i=0;i= output2.length) {\n return false;\n }\n for(int i=0;i lookup(final CharSequence key, boolean onlyMorePopular, int num) {\n assert num > 0;\n\n if (onlyMorePopular) {\n throw new IllegalArgumentException(\"this suggester only works with onlyMorePopular=false\");\n }\n if (fst == null) {\n return Collections.emptyList();\n }\n\n //System.out.println(\"lookup key=\" + key + \" num=\" + num);\n final BytesRef utf8Key = new BytesRef(key);\n try {\n\n Automaton lookupAutomaton = toLookupAutomaton(key);\n\n final CharsRef spare = new CharsRef();\n\n //System.out.println(\" now intersect exactFirst=\" + exactFirst);\n \n // Intersect automaton w/ suggest wFST and get all\n // prefix starting nodes & their outputs:\n //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);\n\n //System.out.println(\" prefixPaths: \" + prefixPaths.size());\n\n BytesReader bytesReader = fst.getBytesReader();\n\n FST.Arc> scratchArc = new FST.Arc>();\n\n final List results = new ArrayList();\n\n List>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);\n\n if (exactFirst) {\n\n int count = 0;\n for (FSTUtil.Path> path : prefixPaths) {\n if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {\n // This node has END_BYTE arc leaving, meaning it's an\n // \"exact\" match:\n count++;\n }\n }\n\n // Searcher just to find the single exact only\n // match, if present:\n Util.TopNSearcher> searcher;\n searcher = new Util.TopNSearcher>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);\n\n // NOTE: we could almost get away with only using\n // the first start node. The only catch is if\n // maxSurfaceFormsPerAnalyzedForm had kicked in and\n // pruned our exact match from one of these nodes\n // ...:\n for (FSTUtil.Path> path : prefixPaths) {\n if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {\n // This node has END_BYTE arc leaving, meaning it's an\n // \"exact\" match:\n searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);\n }\n }\n\n MinResult> completions[] = searcher.search();\n\n // NOTE: this is rather inefficient: we enumerate\n // every matching \"exactly the same analyzed form\"\n // path, and then do linear scan to see if one of\n // these exactly matches the input. It should be\n // possible (though hairy) to do something similar\n // to getByOutput, since the surface form is encoded\n // into the FST output, so we more efficiently hone\n // in on the exact surface-form match. 
Still, I\n // suspect very little time is spent in this linear\n // seach: it's bounded by how many prefix start\n // nodes we have and the\n // maxSurfaceFormsPerAnalyzedForm:\n for(MinResult> completion : completions) {\n BytesRef output2 = completion.output.output2;\n if (sameSurfaceForm(utf8Key, output2)) {\n results.add(getLookupResult(completion.output.output1, output2, spare));\n break;\n }\n }\n\n if (results.size() == num) {\n // That was quick:\n return results;\n }\n }\n\n Util.TopNSearcher> searcher;\n searcher = new Util.TopNSearcher>(fst,\n num - results.size(),\n num * maxAnalyzedPathsForOneInput,\n weightComparator) {\n private final Set seen = new HashSet();\n\n @Override\n protected boolean acceptResult(IntsRef input, Pair output) {\n\n // Dedup: when the input analyzes to a graph we\n // can get duplicate surface forms:\n if (seen.contains(output.output2)) {\n return false;\n }\n seen.add(output.output2);\n \n if (!exactFirst) {\n return true;\n } else {\n // In exactFirst mode, don't accept any paths\n // matching the surface form since that will\n // create duplicate results:\n if (sameSurfaceForm(utf8Key, output.output2)) {\n // We found exact match, which means we should\n // have already found it in the first search:\n assert results.size() == 1;\n return false;\n } else {\n return true;\n }\n }\n }\n };\n\n prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);\n \n for (FSTUtil.Path> path : prefixPaths) {\n searcher.addStartPaths(path.fstNode, path.output, true, path.input);\n }\n\n MinResult> completions[] = searcher.search();\n\n for(MinResult> completion : completions) {\n\n LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);\n\n // TODO: for fuzzy case would be nice to return\n // how many edits were required\n\n //System.out.println(\" result=\" + result);\n results.add(result);\n\n if (results.size() == num) {\n // In the exactFirst=true case the search may\n // produce one extra path\n break;\n }\n }\n\n return results;\n } catch (IOException bogus) {\n throw new RuntimeException(bogus);\n }\n }\n\n /** Returns all prefix paths to initialize the search. 
*/\n protected List>> getFullPrefixPaths(List>> prefixPaths,\n Automaton lookupAutomaton,\n FST> fst)\n throws IOException {\n return prefixPaths;\n }\n \n final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {\n // Analyze surface form:\n TokenStream ts = indexAnalyzer.tokenStream(\"\", new StringReader(surfaceForm.utf8ToString()));\n\n // Create corresponding automaton: labels are bytes\n // from each analyzed token, with byte 0 used as\n // separator between tokens:\n Automaton automaton = ts2a.toAutomaton(ts);\n ts.close();\n\n replaceSep(automaton);\n\n assert SpecialOperations.isFinite(automaton);\n\n // Get all paths from the automaton (there can be\n // more than one path, eg if the analyzer created a\n // graph using SynFilter or WDF):\n\n // TODO: we could walk & add simultaneously, so we\n // don't have to alloc [possibly biggish]\n // intermediate HashSet in RAM:\n return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);\n }\n\n final Automaton toLookupAutomaton(final CharSequence key) throws IOException {\n // TODO: is there a Reader from a CharSequence?\n // Turn tokenstream into automaton:\n TokenStream ts = queryAnalyzer.tokenStream(\"\", new StringReader(key.toString()));\n Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);\n ts.close();\n\n // TODO: we could use the end offset to \"guess\"\n // whether the final token was a partial token; this\n // would only be a heuristic ... but maybe an OK one.\n // This way we could eg differentiate \"net\" from \"net \",\n // which we can't today...\n\n replaceSep(automaton);\n\n // TODO: we can optimize this somewhat by determinizing\n // while we convert\n BasicOperations.determinize(automaton);\n return automaton;\n }\n \n \n\n /**\n * Returns the weight associated with an input string,\n * or null if it does not exist.\n */\n public Object get(CharSequence key) {\n throw new UnsupportedOperationException();\n }\n \n /** cost -> weight */\n private static int decodeWeight(long encoded) {\n return (int)(Integer.MAX_VALUE - encoded);\n }\n \n /** weight -> cost */\n private static int encodeWeight(long value) {\n if (value < 0 || value > Integer.MAX_VALUE) {\n throw new UnsupportedOperationException(\"cannot encode value: \" + value);\n }\n return Integer.MAX_VALUE - (int)value;\n }\n \n static final Comparator> weightComparator = new Comparator> () {\n @Override\n public int compare(Pair left, Pair right) {\n return left.output1.compareTo(right.output1);\n }\n };\n}\n =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 40d2f78d3aae3e092f459ebe0031e69a04f47b5a) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision ) @@ -32,6 +32,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.analysis.TokenStreamToUnicodeAutomaton; import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqPayloadIterator; import org.apache.lucene.search.suggest.Lookup; @@ -53,6 +54,7 @@ import org.apache.lucene.util.automaton.SpecialOperations; import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.UTF32ToUTF8; import 
org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.FST.BytesReader; @@ -160,7 +162,7 @@ /** Represents the separation between tokens, if * PRESERVE_SEP was specified */ - private static final int SEP_LABEL = 0xff; + private static final int SEP_LABEL = 0x10FFFF; /** Marks end of the analyzed input and start of dedup * byte. */ @@ -307,46 +309,61 @@ } } - /** Just escapes the 0xff byte (which we still for SEP). */ - private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton { + /** Just escapes the 0x10FFFF Unicode code point (which we still use for SEP). */ + private static final class EscapingTokenStreamToUnicodeAutomaton extends TokenStreamToUnicodeAutomaton { final BytesRef spare = new BytesRef(); @Override protected BytesRef changeToken(BytesRef in) { + + final String utf16 = in.utf8ToString(); + final int[] inCodePoints = new int[utf16.codePointCount(0, utf16.length())]; + for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) + inCodePoints[j++] = cp = utf16.codePointAt(i); + int upto = 0; - for(int i=0;i spare.bytes.length) { + spare.grow(spare.bytes.length + 8); } - spare.bytes[upto++] = (byte) 0xff; - spare.bytes[upto++] = b; + spare.bytes[upto++] = (byte) 0xF4; + spare.bytes[upto++] = (byte) 0x8F; + spare.bytes[upto++] = (byte) 0xBF; + spare.bytes[upto++] = (byte) 0xBF; + spare.bytes[upto++] = (byte) 0xF4; + spare.bytes[upto++] = (byte) 0x8F; + spare.bytes[upto++] = (byte) 0xBF; + spare.bytes[upto++] = (byte) 0xBF; } else { - if (spare.bytes.length == upto) { - spare.grow(upto+1); + byte[] chars = new String(Character.toChars(inCodePoint)).getBytes(); + if (upto + chars.length > spare.bytes.length) { + spare.grow(spare.bytes.length + chars.length); } - spare.bytes[upto++] = b; + for (int k = 0; k < chars.length; k++) { + spare.bytes[upto++] = chars[k]; - } - } + } + } + } spare.offset = 0; spare.length = upto; return spare; } } - TokenStreamToAutomaton getTokenStreamToAutomaton() { - final TokenStreamToAutomaton tsta; + TokenStreamToUnicodeAutomaton getTokenStreamToUnicodeAutomaton() { + final TokenStreamToUnicodeAutomaton tstua; if (preserveSep) { - tsta = new EscapingTokenStreamToAutomaton(); + tstua = new EscapingTokenStreamToUnicodeAutomaton(); } else { - // When we're not preserving sep, we don't steal 0xff + // When we're not preserving sep, we don't steal 0x10FFFF // byte, so we don't need to do any escaping: - tsta = new TokenStreamToAutomaton(); + tstua = new TokenStreamToUnicodeAutomaton(); } - tsta.setPreservePositionIncrements(preservePositionIncrements); - return tsta; + tstua.setPreservePositionIncrements(preservePositionIncrements); + return tstua; } private static class AnalyzingComparator implements Comparator { @@ -434,7 +451,7 @@ Sort.ByteSequencesReader reader = null; BytesRef scratch = new BytesRef(); - TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton(); + TokenStreamToUnicodeAutomaton ts2ua = getTokenStreamToUnicodeAutomaton(); boolean success = false; byte buffer[] = new byte[8]; @@ -443,7 +460,7 @@ BytesRef surfaceForm; while ((surfaceForm = iterator.next()) != null) { - Set paths = toFiniteStrings(surfaceForm, ts2a); + Set paths = toFiniteStrings(surfaceForm, ts2ua); maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size()); @@ -704,6 +721,8 @@ try { Automaton lookupAutomaton = toLookupAutomaton(key); + Automaton utf8lookupAutomaton = new UTF32ToUTF8().convert(lookupAutomaton); + 
BasicOperations.determinize(utf8lookupAutomaton); final CharsRef spare = new CharsRef(); @@ -721,7 +740,7 @@ final List results = new ArrayList(); - List>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst); + List>> prefixPaths = FSTUtil.intersectPrefixPaths(utf8lookupAutomaton, fst); if (exactFirst) { @@ -854,18 +873,21 @@ return prefixPaths; } - final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException { + final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToUnicodeAutomaton ts2ua) throws IOException { // Analyze surface form: TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString())); - // Create corresponding automaton: labels are bytes - // from each analyzed token, with byte 0 used as + // Create corresponding automaton: labels are Unicode code points + // from each analyzed token, with code point 0 used as // separator between tokens: - Automaton automaton = ts2a.toAutomaton(ts); + Automaton unicodeAutomaton = ts2ua.toAutomaton(ts); ts.close(); - replaceSep(automaton); + replaceSep(unicodeAutomaton); + Automaton automaton = new UTF32ToUTF8().convert(unicodeAutomaton); + BasicOperations.determinize(automaton); + assert SpecialOperations.isFinite(automaton); // Get all paths from the automaton (there can be @@ -882,7 +904,7 @@ // TODO: is there a Reader from a CharSequence? // Turn tokenstream into automaton: TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString())); - Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts); + Automaton unicodeAutomaton = (getTokenStreamToUnicodeAutomaton()).toAutomaton(ts); ts.close(); // TODO: we could use the end offset to "guess" @@ -891,12 +913,12 @@ // This way we could eg differentiate "net" from "net ", // which we can't today... - replaceSep(automaton); + replaceSep(unicodeAutomaton); // TODO: we can optimize this somewhat by determinizing // while we convert - BasicOperations.determinize(automaton); - return automaton; + BasicOperations.determinize(unicodeAutomaton); + return unicodeAutomaton; } Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP <+>package org.apache.lucene.search.suggest.analyzing;\n\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. 
You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport java.io.IOException;\nimport java.io.Reader;\nimport java.util.ArrayList;\nimport java.util.Arrays;\nimport java.util.Collections;\nimport java.util.Comparator;\nimport java.util.HashSet;\nimport java.util.List;\nimport java.util.Set;\nimport java.util.TreeSet;\n\nimport org.apache.lucene.analysis.Analyzer;\nimport org.apache.lucene.analysis.CannedTokenStream;\nimport org.apache.lucene.analysis.MockAnalyzer;\nimport org.apache.lucene.analysis.MockTokenFilter;\nimport org.apache.lucene.analysis.MockTokenizer;\nimport org.apache.lucene.analysis.Token;\nimport org.apache.lucene.analysis.TokenFilter;\nimport org.apache.lucene.analysis.TokenStream;\nimport org.apache.lucene.analysis.TokenStreamToAutomaton;\nimport org.apache.lucene.analysis.Tokenizer;\nimport org.apache.lucene.analysis.tokenattributes.CharTermAttribute;\nimport org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;\nimport org.apache.lucene.search.suggest.Lookup.LookupResult;\nimport org.apache.lucene.search.suggest.TermFreq;\nimport org.apache.lucene.search.suggest.TermFreqArrayIterator;\nimport org.apache.lucene.util.BytesRef;\nimport org.apache.lucene.util.IntsRef;\nimport org.apache.lucene.util.LuceneTestCase;\nimport org.apache.lucene.util._TestUtil;\nimport org.apache.lucene.util.automaton.Automaton;\nimport org.apache.lucene.util.automaton.State;\nimport org.apache.lucene.util.fst.Util;\n\npublic class FuzzySuggesterTest extends LuceneTestCase {\n \n public void testRandomEdits() throws IOException {\n List keys = new ArrayList();\n int numTerms = atLeast(100);\n for (int i = 0; i < numTerms; i++) {\n keys.add(new TermFreq(\"boo\" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));\n }\n keys.add(new TermFreq(\"foo bar boo far\", 12));\n FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));\n suggester.build(new TermFreqArrayIterator(keys));\n int numIters = atLeast(10);\n for (int i = 0; i < numIters; i++) {\n String addRandomEdit = addRandomEdit(\"foo bar boo\", FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX);\n List results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);\n assertEquals(addRandomEdit, 1, results.size());\n assertEquals(\"foo bar boo far\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F); \n }\n }\n \n /** this is basically the WFST test ported to KeywordAnalyzer. 
so it acts the same */\n public void testKeyword() throws Exception {\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"foo\", 50),\n new TermFreq(\"bar\", 10),\n new TermFreq(\"barbar\", 12),\n new TermFreq(\"barbara\", 6)\n };\n \n FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));\n suggester.build(new TermFreqArrayIterator(keys));\n \n List results = suggester.lookup(_TestUtil.stringToCharSequence(\"bariar\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n \n results = suggester.lookup(_TestUtil.stringToCharSequence(\"barbr\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n \n results = suggester.lookup(_TestUtil.stringToCharSequence(\"barbara\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbara\", results.get(0).key.toString());\n assertEquals(6, results.get(0).value, 0.01F);\n \n results = suggester.lookup(_TestUtil.stringToCharSequence(\"barbar\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"barbara\", results.get(1).key.toString());\n assertEquals(6, results.get(1).value, 0.01F);\n \n results = suggester.lookup(_TestUtil.stringToCharSequence(\"barbaa\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"barbara\", results.get(1).key.toString());\n assertEquals(6, results.get(1).value, 0.01F);\n \n // top N of 2, but only foo is available\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"f\", random()), false, 2);\n assertEquals(1, results.size());\n assertEquals(\"foo\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n \n // top N of 1 for 'bar': we return this even though\n // barbar is higher because exactFirst is enabled:\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"bar\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"bar\", results.get(0).key.toString());\n assertEquals(10, results.get(0).value, 0.01F);\n \n // top N Of 2 for 'b'\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"b\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"bar\", results.get(1).key.toString());\n assertEquals(10, results.get(1).value, 0.01F);\n \n // top N of 3 for 'ba'\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"ba\", random()), false, 3);\n assertEquals(3, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"bar\", results.get(1).key.toString());\n assertEquals(10, results.get(1).value, 0.01F);\n assertEquals(\"barbara\", results.get(2).key.toString());\n assertEquals(6, results.get(2).value, 0.01F);\n }\n \n /**\n * basic \"standardanalyzer\" test with stopword removal\n */\n public void testStandard() throws Exception {\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"the ghost of christmas past\", 50),\n };\n \n Analyzer standard = new MockAnalyzer(random(), 
MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);\n FuzzySuggester suggester = new FuzzySuggester(standard);\n suggester.setPreservePositionIncrements(false);\n suggester.build(new TermFreqArrayIterator(keys));\n \n List results = suggester.lookup(_TestUtil.stringToCharSequence(\"the ghost of chris\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"the ghost of christmas past\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n\n // omit the 'the' since its a stopword, its suggested anyway\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"ghost of chris\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"the ghost of christmas past\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n\n // omit the 'the' and 'of' since they are stopwords, its suggested anyway\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"ghost chris\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"the ghost of christmas past\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n }\n\n public void testNoSeps() throws Exception {\n TermFreq[] keys = new TermFreq[] {\n new TermFreq(\"ab cd\", 0),\n new TermFreq(\"abcd\", 1),\n };\n\n int options = 0;\n\n Analyzer a = new MockAnalyzer(random());\n FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3);\n suggester.build(new TermFreqArrayIterator(keys));\n // TODO: would be nice if \"ab \" would allow the test to\n // pass, and more generally if the analyzer can know\n // that the user's current query has ended at a word, \n // but, analyzers don't produce SEP tokens!\n List r = suggester.lookup(_TestUtil.stringToCharSequence(\"ab c\", random()), false, 2);\n assertEquals(2, r.size());\n\n // With no PRESERVE_SEPS specified, \"ab c\" should also\n // complete to \"abcd\", which has higher weight so should\n // appear first:\n assertEquals(\"abcd\", r.get(0).key.toString());\n }\n\n public void testGraphDups() throws Exception {\n\n final Analyzer analyzer = new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n int tokenStreamCounter = 0;\n final TokenStream[] tokenStreams = new TokenStream[] {\n new CannedTokenStream(new Token[] {\n token(\"wifi\",1,1),\n token(\"hotspot\",0,2),\n token(\"network\",1,1),\n token(\"is\",1,1),\n token(\"slow\",1,1)\n }),\n new CannedTokenStream(new Token[] {\n token(\"wi\",1,1),\n token(\"hotspot\",0,3),\n token(\"fi\",1,1),\n token(\"network\",1,1),\n token(\"is\",1,1),\n token(\"fast\",1,1)\n\n }),\n new CannedTokenStream(new Token[] {\n token(\"wifi\",1,1),\n token(\"hotspot\",0,2),\n token(\"network\",1,1)\n }),\n };\n\n @Override\n public TokenStream getTokenStream() {\n TokenStream result = tokenStreams[tokenStreamCounter];\n tokenStreamCounter++;\n return result;\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"wifi network is slow\", 50),\n new TermFreq(\"wi fi network is fast\", 10),\n };\n FuzzySuggester suggester = new FuzzySuggester(analyzer);\n suggester.build(new TermFreqArrayIterator(keys));\n \n List results = suggester.lookup(\"wifi network\", false, 10);\n if (VERBOSE) {\n 
System.out.println(\"Results: \" + results);\n }\n assertEquals(2, results.size());\n assertEquals(\"wifi network is slow\", results.get(0).key);\n assertEquals(50, results.get(0).value);\n assertEquals(\"wi fi network is fast\", results.get(1).key);\n assertEquals(10, results.get(1).value);\n }\n\n public void testEmpty() throws Exception {\n FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));\n suggester.build(new TermFreqArrayIterator(new TermFreq[0]));\n\n List result = suggester.lookup(\"a\", false, 20);\n assertTrue(result.isEmpty());\n }\n\n public void testInputPathRequired() throws Exception {\n\n // SynonymMap.Builder b = new SynonymMap.Builder(false);\n // b.add(new CharsRef(\"ab\"), new CharsRef(\"ba\"), true);\n // final SynonymMap map = b.build();\n\n // The Analyzer below mimics the functionality of the SynonymAnalyzer\n // using the above map, so that the suggest module does not need a dependency on the \n // synonym module \n\n final Analyzer analyzer = new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n int tokenStreamCounter = 0;\n final TokenStream[] tokenStreams = new TokenStream[] {\n new CannedTokenStream(new Token[] {\n token(\"ab\",1,1),\n token(\"ba\",0,1),\n token(\"xc\",1,1)\n }),\n new CannedTokenStream(new Token[] {\n token(\"ba\",1,1), \n token(\"xd\",1,1)\n }),\n new CannedTokenStream(new Token[] {\n token(\"ab\",1,1),\n token(\"ba\",0,1),\n token(\"x\",1,1)\n })\n };\n\n @Override\n public TokenStream getTokenStream() {\n TokenStream result = tokenStreams[tokenStreamCounter];\n tokenStreamCounter++;\n return result;\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"ab xc\", 50),\n new TermFreq(\"ba xd\", 50),\n };\n FuzzySuggester suggester = new FuzzySuggester(analyzer);\n suggester.build(new TermFreqArrayIterator(keys));\n List results = suggester.lookup(\"ab x\", false, 1);\n assertTrue(results.size() == 1);\n }\n\n private static Token token(String term, int posInc, int posLength) {\n final Token t = new Token(term, 0, 0);\n t.setPositionIncrement(posInc);\n t.setPositionLength(posLength);\n return t;\n }\n\n /*\n private void printTokens(final Analyzer analyzer, String input) throws IOException {\n System.out.println(\"Tokens for \" + input);\n TokenStream ts = analyzer.tokenStream(\"\", new StringReader(input));\n ts.reset();\n final TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);\n final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);\n final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);\n \n while(ts.incrementToken()) {\n termBytesAtt.fillBytesRef();\n System.out.println(String.format(\"%s,%s,%s\", termBytesAtt.getBytesRef().utf8ToString(), posIncAtt.getPositionIncrement(), posLengthAtt.getPositionLength())); \n }\n ts.end();\n ts.close();\n } \n */ \n\n private final Analyzer getUnusualAnalyzer() {\n return new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n\n int count;\n\n 
@Override\n public TokenStream getTokenStream() {\n // 4th time we are called, return tokens a b,\n // else just a:\n if (count++ != 3) {\n return new CannedTokenStream(new Token[] {\n token(\"a\", 1, 1),\n });\n } else {\n // After that \"a b\":\n return new CannedTokenStream(new Token[] {\n token(\"a\", 1, 1),\n token(\"b\", 1, 1),\n });\n }\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n }\n\n public void testExactFirst() throws Exception {\n\n Analyzer a = getUnusualAnalyzer();\n FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"x y\", 1),\n new TermFreq(\"x y z\", 3),\n new TermFreq(\"x\", 2),\n new TermFreq(\"z z z\", 20),\n }));\n\n //System.out.println(\"ALL: \" + suggester.lookup(\"x y\", false, 6));\n\n for(int topN=1;topN<6;topN++) {\n List results = suggester.lookup(\"x y\", false, topN);\n //System.out.println(\"topN=\" + topN + \" \" + results);\n\n assertEquals(Math.min(topN, 4), results.size());\n\n assertEquals(\"x y\", results.get(0).key);\n assertEquals(1, results.get(0).value);\n\n if (topN > 1) {\n assertEquals(\"z z z\", results.get(1).key);\n assertEquals(20, results.get(1).value);\n\n if (topN > 2) {\n assertEquals(\"x y z\", results.get(2).key);\n assertEquals(3, results.get(2).value);\n\n if (topN > 3) {\n assertEquals(\"x\", results.get(3).key);\n assertEquals(2, results.get(3).value);\n }\n }\n }\n }\n }\n\n public void testNonExactFirst() throws Exception {\n\n Analyzer a = getUnusualAnalyzer();\n FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);\n\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"x y\", 1),\n new TermFreq(\"x y z\", 3),\n new TermFreq(\"x\", 2),\n new TermFreq(\"z z z\", 20),\n }));\n\n for(int topN=1;topN<6;topN++) {\n List results = suggester.lookup(\"p\", false, topN);\n\n assertEquals(Math.min(topN, 4), results.size());\n\n assertEquals(\"z z z\", results.get(0).key);\n assertEquals(20, results.get(0).value);\n\n if (topN > 1) {\n assertEquals(\"x y z\", results.get(1).key);\n assertEquals(3, results.get(1).value);\n\n if (topN > 2) {\n assertEquals(\"x\", results.get(2).key);\n assertEquals(2, results.get(2).value);\n \n if (topN > 3) {\n assertEquals(\"x y\", results.get(3).key);\n assertEquals(1, results.get(3).value);\n }\n }\n }\n }\n }\n \n // Holds surface form separately:\n private static class TermFreq2 implements Comparable {\n public final String surfaceForm;\n public final String analyzedForm;\n public final long weight;\n\n public TermFreq2(String surfaceForm, String analyzedForm, long weight) {\n this.surfaceForm = surfaceForm;\n this.analyzedForm = analyzedForm;\n this.weight = weight;\n }\n\n @Override\n public int compareTo(TermFreq2 other) {\n int cmp = analyzedForm.compareTo(other.analyzedForm);\n if (cmp != 0) {\n return cmp;\n } else if (weight > other.weight) {\n return -1;\n } else if (weight < other.weight) {\n return 1;\n } else {\n assert false;\n return 0;\n }\n }\n }\n\n static boolean isStopChar(char ch, int numStopChars) {\n //System.out.println(\"IS? 
\" + ch + \": \" + (ch - 'a') + \": \" + ((ch - 'a') < numStopChars));\n return (ch - 'a') < numStopChars;\n }\n\n // Like StopFilter:\n private static class TokenEater extends TokenFilter {\n private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);\n private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);\n private final int numStopChars;\n private final boolean preserveHoles;\n private boolean first;\n\n public TokenEater(boolean preserveHoles, TokenStream in, int numStopChars) {\n super(in);\n this.preserveHoles = preserveHoles;\n this.numStopChars = numStopChars;\n }\n\n @Override\n public void reset() throws IOException {\n super.reset();\n first = true;\n }\n\n @Override\n public final boolean incrementToken() throws IOException {\n int skippedPositions = 0;\n while (input.incrementToken()) {\n if (termAtt.length() != 1 || !isStopChar(termAtt.charAt(0), numStopChars)) {\n int posInc = posIncrAtt.getPositionIncrement() + skippedPositions;\n if (first) {\n if (posInc == 0) {\n // first token having posinc=0 is illegal.\n posInc = 1;\n }\n first = false;\n }\n posIncrAtt.setPositionIncrement(posInc);\n //System.out.println(\"RETURN term=\" + termAtt + \" numStopChars=\" + numStopChars);\n return true;\n }\n if (preserveHoles) {\n skippedPositions += posIncrAtt.getPositionIncrement();\n }\n }\n\n return false;\n }\n }\n\n private static class MockTokenEatingAnalyzer extends Analyzer {\n private int numStopChars;\n private boolean preserveHoles;\n\n public MockTokenEatingAnalyzer(int numStopChars, boolean preserveHoles) {\n this.preserveHoles = preserveHoles;\n this.numStopChars = numStopChars;\n }\n\n @Override\n public TokenStreamComponents createComponents(String fieldName, Reader reader) {\n MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);\n tokenizer.setEnableChecks(true);\n TokenStream next;\n if (numStopChars != 0) {\n next = new TokenEater(preserveHoles, tokenizer, numStopChars);\n } else {\n next = tokenizer;\n }\n return new TokenStreamComponents(tokenizer, next);\n }\n }\n\n public void testRandom() throws Exception {\n\n int numQueries = atLeast(100);\n \n final List slowCompletor = new ArrayList();\n final TreeSet allPrefixes = new TreeSet();\n final Set seen = new HashSet();\n \n TermFreq[] keys = new TermFreq[numQueries];\n\n boolean preserveSep = random().nextBoolean();\n\n final int numStopChars = random().nextInt(10);\n final boolean preserveHoles = random().nextBoolean();\n\n if (VERBOSE) {\n System.out.println(\"TEST: \" + numQueries + \" words; preserveSep=\" + preserveSep + \" numStopChars=\" + numStopChars + \" preserveHoles=\" + preserveHoles);\n }\n \n for (int i = 0; i < numQueries; i++) {\n int numTokens = _TestUtil.nextInt(random(), 1, 4);\n String key;\n String analyzedKey;\n while(true) {\n key = \"\";\n analyzedKey = \"\";\n boolean lastRemoved = false;\n for(int token=0;token < numTokens;token++) {\n String s;\n while (true) {\n // TODO: would be nice to fix this slowCompletor/comparator to\n // use full range, but we might lose some coverage too...\n s = _TestUtil.randomSimpleString(random());\n if (s.length() > 0) {\n if (token > 0) {\n key += \" \";\n }\n if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length()-1) != ' ') {\n analyzedKey += \" \";\n }\n key += s;\n if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {\n if (preserveSep && preserveHoles) {\n analyzedKey 
+= '\\u0000';\n }\n lastRemoved = true;\n } else {\n analyzedKey += s;\n lastRemoved = false;\n }\n break;\n }\n }\n }\n\n analyzedKey = analyzedKey.replaceAll(\"(^| )\\u0000$\", \"\");\n\n if (preserveSep && lastRemoved) {\n analyzedKey += \" \";\n }\n\n // Don't add same surface form more than once:\n if (!seen.contains(key)) {\n seen.add(key);\n break;\n }\n }\n\n for (int j = 1; j < key.length(); j++) {\n allPrefixes.add(key.substring(0, j));\n }\n // we can probably do Integer.MAX_VALUE here, but why worry.\n int weight = random().nextInt(1<<24);\n keys[i] = new TermFreq(key, weight);\n\n slowCompletor.add(new TermFreq2(key, analyzedKey, weight));\n }\n\n if (VERBOSE) {\n // Don't just sort original list, to avoid VERBOSE\n // altering the test:\n List sorted = new ArrayList(slowCompletor);\n Collections.sort(sorted);\n for(TermFreq2 ent : sorted) {\n System.out.println(\" surface='\" + ent.surfaceForm + \" analyzed='\" + ent.analyzedForm + \"' weight=\" + ent.weight);\n }\n }\n\n Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);\n FuzzySuggester suggester = new FuzzySuggester(a, a,\n preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3);\n suggester.build(new TermFreqArrayIterator(keys));\n\n for (String prefix : allPrefixes) {\n\n if (VERBOSE) {\n System.out.println(\"\\nTEST: prefix=\" + prefix);\n }\n\n final int topN = _TestUtil.nextInt(random(), 1, 10);\n List r = suggester.lookup(_TestUtil.stringToCharSequence(prefix, random()), false, topN);\n\n // 2. go thru whole set to find suggestions:\n List matches = new ArrayList();\n\n // \"Analyze\" the key:\n String[] tokens = prefix.split(\" \");\n StringBuilder builder = new StringBuilder();\n boolean lastRemoved = false;\n for(int i=0;i 0 && !builder.toString().endsWith(\" \")) {\n builder.append(' ');\n }\n\n if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) {\n if (preserveSep && preserveHoles) {\n builder.append(\"\\u0000\");\n }\n lastRemoved = true;\n } else {\n builder.append(token);\n lastRemoved = false;\n }\n }\n\n String analyzedKey = builder.toString();\n\n // Remove trailing sep/holes (TokenStream.end() does\n // not tell us any trailing holes, yet ... there is an\n // issue open for this):\n while (true) {\n String s = analyzedKey.replaceAll(\"(^| )\\u0000$\", \"\");\n s = s.replaceAll(\"\\\\s+$\", \"\");\n if (s.equals(analyzedKey)) {\n break;\n }\n analyzedKey = s;\n }\n\n if (analyzedKey.length() == 0) {\n // Currently suggester can't suggest from the empty\n // string! You get no results, not all results...\n continue;\n }\n\n if (preserveSep && (prefix.endsWith(\" \") || lastRemoved)) {\n analyzedKey += \" \";\n }\n\n if (VERBOSE) {\n System.out.println(\" analyzed: \" + analyzedKey);\n }\n TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();\n\n // NOTE: not great that we ask the suggester to give\n // us the \"answer key\" (ie maybe we have a bug in\n // suggester.toLevA ...) ... but testRandom2() fixes\n // this:\n Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey));\n assertTrue(automaton.isDeterministic());\n // TODO: could be faster... 
but its slowCompletor for a reason\n BytesRef spare = new BytesRef();\n for (TermFreq2 e : slowCompletor) {\n spare.copyChars(e.analyzedForm);\n Set finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton);\n for (IntsRef intsRef : finiteStrings) {\n State p = automaton.getInitialState();\n BytesRef ref = Util.toBytesRef(intsRef, spare);\n boolean added = false;\n for (int i = ref.offset; i < ref.length; i++) {\n State q = p.step(ref.bytes[i] & 0xff);\n if (q == null) {\n break;\n } else if (q.isAccept()) {\n matches.add(new LookupResult(e.surfaceForm, e.weight));\n added = true;\n break;\n }\n p = q;\n }\n if (!added && p.isAccept()) {\n matches.add(new LookupResult(e.surfaceForm, e.weight));\n } \n }\n }\n\n assertTrue(numStopChars > 0 || matches.size() > 0);\n\n if (matches.size() > 1) {\n Collections.sort(matches, new Comparator() {\n @Override\n public int compare(LookupResult left, LookupResult right) {\n int cmp = Float.compare(right.value, left.value);\n if (cmp == 0) {\n return left.compareTo(right);\n } else {\n return cmp;\n }\n }\n });\n }\n\n if (matches.size() > topN) {\n matches = matches.subList(0, topN);\n }\n\n if (VERBOSE) {\n System.out.println(\" expected:\");\n for(LookupResult lr : matches) {\n System.out.println(\" key=\" + lr.key + \" weight=\" + lr.value);\n }\n\n System.out.println(\" actual:\");\n for(LookupResult lr : r) {\n System.out.println(\" key=\" + lr.key + \" weight=\" + lr.value);\n }\n }\n \n assertEquals(prefix + \" \" + topN, matches.size(), r.size());\n for(int hit=0;hit keys = Arrays.asList(new TermFreq[] {\n new TermFreq(\"a\", 40),\n new TermFreq(\"a \", 50),\n new TermFreq(\" a\", 60),\n });\n\n Collections.shuffle(keys, random());\n suggester.build(new TermFreqArrayIterator(keys));\n\n List results = suggester.lookup(\"a\", false, 5);\n assertEquals(2, results.size());\n assertEquals(\" a\", results.get(0).key);\n assertEquals(60, results.get(0).value);\n assertEquals(\"a \", results.get(1).key);\n assertEquals(50, results.get(1).value);\n }\n\n public void testEditSeps() throws Exception {\n Analyzer a = new MockAnalyzer(random());\n FuzzySuggester suggester = new FuzzySuggester(a, a, FuzzySuggester.PRESERVE_SEP, 2, -1, 2, true, 1, 3);\n\n List keys = Arrays.asList(new TermFreq[] {\n new TermFreq(\"foo bar\", 40),\n new TermFreq(\"foo bar baz\", 50),\n new TermFreq(\"barbaz\", 60),\n new TermFreq(\"barbazfoo\", 10),\n });\n\n Collections.shuffle(keys, random());\n suggester.build(new TermFreqArrayIterator(keys));\n\n assertEquals(\"[foo bar baz/50, foo bar/40]\", suggester.lookup(\"foobar\", false, 5).toString());\n assertEquals(\"[foo bar baz/50]\", suggester.lookup(\"foobarbaz\", false, 5).toString());\n assertEquals(\"[barbaz/60, barbazfoo/10]\", suggester.lookup(\"bar baz\", false, 5).toString());\n assertEquals(\"[barbazfoo/10]\", suggester.lookup(\"bar baz foo\", false, 5).toString());\n }\n \n @SuppressWarnings(\"fallthrough\")\n private static String addRandomEdit(String string, int prefixLength) {\n char[] input = string.toCharArray();\n StringBuilder builder = new StringBuilder();\n for (int i = 0; i < input.length; i++) {\n if (i >= prefixLength && random().nextBoolean() && i < input.length-1) {\n switch(random().nextInt(4)) {\n case 3:\n if (i < input.length-1) {\n // Transpose input[i] and input[1+i]:\n builder.append(input[i+1]);\n builder.append(input[i]);\n for(int j=i+2;j answers = new ArrayList();\n final Set seen = new HashSet();\n for(int i=0;i() {\n @Override\n public int compare(TermFreq a, TermFreq b) 
{\n return a.term.compareTo(b.term);\n }\n });\n if (VERBOSE) {\n System.out.println(\"\\nTEST: targets\");\n for(TermFreq tf : answers) {\n System.out.println(\" \" + tf.term.utf8ToString() + \" freq=\" + tf.v);\n }\n }\n\n Analyzer a = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);\n int maxEdits = random().nextBoolean() ? 1 : 2;\n int prefixLen = random().nextInt(4);\n boolean transpositions = random().nextBoolean();\n // TODO: test graph analyzers\n // TODO: test exactFirst / preserveSep permutations\n FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, prefixLen);\n\n if (VERBOSE) {\n System.out.println(\"TEST: maxEdits=\" + maxEdits + \" prefixLen=\" + prefixLen + \" transpositions=\" + transpositions + \" num=\" + NUM);\n }\n\n Collections.shuffle(answers, random());\n suggest.build(new TermFreqArrayIterator(answers.toArray(new TermFreq[answers.size()])));\n\n final int ITERS = atLeast(100);\n for(int iter=0;iter expected = slowFuzzyMatch(prefixLen, maxEdits, transpositions, answers, frag);\n if (VERBOSE) {\n System.out.println(\" expected: \" + expected.size());\n for(LookupResult c : expected) {\n System.out.println(\" \" + c);\n }\n }\n final List actual = suggest.lookup(frag, false, NUM);\n if (VERBOSE) {\n System.out.println(\" actual: \" + actual.size());\n for(LookupResult c : actual) {\n System.out.println(\" \" + c);\n }\n }\n\n Collections.sort(actual, new CompareByCostThenAlpha());\n\n final int limit = Math.min(expected.size(), actual.size());\n for(int ans=0;ans slowFuzzyMatch(int prefixLen, int maxEdits, boolean allowTransposition, List answers, String frag) {\n final List results = new ArrayList();\n final int fragLen = frag.length();\n for(TermFreq tf : answers) {\n //System.out.println(\" check s=\" + tf.term.utf8ToString());\n boolean prefixMatches = true;\n for(int i=0;i= fragLen-maxEdits) {\n // OK it's possible:\n //System.out.println(\" possible\");\n int d;\n final String s = tf.term.utf8ToString();\n if (fragLen == prefixLen) {\n d = 0;\n } else if (false && len < fragLen) {\n d = getDistance(frag, s, allowTransposition);\n } else {\n //System.out.println(\" try loop\");\n d = maxEdits + 1;\n //for(int ed=-maxEdits;ed<=maxEdits;ed++) {\n for(int ed=-maxEdits;ed<=maxEdits;ed++) {\n if (s.length() < fragLen - ed) {\n continue;\n }\n String check = s.substring(0, fragLen-ed);\n d = getDistance(frag, check, allowTransposition);\n //System.out.println(\" sub check s=\" + check + \" d=\" + d);\n if (d <= maxEdits) {\n break;\n }\n }\n }\n if (d <= maxEdits) {\n results.add(new LookupResult(tf.term.utf8ToString(), tf.v));\n }\n }\n }\n\n Collections.sort(results, new CompareByCostThenAlpha());\n }\n\n return results;\n }\n\n private static class CharSequenceComparator implements Comparator {\n\n @Override\n public int compare(CharSequence o1, CharSequence o2) {\n final int l1 = o1.length();\n final int l2 = o2.length();\n \n final int aStop = Math.min(l1, l2);\n for (int i = 0; i < aStop; i++) {\n int diff = o1.charAt(i) - o2.charAt(i);\n if (diff != 0) {\n return diff;\n }\n }\n // One is a prefix of the other, or, they are equal:\n return l1 - l2;\n }\n }\n\n private static final Comparator CHARSEQUENCE_COMPARATOR = new CharSequenceComparator();\n\n public class CompareByCostThenAlpha implements Comparator {\n @Override\n public int compare(LookupResult a, LookupResult b) {\n if (a.value > b.value) {\n return -1;\n } else if (a.value < b.value) {\n return 1;\n } else {\n final int c = 
CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);\n assert c != 0: \"term=\" + a.key;\n return c;\n }\n }\n }\n\n // NOTE: copied from\n // modules/suggest/src/java/org/apache/lucene/search/spell/LuceneLevenshteinDistance.java\n // and tweaked to return the edit distance not the float\n // lucene measure\n\n /* Finds unicode (code point) Levenstein (edit) distance\n * between two strings, including transpositions. */\n public int getDistance(String target, String other, boolean allowTransposition) {\n IntsRef targetPoints;\n IntsRef otherPoints;\n int n;\n int d[][]; // cost array\n \n // NOTE: if we cared, we could 3*m space instead of m*n space, similar to \n // what LevenshteinDistance does, except cycling thru a ring of three \n // horizontal cost arrays... but this comparator is never actually used by \n // DirectSpellChecker, its only used for merging results from multiple shards \n // in \"distributed spellcheck\", and its inefficient in other ways too...\n\n // cheaper to do this up front once\n targetPoints = toIntsRef(target);\n otherPoints = toIntsRef(other);\n n = targetPoints.length;\n final int m = otherPoints.length;\n d = new int[n+1][m+1];\n \n if (n == 0 || m == 0) {\n if (n == m) {\n return 0;\n }\n else {\n return Math.max(n, m);\n }\n } \n\n // indexes into strings s and t\n int i; // iterates through s\n int j; // iterates through t\n\n int t_j; // jth character of t\n\n int cost; // cost\n\n for (i = 0; i<=n; i++) {\n d[i][0] = i;\n }\n \n for (j = 0; j<=m; j++) {\n d[0][j] = j;\n }\n\n for (j = 1; j<=m; j++) {\n t_j = otherPoints.ints[j-1];\n\n for (i=1; i<=n; i++) {\n cost = targetPoints.ints[i-1]==t_j ? 0 : 1;\n // minimum of cell to the left+1, to the top+1, diagonally left and up +cost\n d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);\n // transposition\n if (allowTransposition && i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {\n d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);\n }\n }\n }\n \n return d[n][m];\n }\n \n private static IntsRef toIntsRef(String s) {\n IntsRef ref = new IntsRef(s.length()); // worst case\n int utf16Len = s.length();\n for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {\n cp = ref.ints[ref.length++] = Character.codePointAt(s, i);\n }\n return ref;\n }\n}\n =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (revision 40d2f78d3aae3e092f459ebe0031e69a04f47b5a) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (revision ) @@ -36,7 +36,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.analysis.TokenStreamToUnicodeAutomaton; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -48,7 +48,9 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicOperations; import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.UTF32ToUTF8; import org.apache.lucene.util.fst.Util; public class FuzzySuggesterTest extends 
LuceneTestCase { @@ -72,6 +74,27 @@ } } + public void testNonLatinRandomEdits() throws IOException { + List keys = new ArrayList(); + int numTerms = atLeast(100); + for (int i = 0; i < numTerms; i++) { + keys.add(new TermFreq("буу" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100))); + } + keys.add(new TermFreq("фуу бар буу \u00ff фар\udbff\udfffфар\u001f", 12)); + MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false); + FuzzySuggester suggester = new FuzzySuggester(analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS, + 0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH); + suggester.build(new TermFreqArrayIterator(keys)); + int numIters = atLeast(10); + for (int i = 0; i < numIters; i++) { + String addRandomEdit = addRandomEdit("фуу бар буу", 0); + List results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2); + assertEquals(addRandomEdit, 1, results.size()); + assertEquals("фуу бар буу \u00ff фар\udbff\udfffфар\u001F", results.get(0).key.toString()); + assertEquals(12, results.get(0).value, 0.01F); + } + } + /** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */ public void testKeyword() throws Exception { TermFreq keys[] = new TermFreq[] { @@ -722,19 +745,21 @@ if (VERBOSE) { System.out.println(" analyzed: " + analyzedKey); } - TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton(); + TokenStreamToUnicodeAutomaton tokenStreamToUnicodeAutomaton = suggester.getTokenStreamToUnicodeAutomaton(); // NOTE: not great that we ask the suggester to give // us the "answer key" (ie maybe we have a bug in // suggester.toLevA ...) ... but testRandom2() fixes // this: - Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)); + Automaton unicodeAutomaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)); + Automaton automaton = new UTF32ToUTF8().convert(unicodeAutomaton); + BasicOperations.determinize(automaton); assertTrue(automaton.isDeterministic()); // TODO: could be faster... but its slowCompletor for a reason BytesRef spare = new BytesRef(); for (TermFreq2 e : slowCompletor) { spare.copyChars(e.analyzedForm); - Set finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton); + Set finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToUnicodeAutomaton); for (IntsRef intsRef : finiteStrings) { State p = automaton.getInitialState(); BytesRef ref = Util.toBytesRef(intsRef, spare); Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP <+>package org.apache.lucene.search.suggest.analyzing;\n\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. 
You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport java.io.File;\nimport java.io.FileInputStream;\nimport java.io.FileOutputStream;\nimport java.io.IOException;\nimport java.io.InputStream;\nimport java.io.OutputStream;\nimport java.io.Reader;\nimport java.io.StringReader;\nimport java.util.ArrayList;\nimport java.util.Arrays;\nimport java.util.Collections;\nimport java.util.Comparator;\nimport java.util.HashSet;\nimport java.util.List;\nimport java.util.Set;\nimport java.util.TreeSet;\n\nimport org.apache.lucene.analysis.Analyzer;\nimport org.apache.lucene.analysis.CannedBinaryTokenStream.BinaryToken;\nimport org.apache.lucene.analysis.CannedBinaryTokenStream;\nimport org.apache.lucene.analysis.CannedTokenStream;\nimport org.apache.lucene.analysis.MockAnalyzer;\nimport org.apache.lucene.analysis.MockBytesAttributeFactory;\nimport org.apache.lucene.analysis.MockTokenFilter;\nimport org.apache.lucene.analysis.MockTokenizer;\nimport org.apache.lucene.analysis.Token;\nimport org.apache.lucene.analysis.TokenFilter;\nimport org.apache.lucene.analysis.TokenStream;\nimport org.apache.lucene.analysis.Tokenizer;\nimport org.apache.lucene.analysis.tokenattributes.CharTermAttribute;\nimport org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;\nimport org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;\nimport org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;\nimport org.apache.lucene.search.suggest.Lookup.LookupResult;\nimport org.apache.lucene.search.suggest.TermFreq;\nimport org.apache.lucene.search.suggest.TermFreqArrayIterator;\nimport org.apache.lucene.search.suggest.TermFreqPayload;\nimport org.apache.lucene.search.suggest.TermFreqPayloadArrayIterator;\nimport org.apache.lucene.util.BytesRef;\nimport org.apache.lucene.util.LuceneTestCase;\nimport org.apache.lucene.util._TestUtil;\n\npublic class AnalyzingSuggesterTest extends LuceneTestCase {\n \n /** this is basically the WFST test ported to KeywordAnalyzer. 
so it acts the same */\n public void testKeyword() throws Exception {\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"foo\", 50),\n new TermFreq(\"bar\", 10),\n new TermFreq(\"barbar\", 12),\n new TermFreq(\"barbara\", 6)\n };\n \n AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));\n suggester.build(new TermFreqArrayIterator(keys));\n \n // top N of 2, but only foo is available\n List results = suggester.lookup(_TestUtil.stringToCharSequence(\"f\", random()), false, 2);\n assertEquals(1, results.size());\n assertEquals(\"foo\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n \n // top N of 1 for 'bar': we return this even though\n // barbar is higher because exactFirst is enabled:\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"bar\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"bar\", results.get(0).key.toString());\n assertEquals(10, results.get(0).value, 0.01F);\n \n // top N Of 2 for 'b'\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"b\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"bar\", results.get(1).key.toString());\n assertEquals(10, results.get(1).value, 0.01F);\n \n // top N of 3 for 'ba'\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"ba\", random()), false, 3);\n assertEquals(3, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"bar\", results.get(1).key.toString());\n assertEquals(10, results.get(1).value, 0.01F);\n assertEquals(\"barbara\", results.get(2).key.toString());\n assertEquals(6, results.get(2).value, 0.01F);\n }\n \n public void testKeywordWithPayloads() throws Exception {\n TermFreqPayload keys[] = new TermFreqPayload[] {\n new TermFreqPayload(\"foo\", 50, new BytesRef(\"hello\")),\n new TermFreqPayload(\"bar\", 10, new BytesRef(\"goodbye\")),\n new TermFreqPayload(\"barbar\", 12, new BytesRef(\"thank you\")),\n new TermFreqPayload(\"barbara\", 6, new BytesRef(\"for all the fish\"))\n };\n \n AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));\n suggester.build(new TermFreqPayloadArrayIterator(keys));\n \n // top N of 2, but only foo is available\n List results = suggester.lookup(_TestUtil.stringToCharSequence(\"f\", random()), false, 2);\n assertEquals(1, results.size());\n assertEquals(\"foo\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n assertEquals(new BytesRef(\"hello\"), results.get(0).payload);\n \n // top N of 1 for 'bar': we return this even though\n // barbar is higher because exactFirst is enabled:\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"bar\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"bar\", results.get(0).key.toString());\n assertEquals(10, results.get(0).value, 0.01F);\n assertEquals(new BytesRef(\"goodbye\"), results.get(0).payload);\n \n // top N Of 2 for 'b'\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"b\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(new BytesRef(\"thank you\"), results.get(0).payload);\n assertEquals(\"bar\", 
results.get(1).key.toString());\n assertEquals(10, results.get(1).value, 0.01F);\n assertEquals(new BytesRef(\"goodbye\"), results.get(1).payload);\n \n // top N of 3 for 'ba'\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"ba\", random()), false, 3);\n assertEquals(3, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(new BytesRef(\"thank you\"), results.get(0).payload);\n assertEquals(\"bar\", results.get(1).key.toString());\n assertEquals(10, results.get(1).value, 0.01F);\n assertEquals(new BytesRef(\"goodbye\"), results.get(1).payload);\n assertEquals(\"barbara\", results.get(2).key.toString());\n assertEquals(6, results.get(2).value, 0.01F);\n assertEquals(new BytesRef(\"for all the fish\"), results.get(2).payload);\n }\n \n // TODO: more tests\n /**\n * basic \"standardanalyzer\" test with stopword removal\n */\n public void testStandard() throws Exception {\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"the ghost of christmas past\", 50),\n };\n \n Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);\n AnalyzingSuggester suggester = new AnalyzingSuggester(standard);\n suggester.setPreservePositionIncrements(false);\n suggester.build(new TermFreqArrayIterator(keys));\n \n List results = suggester.lookup(_TestUtil.stringToCharSequence(\"the ghost of chris\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"the ghost of christmas past\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n\n // omit the 'the' since its a stopword, its suggested anyway\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"ghost of chris\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"the ghost of christmas past\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n\n // omit the 'the' and 'of' since they are stopwords, its suggested anyway\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"ghost chris\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"the ghost of christmas past\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n }\n\n public void testEmpty() throws Exception {\n Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);\n AnalyzingSuggester suggester = new AnalyzingSuggester(standard);\n suggester.build(new TermFreqArrayIterator(new TermFreq[0]));\n\n List result = suggester.lookup(\"a\", false, 20);\n assertTrue(result.isEmpty());\n }\n\n public void testNoSeps() throws Exception {\n TermFreq[] keys = new TermFreq[] {\n new TermFreq(\"ab cd\", 0),\n new TermFreq(\"abcd\", 1),\n };\n\n int options = 0;\n\n Analyzer a = new MockAnalyzer(random());\n AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, options, 256, -1);\n suggester.build(new TermFreqArrayIterator(keys));\n // TODO: would be nice if \"ab \" would allow the test to\n // pass, and more generally if the analyzer can know\n // that the user's current query has ended at a word, \n // but, analyzers don't produce SEP tokens!\n List r = suggester.lookup(_TestUtil.stringToCharSequence(\"ab c\", random()), false, 2);\n assertEquals(2, r.size());\n\n // With no PRESERVE_SEPS specified, \"ab c\" should also\n // complete to \"abcd\", which has higher weight so should\n // appear first:\n assertEquals(\"abcd\", 
r.get(0).key.toString());\n }\n\n public void testGraphDups() throws Exception {\n\n final Analyzer analyzer = new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n int tokenStreamCounter = 0;\n final TokenStream[] tokenStreams = new TokenStream[] {\n new CannedTokenStream(new Token[] {\n token(\"wifi\",1,1),\n token(\"hotspot\",0,2),\n token(\"network\",1,1),\n token(\"is\",1,1),\n token(\"slow\",1,1)\n }),\n new CannedTokenStream(new Token[] {\n token(\"wi\",1,1),\n token(\"hotspot\",0,3),\n token(\"fi\",1,1),\n token(\"network\",1,1),\n token(\"is\",1,1),\n token(\"fast\",1,1)\n\n }),\n new CannedTokenStream(new Token[] {\n token(\"wifi\",1,1),\n token(\"hotspot\",0,2),\n token(\"network\",1,1)\n }),\n };\n\n @Override\n public TokenStream getTokenStream() {\n TokenStream result = tokenStreams[tokenStreamCounter];\n tokenStreamCounter++;\n return result;\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"wifi network is slow\", 50),\n new TermFreq(\"wi fi network is fast\", 10),\n };\n //AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer, AnalyzingSuggester.EXACT_FIRST, 256, -1);\n AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer);\n suggester.build(new TermFreqArrayIterator(keys));\n List results = suggester.lookup(\"wifi network\", false, 10);\n if (VERBOSE) {\n System.out.println(\"Results: \" + results);\n }\n assertEquals(2, results.size());\n assertEquals(\"wifi network is slow\", results.get(0).key);\n assertEquals(50, results.get(0).value);\n assertEquals(\"wi fi network is fast\", results.get(1).key);\n assertEquals(10, results.get(1).value);\n }\n\n public void testInputPathRequired() throws Exception {\n\n // SynonymMap.Builder b = new SynonymMap.Builder(false);\n // b.add(new CharsRef(\"ab\"), new CharsRef(\"ba\"), true);\n // final SynonymMap map = b.build();\n\n // The Analyzer below mimics the functionality of the SynonymAnalyzer\n // using the above map, so that the suggest module does not need a dependency on the \n // synonym module \n\n final Analyzer analyzer = new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n int tokenStreamCounter = 0;\n final TokenStream[] tokenStreams = new TokenStream[] {\n new CannedTokenStream(new Token[] {\n token(\"ab\",1,1),\n token(\"ba\",0,1),\n token(\"xc\",1,1)\n }),\n new CannedTokenStream(new Token[] {\n token(\"ba\",1,1), \n token(\"xd\",1,1)\n }),\n new CannedTokenStream(new Token[] {\n token(\"ab\",1,1),\n token(\"ba\",0,1),\n token(\"x\",1,1)\n })\n };\n\n @Override\n public TokenStream getTokenStream() {\n TokenStream result = tokenStreams[tokenStreamCounter];\n tokenStreamCounter++;\n return result;\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"ab xc\", 50),\n new TermFreq(\"ba xd\", 50),\n };\n AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer);\n suggester.build(new TermFreqArrayIterator(keys));\n List results = suggester.lookup(\"ab x\", false, 1);\n assertTrue(results.size() == 
1);\n }\n\n private static Token token(String term, int posInc, int posLength) {\n final Token t = new Token(term, 0, 0);\n t.setPositionIncrement(posInc);\n t.setPositionLength(posLength);\n return t;\n }\n\n private static BinaryToken token(BytesRef term) {\n return new BinaryToken(term);\n }\n\n /*\n private void printTokens(final Analyzer analyzer, String input) throws IOException {\n System.out.println(\"Tokens for \" + input);\n TokenStream ts = analyzer.tokenStream(\"\", new StringReader(input));\n ts.reset();\n final TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);\n final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);\n final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);\n \n while(ts.incrementToken()) {\n termBytesAtt.fillBytesRef();\n System.out.println(String.format(\"%s,%s,%s\", termBytesAtt.getBytesRef().utf8ToString(), posIncAtt.getPositionIncrement(), posLengthAtt.getPositionLength())); \n }\n ts.end();\n ts.close();\n } \n */ \n\n private final Analyzer getUnusualAnalyzer() {\n return new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n\n int count;\n\n @Override\n public TokenStream getTokenStream() {\n // 4th time we are called, return tokens a b,\n // else just a:\n if (count++ != 3) {\n return new CannedTokenStream(new Token[] {\n token(\"a\", 1, 1),\n });\n } else {\n // After that \"a b\":\n return new CannedTokenStream(new Token[] {\n token(\"a\", 1, 1),\n token(\"b\", 1, 1),\n });\n }\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n }\n\n public void testExactFirst() throws Exception {\n\n Analyzer a = getUnusualAnalyzer();\n AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1);\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"x y\", 1),\n new TermFreq(\"x y z\", 3),\n new TermFreq(\"x\", 2),\n new TermFreq(\"z z z\", 20),\n }));\n\n //System.out.println(\"ALL: \" + suggester.lookup(\"x y\", false, 6));\n\n for(int topN=1;topN<6;topN++) {\n List results = suggester.lookup(\"x y\", false, topN);\n //System.out.println(\"topN=\" + topN + \" \" + results);\n\n assertEquals(Math.min(topN, 4), results.size());\n\n assertEquals(\"x y\", results.get(0).key);\n assertEquals(1, results.get(0).value);\n\n if (topN > 1) {\n assertEquals(\"z z z\", results.get(1).key);\n assertEquals(20, results.get(1).value);\n\n if (topN > 2) {\n assertEquals(\"x y z\", results.get(2).key);\n assertEquals(3, results.get(2).value);\n\n if (topN > 3) {\n assertEquals(\"x\", results.get(3).key);\n assertEquals(2, results.get(3).value);\n }\n }\n }\n }\n }\n\n public void testNonExactFirst() throws Exception {\n\n Analyzer a = getUnusualAnalyzer();\n AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1);\n\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"x y\", 1),\n new TermFreq(\"x y z\", 3),\n new TermFreq(\"x\", 2),\n new TermFreq(\"z z z\", 20),\n }));\n\n for(int topN=1;topN<6;topN++) {\n List results = suggester.lookup(\"p\", false, topN);\n\n assertEquals(Math.min(topN, 4), results.size());\n\n assertEquals(\"z z z\", 
results.get(0).key);\n assertEquals(20, results.get(0).value);\n\n if (topN > 1) {\n assertEquals(\"x y z\", results.get(1).key);\n assertEquals(3, results.get(1).value);\n\n if (topN > 2) {\n assertEquals(\"x\", results.get(2).key);\n assertEquals(2, results.get(2).value);\n \n if (topN > 3) {\n assertEquals(\"x y\", results.get(3).key);\n assertEquals(1, results.get(3).value);\n }\n }\n }\n }\n }\n \n // Holds surface form separately:\n private static class TermFreq2 implements Comparable {\n public final String surfaceForm;\n public final String analyzedForm;\n public final long weight;\n public final BytesRef payload;\n\n public TermFreq2(String surfaceForm, String analyzedForm, long weight, BytesRef payload) {\n this.surfaceForm = surfaceForm;\n this.analyzedForm = analyzedForm;\n this.weight = weight;\n this.payload = payload;\n }\n\n @Override\n public int compareTo(TermFreq2 other) {\n int cmp = analyzedForm.compareTo(other.analyzedForm);\n if (cmp != 0) {\n return cmp;\n } else if (weight > other.weight) {\n return -1;\n } else if (weight < other.weight) {\n return 1;\n } else {\n assert false;\n return 0;\n }\n }\n\n @Override\n public String toString() {\n return surfaceForm + \"/\" + weight;\n }\n }\n\n static boolean isStopChar(char ch, int numStopChars) {\n //System.out.println(\"IS? \" + ch + \": \" + (ch - 'a') + \": \" + ((ch - 'a') < numStopChars));\n return (ch - 'a') < numStopChars;\n }\n\n // Like StopFilter:\n private static class TokenEater extends TokenFilter {\n private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);\n private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);\n private final int numStopChars;\n private final boolean preserveHoles;\n private boolean first;\n\n public TokenEater(boolean preserveHoles, TokenStream in, int numStopChars) {\n super(in);\n this.preserveHoles = preserveHoles;\n this.numStopChars = numStopChars;\n }\n\n @Override\n public void reset() throws IOException {\n super.reset();\n first = true;\n }\n\n @Override\n public final boolean incrementToken() throws IOException {\n int skippedPositions = 0;\n while (input.incrementToken()) {\n if (termAtt.length() != 1 || !isStopChar(termAtt.charAt(0), numStopChars)) {\n int posInc = posIncrAtt.getPositionIncrement() + skippedPositions;\n if (first) {\n if (posInc == 0) {\n // first token having posinc=0 is illegal.\n posInc = 1;\n }\n first = false;\n }\n posIncrAtt.setPositionIncrement(posInc);\n //System.out.println(\"RETURN term=\" + termAtt + \" numStopChars=\" + numStopChars);\n return true;\n }\n if (preserveHoles) {\n skippedPositions += posIncrAtt.getPositionIncrement();\n }\n }\n\n return false;\n }\n }\n\n private static class MockTokenEatingAnalyzer extends Analyzer {\n private int numStopChars;\n private boolean preserveHoles;\n\n private final MockBytesAttributeFactory factory = new MockBytesAttributeFactory();\n\n public MockTokenEatingAnalyzer(int numStopChars, boolean preserveHoles) {\n this.preserveHoles = preserveHoles;\n this.numStopChars = numStopChars;\n }\n\n @Override\n public TokenStreamComponents createComponents(String fieldName, Reader reader) {\n MockTokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);\n tokenizer.setEnableChecks(true);\n TokenStream next;\n if (numStopChars != 0) {\n next = new TokenEater(preserveHoles, tokenizer, numStopChars);\n } else {\n next = tokenizer;\n }\n return new 
TokenStreamComponents(tokenizer, next);\n }\n }\n\n private static char SEP = '\\uFFFF';\n\n public void testRandom() throws Exception {\n\n int numQueries = atLeast(1000);\n \n final List slowCompletor = new ArrayList();\n final TreeSet allPrefixes = new TreeSet();\n final Set seen = new HashSet();\n \n boolean doPayloads = random().nextBoolean();\n\n TermFreq[] keys = null;\n TermFreqPayload[] payloadKeys = null;\n if (doPayloads) {\n payloadKeys = new TermFreqPayload[numQueries];\n } else {\n keys = new TermFreq[numQueries];\n }\n\n boolean preserveSep = random().nextBoolean();\n\n final int numStopChars = random().nextInt(10);\n final boolean preserveHoles = random().nextBoolean();\n\n if (VERBOSE) {\n System.out.println(\"TEST: \" + numQueries + \" words; preserveSep=\" + preserveSep + \" numStopChars=\" + numStopChars + \" preserveHoles=\" + preserveHoles);\n }\n \n for (int i = 0; i < numQueries; i++) {\n int numTokens = _TestUtil.nextInt(random(), 1, 4);\n String key;\n String analyzedKey;\n while(true) {\n key = \"\";\n analyzedKey = \"\";\n boolean lastRemoved = false;\n for(int token=0;token < numTokens;token++) {\n String s;\n while (true) {\n // TODO: would be nice to fix this slowCompletor/comparator to\n // use full range, but we might lose some coverage too...\n s = _TestUtil.randomSimpleString(random());\n if (s.length() > 0) {\n if (token > 0) {\n key += \" \";\n }\n if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length()-1) != SEP) {\n analyzedKey += SEP;\n }\n key += s;\n if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {\n lastRemoved = true;\n if (preserveSep && preserveHoles) {\n analyzedKey += SEP;\n }\n } else {\n lastRemoved = false;\n analyzedKey += s;\n }\n break;\n }\n }\n }\n\n analyzedKey = analyzedKey.replaceAll(\"(^|\" + SEP + \")\" + SEP + \"$\", \"\");\n\n if (preserveSep && lastRemoved) {\n analyzedKey += SEP;\n }\n\n // Don't add same surface form more than once:\n if (!seen.contains(key)) {\n seen.add(key);\n break;\n }\n }\n\n for (int j = 1; j < key.length(); j++) {\n allPrefixes.add(key.substring(0, j));\n }\n // we can probably do Integer.MAX_VALUE here, but why worry.\n int weight = random().nextInt(1<<24);\n BytesRef payload;\n if (doPayloads) {\n byte[] bytes = new byte[random().nextInt(10)];\n random().nextBytes(bytes);\n payload = new BytesRef(bytes);\n payloadKeys[i] = new TermFreqPayload(key, weight, payload);\n } else {\n keys[i] = new TermFreq(key, weight);\n payload = null;\n }\n\n slowCompletor.add(new TermFreq2(key, analyzedKey, weight, payload));\n }\n\n if (VERBOSE) {\n // Don't just sort original list, to avoid VERBOSE\n // altering the test:\n List sorted = new ArrayList(slowCompletor);\n Collections.sort(sorted);\n for(TermFreq2 ent : sorted) {\n System.out.println(\" surface='\" + ent.surfaceForm + \"' analyzed='\" + ent.analyzedForm + \"' weight=\" + ent.weight);\n }\n }\n\n Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);\n AnalyzingSuggester suggester = new AnalyzingSuggester(a, a,\n preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1);\n if (doPayloads) {\n suggester.build(new TermFreqPayloadArrayIterator(payloadKeys));\n } else {\n suggester.build(new TermFreqArrayIterator(keys));\n }\n\n for (String prefix : allPrefixes) {\n\n if (VERBOSE) {\n System.out.println(\"\\nTEST: prefix=\" + prefix);\n }\n\n final int topN = _TestUtil.nextInt(random(), 1, 10);\n List r = suggester.lookup(_TestUtil.stringToCharSequence(prefix, random()), false, topN);\n\n // 2. 
go thru whole set to find suggestions:\n List matches = new ArrayList();\n\n // \"Analyze\" the key:\n String[] tokens = prefix.split(\" \");\n StringBuilder builder = new StringBuilder();\n boolean lastRemoved = false;\n for(int i=0;i 0 && !builder.toString().endsWith(\"\"+SEP)) {\n builder.append(SEP);\n }\n\n if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) {\n if (preserveSep && preserveHoles) {\n builder.append(SEP);\n }\n lastRemoved = true;\n } else {\n builder.append(token);\n lastRemoved = false;\n }\n }\n\n String analyzedKey = builder.toString();\n\n // Remove trailing sep/holes (TokenStream.end() does\n // not tell us any trailing holes, yet ... there is an\n // issue open for this):\n while (true) {\n String s = analyzedKey.replaceAll(SEP + \"$\", \"\");\n if (s.equals(analyzedKey)) {\n break;\n }\n analyzedKey = s;\n }\n\n if (analyzedKey.length() == 0) {\n // Currently suggester can't suggest from the empty\n // string! You get no results, not all results...\n continue;\n }\n\n if (preserveSep && (prefix.endsWith(\" \") || lastRemoved)) {\n analyzedKey += SEP;\n }\n\n if (VERBOSE) {\n System.out.println(\" analyzed: \" + analyzedKey);\n }\n\n // TODO: could be faster... but its slowCompletor for a reason\n for (TermFreq2 e : slowCompletor) {\n if (e.analyzedForm.startsWith(analyzedKey)) {\n matches.add(e);\n }\n }\n\n assertTrue(numStopChars > 0 || matches.size() > 0);\n\n if (matches.size() > 1) {\n Collections.sort(matches, new Comparator() {\n @Override\n public int compare(TermFreq2 left, TermFreq2 right) {\n int cmp = Float.compare(right.weight, left.weight);\n if (cmp == 0) {\n return left.analyzedForm.compareTo(right.analyzedForm);\n } else {\n return cmp;\n }\n }\n });\n }\n\n if (matches.size() > topN) {\n matches = matches.subList(0, topN);\n }\n\n if (VERBOSE) {\n System.out.println(\" expected:\");\n for(TermFreq2 lr : matches) {\n System.out.println(\" key=\" + lr.surfaceForm + \" weight=\" + lr.weight);\n }\n\n System.out.println(\" actual:\");\n for(LookupResult lr : r) {\n System.out.println(\" key=\" + lr.key + \" weight=\" + lr.value);\n }\n }\n\n assertEquals(matches.size(), r.size());\n\n for(int hit=0;hit results = suggester.lookup(\"a a\", false, 5);\n assertEquals(1, results.size());\n assertEquals(\"a b\", results.get(0).key);\n assertEquals(50, results.get(0).value);\n\n results = suggester.lookup(\"a a\", false, 5);\n assertEquals(1, results.size());\n assertEquals(\"a a\", results.get(0).key);\n assertEquals(50, results.get(0).value);\n }\n }\n\n public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {\n Analyzer a = new MockAnalyzer(random());\n AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 2, -1);\n\n List keys = Arrays.asList(new TermFreq[] {\n new TermFreq(\"a\", 40),\n new TermFreq(\"a \", 50),\n new TermFreq(\" a\", 60),\n });\n\n Collections.shuffle(keys, random());\n suggester.build(new TermFreqArrayIterator(keys));\n\n List results = suggester.lookup(\"a\", false, 5);\n assertEquals(2, results.size());\n assertEquals(\" a\", results.get(0).key);\n assertEquals(60, results.get(0).value);\n assertEquals(\"a \", results.get(1).key);\n assertEquals(50, results.get(1).value);\n }\n\n public void testQueueExhaustion() throws Exception {\n Analyzer a = new MockAnalyzer(random());\n AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.EXACT_FIRST, 256, -1);\n\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"a\", 2),\n new TermFreq(\"a b c\", 
3),\n new TermFreq(\"a c a\", 1),\n new TermFreq(\"a c b\", 1),\n }));\n\n suggester.lookup(\"a\", false, 4);\n }\n\n public void testExactFirstMissingResult() throws Exception {\n\n Analyzer a = new MockAnalyzer(random());\n\n AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.EXACT_FIRST, 256, -1);\n\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"a\", 5),\n new TermFreq(\"a b\", 3),\n new TermFreq(\"a c\", 4),\n }));\n\n List results = suggester.lookup(\"a\", false, 3);\n assertEquals(3, results.size());\n assertEquals(\"a\", results.get(0).key);\n assertEquals(5, results.get(0).value);\n assertEquals(\"a c\", results.get(1).key);\n assertEquals(4, results.get(1).value);\n assertEquals(\"a b\", results.get(2).key);\n assertEquals(3, results.get(2).value);\n\n // Try again after save/load:\n File tmpDir = _TestUtil.getTempDir(\"AnalyzingSuggesterTest\");\n tmpDir.mkdir();\n\n File path = new File(tmpDir, \"suggester\");\n\n OutputStream os = new FileOutputStream(path);\n suggester.store(os);\n os.close();\n\n InputStream is = new FileInputStream(path);\n suggester.load(is);\n is.close();\n\n results = suggester.lookup(\"a\", false, 3);\n assertEquals(3, results.size());\n assertEquals(\"a\", results.get(0).key);\n assertEquals(5, results.get(0).value);\n assertEquals(\"a c\", results.get(1).key);\n assertEquals(4, results.get(1).value);\n assertEquals(\"a b\", results.get(2).key);\n assertEquals(3, results.get(2).value);\n }\n\n public void testDupSurfaceFormsMissingResults() throws Exception {\n Analyzer a = new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n\n @Override\n public TokenStream getTokenStream() {\n return new CannedTokenStream(new Token[] {\n token(\"hairy\", 1, 1),\n token(\"smelly\", 0, 1),\n token(\"dog\", 1, 1),\n });\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n\n AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1);\n\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"hambone\", 6),\n new TermFreq(\"nellie\", 5),\n }));\n\n List results = suggester.lookup(\"nellie\", false, 2);\n assertEquals(2, results.size());\n assertEquals(\"hambone\", results.get(0).key);\n assertEquals(6, results.get(0).value);\n assertEquals(\"nellie\", results.get(1).key);\n assertEquals(5, results.get(1).value);\n\n // Try again after save/load:\n File tmpDir = _TestUtil.getTempDir(\"AnalyzingSuggesterTest\");\n tmpDir.mkdir();\n\n File path = new File(tmpDir, \"suggester\");\n\n OutputStream os = new FileOutputStream(path);\n suggester.store(os);\n os.close();\n\n InputStream is = new FileInputStream(path);\n suggester.load(is);\n is.close();\n\n results = suggester.lookup(\"nellie\", false, 2);\n assertEquals(2, results.size());\n assertEquals(\"hambone\", results.get(0).key);\n assertEquals(6, results.get(0).value);\n assertEquals(\"nellie\", results.get(1).key);\n assertEquals(5, results.get(1).value);\n }\n\n public void testDupSurfaceFormsMissingResults2() throws Exception {\n Analyzer a = new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) 
{\n\n int count;\n\n @Override\n public TokenStream getTokenStream() {\n if (count == 0) {\n count++;\n return new CannedTokenStream(new Token[] {\n token(\"p\", 1, 1),\n token(\"q\", 1, 1),\n token(\"r\", 0, 1),\n token(\"s\", 0, 1),\n });\n } else {\n return new CannedTokenStream(new Token[] {\n token(\"p\", 1, 1),\n });\n }\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n\n AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1);\n\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"a\", 6),\n new TermFreq(\"b\", 5),\n }));\n\n List results = suggester.lookup(\"a\", false, 2);\n assertEquals(2, results.size());\n assertEquals(\"a\", results.get(0).key);\n assertEquals(6, results.get(0).value);\n assertEquals(\"b\", results.get(1).key);\n assertEquals(5, results.get(1).value);\n\n // Try again after save/load:\n File tmpDir = _TestUtil.getTempDir(\"AnalyzingSuggesterTest\");\n tmpDir.mkdir();\n\n File path = new File(tmpDir, \"suggester\");\n\n OutputStream os = new FileOutputStream(path);\n suggester.store(os);\n os.close();\n\n InputStream is = new FileInputStream(path);\n suggester.load(is);\n is.close();\n\n results = suggester.lookup(\"a\", false, 2);\n assertEquals(2, results.size());\n assertEquals(\"a\", results.get(0).key);\n assertEquals(6, results.get(0).value);\n assertEquals(\"b\", results.get(1).key);\n assertEquals(5, results.get(1).value);\n }\n\n public void test0ByteKeys() throws Exception {\n final Analyzer a = new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n int tokenStreamCounter = 0;\n final TokenStream[] tokenStreams = new TokenStream[] {\n new CannedBinaryTokenStream(new BinaryToken[] {\n token(new BytesRef(new byte[] {0x0, 0x0, 0x0})),\n }),\n new CannedBinaryTokenStream(new BinaryToken[] {\n token(new BytesRef(new byte[] {0x0, 0x0})),\n }),\n new CannedBinaryTokenStream(new BinaryToken[] {\n token(new BytesRef(new byte[] {0x0, 0x0, 0x0})),\n }),\n new CannedBinaryTokenStream(new BinaryToken[] {\n token(new BytesRef(new byte[] {0x0, 0x0})),\n }),\n };\n\n @Override\n public TokenStream getTokenStream() {\n TokenStream result = tokenStreams[tokenStreamCounter];\n tokenStreamCounter++;\n return result;\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n\n AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1);\n\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"a a\", 50),\n new TermFreq(\"a b\", 50),\n }));\n }\n\n public void testDupSurfaceFormsMissingResults3() throws Exception {\n Analyzer a = new MockAnalyzer(random());\n AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1);\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"a a\", 7),\n new TermFreq(\"a a\", 7),\n new TermFreq(\"a c\", 6),\n new TermFreq(\"a c\", 3),\n new TermFreq(\"a b\", 5),\n }));\n assertEquals(\"[a a/7, a c/6, a b/5]\", suggester.lookup(\"a\", false, 3).toString());\n }\n\n public void testEndingSpace() throws Exception {\n Analyzer a = new MockAnalyzer(random());\n AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1);\n suggester.build(new TermFreqArrayIterator(new 
TermFreq[] {\n new TermFreq(\"i love lucy\", 7),\n new TermFreq(\"isla de muerta\", 8),\n }));\n assertEquals(\"[isla de muerta/8, i love lucy/7]\", suggester.lookup(\"i\", false, 3).toString());\n assertEquals(\"[i love lucy/7]\", suggester.lookup(\"i \", false, 3).toString());\n }\n}\n =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (revision 40d2f78d3aae3e092f459ebe0031e69a04f47b5a) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (revision ) @@ -24,7 +24,6 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.Reader; -import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -48,8 +47,6 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; -import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.search.suggest.Lookup.LookupResult; import org.apache.lucene.search.suggest.TermFreq; import org.apache.lucene.search.suggest.TermFreqArrayIterator; @@ -594,7 +591,7 @@ } } - private static char SEP = '\uFFFF'; + private static String SEP = "\udbff\udfff"; public void testRandom() throws Exception { @@ -641,7 +638,7 @@ if (token > 0) { key += " "; } - if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length()-1) != SEP) { + if (preserveSep && analyzedKey.length() > 0 && analyzedKey.codePointAt(analyzedKey.codePointCount(0, analyzedKey.length()-1)) != 0x10FFFF) { analyzedKey += SEP; } key += s; @@ -838,7 +835,7 @@ int tokenStreamCounter = 0; final TokenStream[] tokenStreams = new TokenStream[] { new CannedBinaryTokenStream(new BinaryToken[] { - token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})), + token(new BytesRef(new byte[] {0x61, (byte) 0xF4, (byte) 0x8F, (byte) 0xBF, (byte) 0xBF, 0x61})), }), new CannedTokenStream(new Token[] { token("a",1,1), @@ -849,7 +846,7 @@ token("a",1,1) }), new CannedBinaryTokenStream(new BinaryToken[] { - token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})), + token(new BytesRef(new byte[] {0x61, (byte) 0xF4, (byte) 0x8F, (byte) 0xBF, (byte) 0xBF, 0x61})), }) }; @@ -872,17 +869,37 @@ new TermFreq("a b", 50), }; - AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer, analyzer, AnalyzingSuggester.EXACT_FIRST | (i==0 ? 
AnalyzingSuggester.PRESERVE_SEP : 0), 256, -1); + if (i == 0) { + // First time w/ preserveSep: + AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer, analyzer, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1); - suggester.build(new TermFreqArrayIterator(keys)); - List results = suggester.lookup("a a", false, 5); - assertEquals(1, results.size()); - assertEquals("a b", results.get(0).key); - assertEquals(50, results.get(0).value); + suggester.build(new TermFreqArrayIterator(keys)); + List results = suggester.lookup("a a", false, 5); + assertEquals(1, results.size()); + assertEquals("a b", results.get(0).key); + assertEquals(50, results.get(0).value); - results = suggester.lookup("a a", false, 5); + results = suggester.lookup("a b", false, 5); - assertEquals(1, results.size()); - assertEquals("a a", results.get(0).key); - assertEquals(50, results.get(0).value); + assertEquals(1, results.size()); + assertEquals("a a", results.get(0).key); + assertEquals(50, results.get(0).value); + } else { + // Second time without preserveSep: + AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer, analyzer, AnalyzingSuggester.EXACT_FIRST, 256, -1); + suggester.build(new TermFreqArrayIterator(keys)); + List results = suggester.lookup("a a", false, 5); + assertEquals(2, results.size()); + assertEquals("a a", results.get(0).key); + assertEquals(50, results.get(0).value); + assertEquals("a b", results.get(1).key); + assertEquals(50, results.get(1).value); + + results = suggester.lookup("a b", false, 5); + assertEquals(2, results.size()); + assertEquals("a b", results.get(0).key); + assertEquals(50, results.get(0).value); + assertEquals("a a", results.get(1).key); + assertEquals(50, results.get(1).value); + } } } Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToUnicodeAutomaton.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToUnicodeAutomaton.java (revision ) +++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToUnicodeAutomaton.java (revision ) @@ -0,0 +1,247 @@ +package org.apache.lucene.analysis; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RollingBuffer; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Transition; + +import java.io.IOException; + +// TODO: maybe also toFST? then we can translate atts into FST outputs/weights + +/** + * Consumes a TokenStream and creates an {@link org.apache.lucene.util.automaton.Automaton} + * where the transition labels are Unicode code points from the {@link + * org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute}. Between tokens we insert + * POS_SEP and for holes we insert HOLE. + * + * @lucene.experimental + */ +public class TokenStreamToUnicodeAutomaton { + + private boolean preservePositionIncrements; + + /** + * Sole constructor. + */ + public TokenStreamToUnicodeAutomaton() { + this.preservePositionIncrements = true; + } + + /** + * Whether to generate holes in the automaton for missing positions, true by default. + */ + public void setPreservePositionIncrements(boolean enablePositionIncrements) { + this.preservePositionIncrements = enablePositionIncrements; + } + + private static class Position implements RollingBuffer.Resettable { + // Any tokens that ended at our position arrive to this state: + State arriving; + + // Any tokens that start at our position leave from this state: + State leaving; + + @Override + public void reset() { + arriving = null; + leaving = null; + } + } + + private static class Positions extends RollingBuffer { + @Override + protected Position newInstance() { + return new Position(); + } + } + + /** + * Subclass & implement this if you need to change the + * token (such as escaping certain bytes) before it's + * turned into a graph. + */ + protected BytesRef changeToken(BytesRef in) { + return in; + } + + /** + * Pulls the graph (including {@link + * org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute}) from the provided {@link + * org.apache.lucene.analysis.TokenStream}, and creates the corresponding + * automaton where arcs are Unicode code points from each term. 
+ */ + public Automaton toAutomaton(TokenStream in) throws IOException { + final Automaton a = new Automaton(); + boolean deterministic = true; + + final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class); + final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class); + final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class); + final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class); + + final BytesRef term = termBytesAtt.getBytesRef(); + + in.reset(); + + // Only temporarily holds states ahead of our current + // position: + + final RollingBuffer positions = new Positions(); + + int pos = -1; + Position posData = null; + int maxOffset = 0; + while (in.incrementToken()) { + int posInc = posIncAtt.getPositionIncrement(); + if (!preservePositionIncrements && posInc > 1) { + posInc = 1; + } + assert pos > -1 || posInc > 0; + + if (posInc > 0) { + + // New node: + pos += posInc; + + posData = positions.get(pos); + assert posData.leaving == null; + + if (posData.arriving == null) { + // No token ever arrived to this position + if (pos == 0) { + // OK: this is the first token + posData.leaving = a.getInitialState(); + } else { + // This means there's a hole (eg, StopFilter + // does this): + posData.leaving = new State(); + addHoles(a.getInitialState(), positions, pos); + } + } else { + posData.leaving = new State(); + posData.arriving.addTransition(new Transition(TokenStreamToAutomaton.POS_SEP, posData.leaving)); + if (posInc > 1) { + // A token spanned over a hole; add holes + // "under" it: + addHoles(a.getInitialState(), positions, pos); + } + } + positions.freeBefore(pos); + } else { + // note: this isn't necessarily true. its just that we aren't surely det. + // we could optimize this further (e.g. buffer and sort synonyms at a position) + // but thats probably overkill. this is cheap and dirty + deterministic = false; + } + + final int endPos = pos + posLengthAtt.getPositionLength(); + + termBytesAtt.fillBytesRef(); + final String utf16 = changeToken(term).utf8ToString(); + final int[] term2 = new int[utf16.codePointCount(0, utf16.length())]; + for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) + term2[j++] = cp = utf16.codePointAt(i); + + final Position endPosData = positions.get(endPos); + if (endPosData.arriving == null) { + endPosData.arriving = new State(); + } + + State state = posData.leaving; + for (int charIDX = 0; charIDX < term2.length; charIDX++) { + final State nextState = charIDX == term2.length - 1 ? endPosData.arriving : new State(); + state.addTransition(new Transition(term2[charIDX], nextState)); + state = nextState; + } + + maxOffset = Math.max(maxOffset, offsetAtt.endOffset()); + } + + in.end(); + State endState = null; + if (offsetAtt.endOffset() > maxOffset) { + endState = new State(); + endState.setAccept(true); + } + + pos++; + while (pos <= positions.getMaxPos()) { + posData = positions.get(pos); + if (posData.arriving != null) { + if (endState != null) { + posData.arriving.addTransition(new Transition(TokenStreamToAutomaton.POS_SEP, endState)); + } else { + posData.arriving.setAccept(true); + } + } + pos++; + } + + //toDot(a); + a.setDeterministic(deterministic); + return a; + } + + // for debugging! 
+ /* + private static void toDot(Automaton a) throws IOException { + final String s = a.toDot(); + Writer w = new OutputStreamWriter(new FileOutputStream("/tmp/out.dot")); + w.write(s); + w.close(); + System.out.println("TEST: saved to /tmp/out.dot"); + } + */ + + private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) { + Position posData = positions.get(pos); + Position prevPosData = positions.get(pos - 1); + + while (posData.arriving == null || prevPosData.leaving == null) { + if (posData.arriving == null) { + posData.arriving = new State(); + posData.arriving.addTransition(new Transition(TokenStreamToAutomaton.POS_SEP, posData.leaving)); + } + if (prevPosData.leaving == null) { + if (pos == 1) { + prevPosData.leaving = startState; + } else { + prevPosData.leaving = new State(); + } + if (prevPosData.arriving != null) { + prevPosData.arriving.addTransition(new Transition(TokenStreamToAutomaton.POS_SEP, prevPosData.leaving)); + } + } + prevPosData.leaving.addTransition(new Transition(TokenStreamToAutomaton.HOLE, posData.arriving)); + pos--; + if (pos <= 0) { + break; + } + posData = prevPosData; + prevPosData = positions.get(pos - 1); + } + } +} Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP <+>package org.apache.lucene.analysis;\n\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport java.io.IOException;\n\nimport org.apache.lucene.analysis.tokenattributes.OffsetAttribute;\nimport org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;\nimport org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;\nimport org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;\nimport org.apache.lucene.util.BytesRef;\nimport org.apache.lucene.util.RollingBuffer;\nimport org.apache.lucene.util.automaton.Automaton;\nimport org.apache.lucene.util.automaton.State;\nimport org.apache.lucene.util.automaton.Transition;\n\n// TODO: maybe also toFST? then we can translate atts into FST outputs/weights\n\n/** Consumes a TokenStream and creates an {@link Automaton}\n * where the transition labels are UTF8 bytes from the {@link\n * TermToBytesRefAttribute}. Between tokens we insert\n * POS_SEP and for holes we insert HOLE.\n *\n * @lucene.experimental */\npublic class TokenStreamToAutomaton {\n\n private boolean preservePositionIncrements;\n\n /** Sole constructor. */\n public TokenStreamToAutomaton() {\n this.preservePositionIncrements = true;\n }\n\n /** Whether to generate holes in the automaton for missing positions, true by default. 
*/\n public void setPreservePositionIncrements(boolean enablePositionIncrements) {\n this.preservePositionIncrements = enablePositionIncrements;\n }\n\n private static class Position implements RollingBuffer.Resettable {\n // Any tokens that ended at our position arrive to this state:\n State arriving;\n\n // Any tokens that start at our position leave from this state:\n State leaving;\n\n @Override\n public void reset() {\n arriving = null;\n leaving = null;\n }\n }\n\n private static class Positions extends RollingBuffer<Position> {\n @Override\n protected Position newInstance() {\n return new Position();\n }\n }\n\n /** Subclass & implement this if you need to change the\n * token (such as escaping certain bytes) before it's\n * turned into a graph. */ \n protected BytesRef changeToken(BytesRef in) {\n return in;\n }\n\n /** We create transition between two adjacent tokens. */\n public static final int POS_SEP = 256;\n\n /** We add this arc to represent a hole. */\n public static final int HOLE = 257;\n\n /** Pulls the graph (including {@link\n * PositionLengthAttribute}) from the provided {@link\n * TokenStream}, and creates the corresponding\n * automaton where arcs are bytes from each term. */\n public Automaton toAutomaton(TokenStream in) throws IOException {\n final Automaton a = new Automaton();\n boolean deterministic = true;\n\n final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);\n final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);\n final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);\n final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);\n\n final BytesRef term = termBytesAtt.getBytesRef();\n\n in.reset();\n\n // Only temporarily holds states ahead of our current\n // position:\n\n final RollingBuffer<Position> positions = new Positions();\n\n int pos = -1;\n Position posData = null;\n int maxOffset = 0;\n while (in.incrementToken()) {\n int posInc = posIncAtt.getPositionIncrement();\n if (!preservePositionIncrements && posInc > 1) {\n posInc = 1;\n }\n assert pos > -1 || posInc > 0;\n\n if (posInc > 0) {\n\n // New node:\n pos += posInc;\n\n posData = positions.get(pos);\n assert posData.leaving == null;\n\n if (posData.arriving == null) {\n // No token ever arrived to this position\n if (pos == 0) {\n // OK: this is the first token\n posData.leaving = a.getInitialState();\n } else {\n // This means there's a hole (eg, StopFilter\n // does this):\n posData.leaving = new State();\n addHoles(a.getInitialState(), positions, pos);\n }\n } else {\n posData.leaving = new State();\n posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));\n if (posInc > 1) {\n // A token spanned over a hole; add holes\n // \"under\" it:\n addHoles(a.getInitialState(), positions, pos);\n }\n }\n positions.freeBefore(pos);\n } else {\n // note: this isn't necessarily true. its just that we aren't surely det.\n // we could optimize this further (e.g. buffer and sort synonyms at a position)\n // but thats probably overkill. 
this is cheap and dirty\n deterministic = false;\n }\n\n final int endPos = pos + posLengthAtt.getPositionLength();\n\n termBytesAtt.fillBytesRef();\n final BytesRef term2 = changeToken(term);\n final Position endPosData = positions.get(endPos);\n if (endPosData.arriving == null) {\n endPosData.arriving = new State();\n }\n\n State state = posData.leaving;\n for(int byteIDX=0;byteIDX<term2.length;byteIDX++) {\n final State nextState = byteIDX == term2.length-1 ? endPosData.arriving : new State();\n state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState));\n state = nextState;\n }\n\n maxOffset = Math.max(maxOffset, offsetAtt.endOffset());\n }\n\n in.end();\n State endState = null;\n if (offsetAtt.endOffset() > maxOffset) {\n endState = new State();\n endState.setAccept(true);\n }\n\n pos++;\n while (pos <= positions.getMaxPos()) {\n posData = positions.get(pos);\n if (posData.arriving != null) {\n if (endState != null) {\n posData.arriving.addTransition(new Transition(POS_SEP, endState));\n } else {\n posData.arriving.setAccept(true);\n }\n }\n pos++;\n }\n\n //toDot(a);\n a.setDeterministic(deterministic);\n return a;\n }\n\n // for debugging!\n /*\n private static void toDot(Automaton a) throws IOException {\n final String s = a.toDot();\n Writer w = new OutputStreamWriter(new FileOutputStream(\"/tmp/out.dot\"));\n w.write(s);\n w.close();\n System.out.println(\"TEST: saved to /tmp/out.dot\");\n }\n */\n\n private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {\n Position posData = positions.get(pos);\n Position prevPosData = positions.get(pos-1);\n\n while(posData.arriving == null || prevPosData.leaving == null) {\n if (posData.arriving == null) {\n posData.arriving = new State();\n posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));\n }\n if (prevPosData.leaving == null) {\n if (pos == 1) {\n prevPosData.leaving = startState;\n } else {\n prevPosData.leaving = new State();\n }\n if (prevPosData.arriving != null) {\n prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving));\n }\n }\n prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving));\n pos--;\n if (pos <= 0) {\n break;\n }\n posData = prevPosData;\n prevPosData = positions.get(pos-1);\n }\n }\n}\n =================================================================== --- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision 40d2f78d3aae3e092f459ebe0031e69a04f47b5a) +++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision ) @@ -80,10 +80,10 @@ } /** We create transition between two adjacent tokens. */ - public static final int POS_SEP = 256; + public static final int POS_SEP = 0x10FFFF; /** We add this arc to represent a hole. */ - public static final int HOLE = 257; + public static final int HOLE = POS_SEP - 1; /** Pulls the graph (including {@link * PositionLengthAttribute}) from the provided {@link Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP <+>package org.apache.lucene.search.suggest.analyzing;\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. 
You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\nimport java.io.FileOutputStream;\nimport java.io.IOException;\nimport java.io.OutputStreamWriter;\nimport java.io.Writer;\nimport java.util.Arrays;\nimport java.util.List;\nimport java.util.Set;\n\nimport org.apache.lucene.analysis.Analyzer;\nimport org.apache.lucene.analysis.TokenStream;\nimport org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs\nimport org.apache.lucene.util.BytesRef;\nimport org.apache.lucene.util.IntsRef;\nimport org.apache.lucene.util.automaton.Automaton;\nimport org.apache.lucene.util.automaton.BasicAutomata;\nimport org.apache.lucene.util.automaton.BasicOperations;\nimport org.apache.lucene.util.automaton.LevenshteinAutomata;\nimport org.apache.lucene.util.automaton.SpecialOperations;\nimport org.apache.lucene.util.fst.FST;\nimport org.apache.lucene.util.fst.PairOutputs.Pair;\n\n/**\n * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is\n * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though\n * you can explicitly choose classic Levenshtein by passing false\n * for the transpositions parameter.\n *

\n * At most, this query will match terms up to\n * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}\n * edits. Higher distances are not supported. Note that the\n * fuzzy distance is measured in \"byte space\" on the bytes\n * returned by the {@link TokenStream}'s {@link\n * TermToBytesRefAttribute}, usually UTF8. By default\n * the analyzed bytes must be at least 3 {@link\n * #DEFAULT_MIN_FUZZY_LENGTH} bytes before any edits are\n * considered. Furthermore, the first 1 {@link\n * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be\n * edited. We allow up to 1 {@link\n * #DEFAULT_MAX_EDITS} edit.\n *\n *

\n * NOTE: This suggester does not boost suggestions that\n * required no edits over suggestions that did require\n * edits. This is a known limitation.\n *\n *

\n * Note: complex query analyzers can have a significant impact on the lookup\n * performance. It's recommended to not use analyzers that drop or inject terms\n * like synonyms to keep the complexity of the prefix intersection low for good\n * lookup performance. At index time, complex analyzers can safely be used.\n *

\n */\npublic final class FuzzySuggester extends AnalyzingSuggester {\n private final int maxEdits;\n private final boolean transpositions;\n private final int nonFuzzyPrefix;\n private final int minFuzzyLength;\n\n /**\n * The default minimum length of the key passed to {@link\n * #lookup} before any edits are allowed.\n */\n public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;\n\n /**\n * The default prefix length where edits are not allowed.\n */\n public static final int DEFAULT_NON_FUZZY_PREFIX = 1;\n \n /**\n * The default maximum number of edits for fuzzy\n * suggestions.\n */\n public static final int DEFAULT_MAX_EDITS = 1;\n \n /**\n * The default transposition value passed to {@link LevenshteinAutomata}\n */\n public static final boolean DEFAULT_TRANSPOSITIONS = true;\n\n /**\n * Creates a {@link FuzzySuggester} instance initialized with default values.\n * \n * @param analyzer the analyzer used for this suggester\n */\n public FuzzySuggester(Analyzer analyzer) {\n this(analyzer, analyzer);\n }\n \n /**\n * Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values.\n * \n * @param indexAnalyzer\n * Analyzer that will be used for analyzing suggestions while building the index.\n * @param queryAnalyzer\n * Analyzer that will be used for analyzing query text during lookup\n */\n public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {\n this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,\n DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);\n }\n\n /**\n * Creates a {@link FuzzySuggester} instance.\n * \n * @param indexAnalyzer Analyzer that will be used for\n * analyzing suggestions while building the index.\n * @param queryAnalyzer Analyzer that will be used for\n * analyzing query text during lookup\n * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}\n * @param maxSurfaceFormsPerAnalyzedForm Maximum number of\n * surface forms to keep for a single analyzed form.\n * When there are too many surface forms we discard the\n * lowest weighted ones.\n * @param maxGraphExpansions Maximum number of graph paths\n * to expand from the analyzed form. Set this to -1 for\n * no limit.\n * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} .\n * @param transpositions true if transpositions should be treated as a primitive \n * edit operation. 
If this is false, comparisons will implement the classic\n * Levenshtein algorithm.\n * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}\n * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})\n */\n public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,\n int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,\n int maxEdits, boolean transpositions, int nonFuzzyPrefix,\n int minFuzzyLength) {\n super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);\n if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {\n throw new IllegalArgumentException(\"maxEdits must be between 0 and \" + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);\n }\n if (nonFuzzyPrefix < 0) {\n throw new IllegalArgumentException(\"nonFuzzyPrefix must not be >= 0 (got \" + nonFuzzyPrefix + \")\");\n }\n if (minFuzzyLength < 0) {\n throw new IllegalArgumentException(\"minFuzzyLength must not be >= 0 (got \" + minFuzzyLength + \")\");\n }\n \n this.maxEdits = maxEdits;\n this.transpositions = transpositions;\n this.nonFuzzyPrefix = nonFuzzyPrefix;\n this.minFuzzyLength = minFuzzyLength;\n }\n \n @Override\n protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,\n Automaton lookupAutomaton,\n FST<Pair<Long,BytesRef>> fst)\n throws IOException {\n\n // TODO: right now there's no penalty for fuzzy/edits,\n // ie a completion whose prefix matched exactly what the\n // user typed gets no boost over completions that\n // required an edit, which get no boost over completions\n // requiring two edits. I suspect a multiplicative\n // factor is appropriate (eg, say a fuzzy match must be at\n // least 2X better weight than the non-fuzzy match to\n // \"compete\") ... in which case I think the wFST needs\n // to be log weights or something ...\n\n Automaton levA = toLevenshteinAutomata(lookupAutomaton);\n /*\n Writer w = new OutputStreamWriter(new FileOutputStream(\"out.dot\"), \"UTF-8\");\n w.write(levA.toDot());\n w.close();\n System.out.println(\"Wrote LevA to out.dot\");\n */\n return FSTUtil.intersectPrefixPaths(levA, fst);\n }\n\n Automaton toLevenshteinAutomata(Automaton automaton) {\n final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);\n Automaton subs[] = new Automaton[ref.size()];\n int upto = 0;\n for (IntsRef path : ref) {\n if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {\n subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);\n upto++;\n } else {\n Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);\n int ints[] = new int[path.length-nonFuzzyPrefix];\n System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);\n // TODO: maybe add alphaMin to LevenshteinAutomata,\n // and pass 1 instead of 0? We probably don't want\n // to allow the trailing dedup bytes to be\n // edited... 
but then 0 byte is \"in general\" allowed\n // on input (but not in UTF8).\n LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);\n Automaton levAutomaton = lev.toAutomaton(maxEdits);\n Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));\n combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already\n subs[upto] = combined;\n upto++;\n }\n }\n\n if (subs.length == 0) {\n // automaton is empty, there is no accepted paths through it\n return BasicAutomata.makeEmpty(); // matches nothing\n } else if (subs.length == 1) {\n // no synonyms or anything: just a single path through the tokenstream\n return subs[0];\n } else {\n // multiple paths: this is really scary! is it slow?\n // maybe we should not do this and throw UOE?\n Automaton a = BasicOperations.union(Arrays.asList(subs));\n // TODO: we could call toLevenshteinAutomata() before det? \n // this only happens if you have multiple paths anyway (e.g. synonyms)\n BasicOperations.determinize(a);\n\n return a;\n }\n }\n}\n =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (revision 40d2f78d3aae3e092f459ebe0031e69a04f47b5a) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (revision ) @@ -15,10 +15,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -import java.io.FileOutputStream; + import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; import java.util.Arrays; import java.util.List; import java.util.Set; @@ -33,6 +31,7 @@ import org.apache.lucene.util.automaton.BasicOperations; import org.apache.lucene.util.automaton.LevenshteinAutomata; import org.apache.lucene.util.automaton.SpecialOperations; +import org.apache.lucene.util.automaton.UTF32ToUTF8; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PairOutputs.Pair; @@ -177,13 +176,15 @@ // to be log weights or something ... Automaton levA = toLevenshteinAutomata(lookupAutomaton); + Automaton utf8LevA = new UTF32ToUTF8().convert(levA); + BasicOperations.determinize(utf8LevA); /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); w.write(levA.toDot()); w.close(); System.out.println("Wrote LevA to out.dot"); */ - return FSTUtil.intersectPrefixPaths(levA, fst); + return FSTUtil.intersectPrefixPaths(utf8LevA, fst); } Automaton toLevenshteinAutomata(Automaton automaton) { @@ -203,7 +204,7 @@ // to allow the trailing dedup bytes to be // edited... but then 0 byte is "in general" allowed // on input (but not in UTF8). - LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions); + LevenshteinAutomata lev = new LevenshteinAutomata(ints, Character.MAX_CODE_POINT, transpositions); Automaton levAutomaton = lev.toAutomaton(maxEdits); Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton)); combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
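Taken together, these diffs move the fuzzy matching from UTF-8 byte space into Unicode code point space: the lookup key is analyzed into a code-point automaton (the new TokenStreamToUnicodeAutomaton), POS_SEP and HOLE move to the top of the code point range, the Levenshtein automaton is built with alphaMax = Character.MAX_CODE_POINT, and only at the end is the result converted back to UTF-8 labels via UTF32ToUTF8 so it can be intersected with the byte-based suggestion FST. The following is a minimal sketch of that end-to-end flow, not part of the patch; the class and method names (FuzzyLookupSketch, buildFuzzyLookupAutomaton) are illustrative only, and it ignores the nonFuzzyPrefix/minFuzzyLength handling that FuzzySuggester.toLevenshteinAutomata performs.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.suggest.analyzing.TokenStreamToUnicodeAutomaton;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;

// Illustrative sketch only (names are hypothetical); shows the code-point -> UTF-8 flow
// introduced by this patch, assuming nonFuzzyPrefix and minFuzzyLength are not applied.
class FuzzyLookupSketch {

  static Automaton buildFuzzyLookupAutomaton(Analyzer queryAnalyzer, String key,
                                             int maxEdits, boolean transpositions) throws IOException {
    // 1) Analyze the key into an automaton whose transition labels are Unicode
    //    code points (the new TokenStreamToUnicodeAutomaton added above).
    TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key));
    Automaton lookup;
    try {
      lookup = new TokenStreamToUnicodeAutomaton().toAutomaton(ts);
    } finally {
      ts.close();
    }

    // 2) Expand every finite path (one per analyzed variant) with a Levenshtein
    //    automaton over the full Unicode alphabet: alphaMax is now
    //    Character.MAX_CODE_POINT, not 255, because labels are code points.
    List<Automaton> subs = new ArrayList<Automaton>();
    for (IntsRef path : SpecialOperations.getFiniteStrings(lookup, -1)) {
      int[] ints = new int[path.length];
      System.arraycopy(path.ints, path.offset, ints, 0, path.length);
      LevenshteinAutomata lev = new LevenshteinAutomata(ints, Character.MAX_CODE_POINT, transpositions);
      subs.add(lev.toAutomaton(maxEdits));
    }
    Automaton levA = subs.isEmpty() ? BasicAutomata.makeEmpty() : BasicOperations.union(subs);

    // 3) Only now convert code-point labels to UTF-8 byte labels and determinize,
    //    matching the UTF32ToUTF8 step in the FuzzySuggester diff, so the automaton
    //    can be intersected with the byte-based suggestion FST.
    Automaton utf8LevA = new UTF32ToUTF8().convert(levA);
    BasicOperations.determinize(utf8LevA);
    return utf8LevA;
  }
}

Doing the UTF32ToUTF8 conversion last is the point of the change: edits are measured in whole Unicode characters rather than in UTF-8 bytes, while the final intersection still runs against the existing byte-based FST.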