Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision f81056da25f3671b9807c4a51d6b985389fe916e)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision )
@@ -32,6 +32,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.analysis.TokenStreamToUnicodeAutomaton;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
import org.apache.lucene.search.suggest.Lookup;
@@ -53,6 +54,7 @@
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.automaton.UTF32ToUTF8;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST.BytesReader;
@@ -160,7 +162,7 @@
/** Represents the separation between tokens, if
* PRESERVE_SEP was specified */
- private static final int SEP_LABEL = 0xff;
+ private static final int SEP_LABEL = 0x10FFFF;
/** Marks end of the analyzed input and start of dedup
* byte. */
@@ -308,45 +310,60 @@
}
/** Just escapes the 0xff byte (which we still use for SEP). */
- private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
+ private static final class EscapingTokenStreamToUnicodeAutomaton extends TokenStreamToUnicodeAutomaton {
final BytesRef spare = new BytesRef();
@Override
protected BytesRef changeToken(BytesRef in) {
+
+ final String utf16 = in.utf8ToString();
+ final int[] inCodePoints = new int[utf16.codePointCount(0, utf16.length())];
+ for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp))
+ inCodePoints[j++] = cp = utf16.codePointAt(i);
+
int upto = 0;
- for(int i=0;i<in.length;i++) {
@@ -434,7 +451,7 @@
Sort.ByteSequencesReader reader = null;
BytesRef scratch = new BytesRef();
- TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
+ TokenStreamToUnicodeAutomaton ts2ua = getTokenStreamToUnicodeAutomaton();
boolean success = false;
byte buffer[] = new byte[8];
@@ -443,7 +460,7 @@
BytesRef surfaceForm;
while ((surfaceForm = iterator.next()) != null) {
- Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
+ Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2ua);
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
@@ -854,18 +871,20 @@
return prefixPaths;
}
- final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
+ final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToUnicodeAutomaton ts2ua) throws IOException {
// Analyze surface form:
TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));
- // Create corresponding automaton: labels are bytes
- // from each analyzed token, with byte 0 used as
+ // Create corresponding automaton: labels are Unicode code points
+ // from each analyzed token, with code point 0 used as
// separator between tokens:
- Automaton automaton = ts2a.toAutomaton(ts);
+ Automaton unicodeAutomaton = ts2ua.toAutomaton(ts);
ts.close();
- replaceSep(automaton);
+ replaceSep(unicodeAutomaton);
+ Automaton automaton = new UTF32ToUTF8().convert(unicodeAutomaton);
+
assert SpecialOperations.isFinite(automaton);
// Get all paths from the automaton (there can be
@@ -882,7 +901,7 @@
// TODO: is there a Reader from a CharSequence?
// Turn tokenstream into automaton:
TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
- Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
+ Automaton unicodeAutomaton = (getTokenStreamToUnicodeAutomaton()).toAutomaton(ts);
ts.close();
// TODO: we could use the end offset to "guess"
@@ -891,12 +910,12 @@
// This way we could eg differentiate "net" from "net ",
// which we can't today...
- replaceSep(automaton);
+ replaceSep(unicodeAutomaton);
// TODO: we can optimize this somewhat by determinizing
// while we convert
- BasicOperations.determinize(automaton);
- return automaton;
+ BasicOperations.determinize(unicodeAutomaton);
+ return unicodeAutomaton;
}
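
The hunks above swap the token automaton from byte labels (SEP_LABEL = 0xff) to Unicode code point labels (SEP_LABEL = 0x10FFFF), and only convert to UTF-8 bytes, via org.apache.lucene.util.automaton.UTF32ToUTF8, right before walking the byte-based FST. The stand-alone sketch below is not part of the patch; the class name and sample strings are invented for illustration, and it only exercises stock automaton classes of this Lucene era (BasicAutomata, BasicOperations, SpecialOperations, UTF32ToUTF8) to show that conversion step in isolation.

import java.util.Set;

import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;

// Hypothetical demo class, for illustration only.
public class SepLabelConversionSketch {
  public static void main(String[] args) {
    final int SEP_LABEL = 0x10FFFF; // separator as a code point, as in this patch

    // "wifi" SEP "router", with transitions labeled by Unicode code points:
    Automaton unicode = BasicOperations.concatenate(
        BasicAutomata.makeString("wifi"),
        BasicOperations.concatenate(
            BasicAutomata.makeChar(SEP_LABEL),
            BasicAutomata.makeString("router")));

    // Convert code point labels to UTF-8 byte labels; the suggester's FST
    // consumes bytes, so this is the form it can be intersected with:
    Automaton bytes = new UTF32ToUTF8().convert(unicode);
    BasicOperations.determinize(bytes);

    // Each finite string is now a UTF-8 byte sequence; for example the
    // separator U+10FFFF is emitted as the four bytes f4 8f bf bf:
    Set<IntsRef> paths = SpecialOperations.getFiniteStrings(bytes, -1);
    for (IntsRef path : paths) {
      StringBuilder sb = new StringBuilder();
      for (int i = 0; i < path.length; i++) {
        sb.append(String.format("%02x ", path.ints[path.offset + i] & 0xff));
      }
      System.out.println(sb);
    }
  }
}
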
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (revision f81056da25f3671b9807c4a51d6b985389fe916e)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (revision )
@@ -36,7 +36,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.analysis.TokenStreamToUnicodeAutomaton;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -72,6 +72,27 @@
}
}
+ public void testNonLatinRandomEdits() throws IOException {
+ List<TermFreq> keys = new ArrayList<TermFreq>();
+ int numTerms = atLeast(100);
+ for (int i = 0; i < numTerms; i++) {
+ keys.add(new TermFreq("буу" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
+ }
+ keys.add(new TermFreq("фуу бар буу \u00ff фар", 12));
+ MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
+ FuzzySuggester suggester = new FuzzySuggester(analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
+ 0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH);
+ suggester.build(new TermFreqArrayIterator(keys));
+ int numIters = atLeast(10);
+ for (int i = 0; i < numIters; i++) {
+ String addRandomEdit = addRandomEdit("фуу бар буу", 0);
+ List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
+ assertEquals(addRandomEdit, 1, results.size());
+ assertEquals("фуу бар буу \u00ff фар", results.get(0).key.toString());
+ assertEquals(12, results.get(0).value, 0.01F);
+ }
+ }
+
/** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */
public void testKeyword() throws Exception {
TermFreq keys[] = new TermFreq[] {
@@ -722,7 +743,7 @@
if (VERBOSE) {
System.out.println(" analyzed: " + analyzedKey);
}
- TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();
+ TokenStreamToUnicodeAutomaton tokenStreamToUnicodeAutomaton = suggester.getTokenStreamToUnicodeAutomaton();
// NOTE: not great that we ask the suggester to give
// us the "answer key" (ie maybe we have a bug in
@@ -734,7 +755,7 @@
BytesRef spare = new BytesRef();
for (TermFreq2 e : slowCompletor) {
spare.copyChars(e.analyzedForm);
- Set<IntsRef> finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton);
+ Set<IntsRef> finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToUnicodeAutomaton);
for (IntsRef intsRef : finiteStrings) {
State p = automaton.getInitialState();
BytesRef ref = Util.toBytesRef(intsRef, spare);
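The point of the test-side changes above appears to be that FuzzySuggester previously measured edits in UTF-8 byte space, so a single typo in a multi-byte (e.g. Cyrillic) character could consume two of the allowed edits. A minimal sketch, not part of the patch, showing the mismatch between what a user types and what a byte-level automaton sees:

    import org.apache.lucene.util.BytesRef;

    public class CodePointVsByteLength {
      public static void main(String[] args) {
        String s = "буу";
        // Three code points, as the user typed them:
        System.out.println(s.codePointCount(0, s.length()));   // 3
        // ... but six UTF-8 bytes, so one real typo can look like two byte edits:
        System.out.println(new BytesRef(s).length);            // 6
      }
    }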
Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToUnicodeAutomaton.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToUnicodeAutomaton.java (revision )
+++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToUnicodeAutomaton.java (revision )
@@ -0,0 +1,247 @@
+package org.apache.lucene.analysis;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.RollingBuffer;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.Transition;
+
+import java.io.IOException;
+
+// TODO: maybe also toFST? then we can translate atts into FST outputs/weights
+
+/**
+ * Consumes a TokenStream and creates an {@link org.apache.lucene.util.automaton.Automaton}
+ * where the transition labels are Unicode code points from the {@link
+ * org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute}. Between tokens we insert
+ * POS_SEP and for holes we insert HOLE.
+ *
+ * @lucene.experimental
+ */
+public class TokenStreamToUnicodeAutomaton {
+
+ private boolean preservePositionIncrements;
+
+ /**
+ * Sole constructor.
+ */
+ public TokenStreamToUnicodeAutomaton() {
+ this.preservePositionIncrements = true;
+ }
+
+ /**
+ * Whether to generate holes in the automaton for missing positions, true by default.
+ */
+ public void setPreservePositionIncrements(boolean enablePositionIncrements) {
+ this.preservePositionIncrements = enablePositionIncrements;
+ }
+
+ private static class Position implements RollingBuffer.Resettable {
+ // Any tokens that ended at our position arrive to this state:
+ State arriving;
+
+ // Any tokens that start at our position leave from this state:
+ State leaving;
+
+ @Override
+ public void reset() {
+ arriving = null;
+ leaving = null;
+ }
+ }
+
+ private static class Positions extends RollingBuffer<Position> {
+ @Override
+ protected Position newInstance() {
+ return new Position();
+ }
+ }
+
+ /**
+ * Subclass & implement this if you need to change the
+ * token (such as escaping certain bytes) before it's
+ * turned into a graph.
+ */
+ protected BytesRef changeToken(BytesRef in) {
+ return in;
+ }
+
+ /**
+ * Pulls the graph (including {@link
+ * org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute}) from the provided {@link
+ * org.apache.lucene.analysis.TokenStream}, and creates the corresponding
+ * automaton where arcs are Unicode code points from each term.
+ */
+ public Automaton toAutomaton(TokenStream in) throws IOException {
+ final Automaton a = new Automaton();
+ boolean deterministic = true;
+
+ final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
+ final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
+ final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
+ final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
+
+ final BytesRef term = termBytesAtt.getBytesRef();
+
+ in.reset();
+
+ // Only temporarily holds states ahead of our current
+ // position:
+
+ final RollingBuffer<Position> positions = new Positions();
+
+ int pos = -1;
+ Position posData = null;
+ int maxOffset = 0;
+ while (in.incrementToken()) {
+ int posInc = posIncAtt.getPositionIncrement();
+ if (!preservePositionIncrements && posInc > 1) {
+ posInc = 1;
+ }
+ assert pos > -1 || posInc > 0;
+
+ if (posInc > 0) {
+
+ // New node:
+ pos += posInc;
+
+ posData = positions.get(pos);
+ assert posData.leaving == null;
+
+ if (posData.arriving == null) {
+ // No token ever arrived to this position
+ if (pos == 0) {
+ // OK: this is the first token
+ posData.leaving = a.getInitialState();
+ } else {
+ // This means there's a hole (eg, StopFilter
+ // does this):
+ posData.leaving = new State();
+ addHoles(a.getInitialState(), positions, pos);
+ }
+ } else {
+ posData.leaving = new State();
+ posData.arriving.addTransition(new Transition(TokenStreamToAutomaton.POS_SEP, posData.leaving));
+ if (posInc > 1) {
+ // A token spanned over a hole; add holes
+ // "under" it:
+ addHoles(a.getInitialState(), positions, pos);
+ }
+ }
+ positions.freeBefore(pos);
+ } else {
+ // note: this isn't necessarily true; it's just that we aren't certainly deterministic.
+ // We could optimize this further (e.g. buffer and sort synonyms at a position),
+ // but that's probably overkill. This is cheap and dirty.
+ deterministic = false;
+ }
+
+ final int endPos = pos + posLengthAtt.getPositionLength();
+
+ termBytesAtt.fillBytesRef();
+ final String utf16 = changeToken(term).utf8ToString();
+ final int[] term2 = new int[utf16.codePointCount(0, utf16.length())];
+ for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp))
+ term2[j++] = cp = utf16.codePointAt(i);
+
+ final Position endPosData = positions.get(endPos);
+ if (endPosData.arriving == null) {
+ endPosData.arriving = new State();
+ }
+
+ State state = posData.leaving;
+ for (int charIDX = 0; charIDX < term2.length; charIDX++) {
+ final State nextState = charIDX == term2.length - 1 ? endPosData.arriving : new State();
+ state.addTransition(new Transition(term2[charIDX], nextState));
+ state = nextState;
+ }
+
+ maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
+ }
+
+ in.end();
+ State endState = null;
+ if (offsetAtt.endOffset() > maxOffset) {
+ endState = new State();
+ endState.setAccept(true);
+ }
+
+ pos++;
+ while (pos <= positions.getMaxPos()) {
+ posData = positions.get(pos);
+ if (posData.arriving != null) {
+ if (endState != null) {
+ posData.arriving.addTransition(new Transition(TokenStreamToAutomaton.POS_SEP, endState));
+ } else {
+ posData.arriving.setAccept(true);
+ }
+ }
+ pos++;
+ }
+
+ //toDot(a);
+ a.setDeterministic(deterministic);
+ return a;
+ }
+
+ // for debugging!
+ /*
+ private static void toDot(Automaton a) throws IOException {
+ final String s = a.toDot();
+ Writer w = new OutputStreamWriter(new FileOutputStream("/tmp/out.dot"));
+ w.write(s);
+ w.close();
+ System.out.println("TEST: saved to /tmp/out.dot");
+ }
+ */
+
+ private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {
+ Position posData = positions.get(pos);
+ Position prevPosData = positions.get(pos - 1);
+
+ while (posData.arriving == null || prevPosData.leaving == null) {
+ if (posData.arriving == null) {
+ posData.arriving = new State();
+ posData.arriving.addTransition(new Transition(TokenStreamToAutomaton.POS_SEP, posData.leaving));
+ }
+ if (prevPosData.leaving == null) {
+ if (pos == 1) {
+ prevPosData.leaving = startState;
+ } else {
+ prevPosData.leaving = new State();
+ }
+ if (prevPosData.arriving != null) {
+ prevPosData.arriving.addTransition(new Transition(TokenStreamToAutomaton.POS_SEP, prevPosData.leaving));
+ }
+ }
+ prevPosData.leaving.addTransition(new Transition(TokenStreamToAutomaton.HOLE, posData.arriving));
+ pos--;
+ if (pos <= 0) {
+ break;
+ }
+ posData = prevPosData;
+ prevPosData = positions.get(pos - 1);
+ }
+ }
+}
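A minimal usage sketch for the new class (not part of the patch; the analyzer and field name are arbitrary placeholders). It roughly mirrors how the suggesters drive TokenStreamToAutomaton today, only producing code-point labels instead of UTF-8 byte labels:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.TokenStreamToUnicodeAutomaton;
    import org.apache.lucene.util.automaton.Automaton;

    public class UnicodeAutomatonExample {
      /** Builds a code-point automaton for the analyzed form of the given text. */
      static Automaton analyze(Analyzer analyzer, String text) throws IOException {
        TokenStream ts = analyzer.tokenStream("ignored", new StringReader(text));
        try {
          TokenStreamToUnicodeAutomaton t2a = new TokenStreamToUnicodeAutomaton();
          // Don't emit HOLE arcs for positions removed by e.g. StopFilter:
          t2a.setPreservePositionIncrements(false);
          return t2a.toAutomaton(ts);   // toAutomaton() calls reset()/end() itself
        } finally {
          ts.close();
        }
      }
    }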
Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP
<+>package org.apache.lucene.analysis;\n\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport java.io.IOException;\n\nimport org.apache.lucene.analysis.tokenattributes.OffsetAttribute;\nimport org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;\nimport org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;\nimport org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;\nimport org.apache.lucene.util.BytesRef;\nimport org.apache.lucene.util.RollingBuffer;\nimport org.apache.lucene.util.automaton.Automaton;\nimport org.apache.lucene.util.automaton.State;\nimport org.apache.lucene.util.automaton.Transition;\n\n// TODO: maybe also toFST? then we can translate atts into FST outputs/weights\n\n/** Consumes a TokenStream and creates an {@link Automaton}\n * where the transition labels are UTF8 bytes from the {@link\n * TermToBytesRefAttribute}. Between tokens we insert\n * POS_SEP and for holes we insert HOLE.\n *\n * @lucene.experimental */\npublic class TokenStreamToAutomaton {\n\n private boolean preservePositionIncrements;\n\n /** Sole constructor. */\n public TokenStreamToAutomaton() {\n this.preservePositionIncrements = true;\n }\n\n /** Whether to generate holes in the automaton for missing positions, true by default. */\n public void setPreservePositionIncrements(boolean enablePositionIncrements) {\n this.preservePositionIncrements = enablePositionIncrements;\n }\n\n private static class Position implements RollingBuffer.Resettable {\n // Any tokens that ended at our position arrive to this state:\n State arriving;\n\n // Any tokens that start at our position leave from this state:\n State leaving;\n\n @Override\n public void reset() {\n arriving = null;\n leaving = null;\n }\n }\n\n private static class Positions extends RollingBuffer {\n @Override\n protected Position newInstance() {\n return new Position();\n }\n }\n\n /** Subclass & implement this if you need to change the\n * token (such as escaping certain bytes) before it's\n * turned into a graph. */ \n protected BytesRef changeToken(BytesRef in) {\n return in;\n }\n\n /** We create transition between two adjacent tokens. */\n public static final int POS_SEP = 256;\n\n /** We add this arc to represent a hole. */\n public static final int HOLE = 257;\n\n /** Pulls the graph (including {@link\n * PositionLengthAttribute}) from the provided {@link\n * TokenStream}, and creates the corresponding\n * automaton where arcs are bytes from each term. 
*/\n public Automaton toAutomaton(TokenStream in) throws IOException {\n final Automaton a = new Automaton();\n boolean deterministic = true;\n\n final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);\n final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);\n final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);\n final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);\n\n final BytesRef term = termBytesAtt.getBytesRef();\n\n in.reset();\n\n // Only temporarily holds states ahead of our current\n // position:\n\n final RollingBuffer positions = new Positions();\n\n int pos = -1;\n Position posData = null;\n int maxOffset = 0;\n while (in.incrementToken()) {\n int posInc = posIncAtt.getPositionIncrement();\n if (!preservePositionIncrements && posInc > 1) {\n posInc = 1;\n }\n assert pos > -1 || posInc > 0;\n\n if (posInc > 0) {\n\n // New node:\n pos += posInc;\n\n posData = positions.get(pos);\n assert posData.leaving == null;\n\n if (posData.arriving == null) {\n // No token ever arrived to this position\n if (pos == 0) {\n // OK: this is the first token\n posData.leaving = a.getInitialState();\n } else {\n // This means there's a hole (eg, StopFilter\n // does this):\n posData.leaving = new State();\n addHoles(a.getInitialState(), positions, pos);\n }\n } else {\n posData.leaving = new State();\n posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));\n if (posInc > 1) {\n // A token spanned over a hole; add holes\n // \"under\" it:\n addHoles(a.getInitialState(), positions, pos);\n }\n }\n positions.freeBefore(pos);\n } else {\n // note: this isn't necessarily true. its just that we aren't surely det.\n // we could optimize this further (e.g. buffer and sort synonyms at a position)\n // but thats probably overkill. 
this is cheap and dirty\n deterministic = false;\n }\n\n final int endPos = pos + posLengthAtt.getPositionLength();\n\n termBytesAtt.fillBytesRef();\n final BytesRef term2 = changeToken(term);\n final Position endPosData = positions.get(endPos);\n if (endPosData.arriving == null) {\n endPosData.arriving = new State();\n }\n\n State state = posData.leaving;\n for(int byteIDX=0;byteIDX maxOffset) {\n endState = new State();\n endState.setAccept(true);\n }\n\n pos++;\n while (pos <= positions.getMaxPos()) {\n posData = positions.get(pos);\n if (posData.arriving != null) {\n if (endState != null) {\n posData.arriving.addTransition(new Transition(POS_SEP, endState));\n } else {\n posData.arriving.setAccept(true);\n }\n }\n pos++;\n }\n\n //toDot(a);\n a.setDeterministic(deterministic);\n return a;\n }\n\n // for debugging!\n /*\n private static void toDot(Automaton a) throws IOException {\n final String s = a.toDot();\n Writer w = new OutputStreamWriter(new FileOutputStream(\"/tmp/out.dot\"));\n w.write(s);\n w.close();\n System.out.println(\"TEST: saved to /tmp/out.dot\");\n }\n */\n\n private static void addHoles(State startState, RollingBuffer positions, int pos) {\n Position posData = positions.get(pos);\n Position prevPosData = positions.get(pos-1);\n\n while(posData.arriving == null || prevPosData.leaving == null) {\n if (posData.arriving == null) {\n posData.arriving = new State();\n posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));\n }\n if (prevPosData.leaving == null) {\n if (pos == 1) {\n prevPosData.leaving = startState;\n } else {\n prevPosData.leaving = new State();\n }\n if (prevPosData.arriving != null) {\n prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving));\n }\n }\n prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving));\n pos--;\n if (pos <= 0) {\n break;\n }\n posData = prevPosData;\n prevPosData = positions.get(pos-1);\n }\n }\n}\n
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision f81056da25f3671b9807c4a51d6b985389fe916e)
+++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision )
@@ -80,10 +80,10 @@
}
/** We create transition between two adjacent tokens. */
- public static final int POS_SEP = 256;
+ public static final int POS_SEP = 0x10FFFF;
/** We add this arc to represent a hole. */
- public static final int HOLE = 257;
+ public static final int HOLE = POS_SEP - 1;
/** Pulls the graph (including {@link
* PositionLengthAttribute}) from the provided {@link
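Presumably the sentinels move because, once transition labels are Unicode code points rather than UTF-8 bytes, the old values are no longer out of band: 256 and 257 are the code points of ordinary characters. A tiny sketch (not part of the patch) illustrating the collision the relabeling avoids:

    public class SentinelLabels {
      public static void main(String[] args) {
        System.out.println((int) 'Ā');                      // 256      -- the old POS_SEP is a real character
        System.out.println((int) 'ā');                      // 257      -- the old HOLE likewise
        System.out.println(Character.MAX_CODE_POINT);       // 0x10FFFF -- the new POS_SEP
        System.out.println(Character.MAX_CODE_POINT - 1);   // 0x10FFFE -- the new HOLE
      }
    }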
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP
<+>package org.apache.lucene.search.suggest.analyzing;\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\nimport java.io.FileOutputStream;\nimport java.io.IOException;\nimport java.io.OutputStreamWriter;\nimport java.io.Writer;\nimport java.util.Arrays;\nimport java.util.List;\nimport java.util.Set;\n\nimport org.apache.lucene.analysis.Analyzer;\nimport org.apache.lucene.analysis.TokenStream;\nimport org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs\nimport org.apache.lucene.util.BytesRef;\nimport org.apache.lucene.util.IntsRef;\nimport org.apache.lucene.util.automaton.Automaton;\nimport org.apache.lucene.util.automaton.BasicAutomata;\nimport org.apache.lucene.util.automaton.BasicOperations;\nimport org.apache.lucene.util.automaton.LevenshteinAutomata;\nimport org.apache.lucene.util.automaton.SpecialOperations;\nimport org.apache.lucene.util.fst.FST;\nimport org.apache.lucene.util.fst.PairOutputs.Pair;\n\n/**\n * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is\n * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though\n * you can explicitly choose classic Levenshtein by passing false\n * for the transpositions parameter.\n * \n * At most, this query will match terms up to\n * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}\n * edits. Higher distances are not supported. Note that the\n * fuzzy distance is measured in \"byte space\" on the bytes\n * returned by the {@link TokenStream}'s {@link\n * TermToBytesRefAttribute}, usually UTF8. By default\n * the analyzed bytes must be at least 3 {@link\n * #DEFAULT_MIN_FUZZY_LENGTH} bytes before any edits are\n * considered. Furthermore, the first 1 {@link\n * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be\n * edited. We allow up to 1 (@link\n * #DEFAULT_MAX_EDITS} edit.\n *\n *
\n * NOTE: This suggester does not boost suggestions that\n * required no edits over suggestions that did require\n * edits. This is a known limitation.\n *\n *
\n * Note: complex query analyzers can have a significant impact on the lookup\n * performance. It's recommended to not use analyzers that drop or inject terms\n * like synonyms to keep the complexity of the prefix intersection low for good\n * lookup performance. At index time, complex analyzers can safely be used.\n *
\n */\npublic final class FuzzySuggester extends AnalyzingSuggester {\n private final int maxEdits;\n private final boolean transpositions;\n private final int nonFuzzyPrefix;\n private final int minFuzzyLength;\n\n /**\n * The default minimum length of the key passed to {@link\n * #lookup} before any edits are allowed.\n */\n public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;\n\n /**\n * The default prefix length where edits are not allowed.\n */\n public static final int DEFAULT_NON_FUZZY_PREFIX = 1;\n \n /**\n * The default maximum number of edits for fuzzy\n * suggestions.\n */\n public static final int DEFAULT_MAX_EDITS = 1;\n \n /**\n * The default transposition value passed to {@link LevenshteinAutomata}\n */\n public static final boolean DEFAULT_TRANSPOSITIONS = true;\n\n /**\n * Creates a {@link FuzzySuggester} instance initialized with default values.\n * \n * @param analyzer the analyzer used for this suggester\n */\n public FuzzySuggester(Analyzer analyzer) {\n this(analyzer, analyzer);\n }\n \n /**\n * Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values.\n * \n * @param indexAnalyzer\n * Analyzer that will be used for analyzing suggestions while building the index.\n * @param queryAnalyzer\n * Analyzer that will be used for analyzing query text during lookup\n */\n public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {\n this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,\n DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);\n }\n\n /**\n * Creates a {@link FuzzySuggester} instance.\n * \n * @param indexAnalyzer Analyzer that will be used for\n * analyzing suggestions while building the index.\n * @param queryAnalyzer Analyzer that will be used for\n * analyzing query text during lookup\n * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}\n * @param maxSurfaceFormsPerAnalyzedForm Maximum number of\n * surface forms to keep for a single analyzed form.\n * When there are too many surface forms we discard the\n * lowest weighted ones.\n * @param maxGraphExpansions Maximum number of graph paths\n * to expand from the analyzed form. Set this to -1 for\n * no limit.\n * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} .\n * @param transpositions true if transpositions should be treated as a primitive \n * edit operation. 
If this is false, comparisons will implement the classic\n * Levenshtein algorithm.\n * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}\n * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})\n */\n public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,\n int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,\n int maxEdits, boolean transpositions, int nonFuzzyPrefix,\n int minFuzzyLength) {\n super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);\n if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {\n throw new IllegalArgumentException(\"maxEdits must be between 0 and \" + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);\n }\n if (nonFuzzyPrefix < 0) {\n throw new IllegalArgumentException(\"nonFuzzyPrefix must not be >= 0 (got \" + nonFuzzyPrefix + \")\");\n }\n if (minFuzzyLength < 0) {\n throw new IllegalArgumentException(\"minFuzzyLength must not be >= 0 (got \" + minFuzzyLength + \")\");\n }\n \n this.maxEdits = maxEdits;\n this.transpositions = transpositions;\n this.nonFuzzyPrefix = nonFuzzyPrefix;\n this.minFuzzyLength = minFuzzyLength;\n }\n \n @Override\n protected List>> getFullPrefixPaths(List>> prefixPaths,\n Automaton lookupAutomaton,\n FST> fst)\n throws IOException {\n\n // TODO: right now there's no penalty for fuzzy/edits,\n // ie a completion whose prefix matched exactly what the\n // user typed gets no boost over completions that\n // required an edit, which get no boost over completions\n // requiring two edits. I suspect a multiplicative\n // factor is appropriate (eg, say a fuzzy match must be at\n // least 2X better weight than the non-fuzzy match to\n // \"compete\") ... in which case I think the wFST needs\n // to be log weights or something ...\n\n Automaton levA = toLevenshteinAutomata(lookupAutomaton);\n /*\n Writer w = new OutputStreamWriter(new FileOutputStream(\"out.dot\"), \"UTF-8\");\n w.write(levA.toDot());\n w.close();\n System.out.println(\"Wrote LevA to out.dot\");\n */\n return FSTUtil.intersectPrefixPaths(levA, fst);\n }\n\n Automaton toLevenshteinAutomata(Automaton automaton) {\n final Set ref = SpecialOperations.getFiniteStrings(automaton, -1);\n Automaton subs[] = new Automaton[ref.size()];\n int upto = 0;\n for (IntsRef path : ref) {\n if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {\n subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);\n upto++;\n } else {\n Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);\n int ints[] = new int[path.length-nonFuzzyPrefix];\n System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);\n // TODO: maybe add alphaMin to LevenshteinAutomata,\n // and pass 1 instead of 0? We probably don't want\n // to allow the trailing dedup bytes to be\n // edited... 
but then 0 byte is \"in general\" allowed\n // on input (but not in UTF8).\n LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);\n Automaton levAutomaton = lev.toAutomaton(maxEdits);\n Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));\n combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already\n subs[upto] = combined;\n upto++;\n }\n }\n\n if (subs.length == 0) {\n // automaton is empty, there is no accepted paths through it\n return BasicAutomata.makeEmpty(); // matches nothing\n } else if (subs.length == 1) {\n // no synonyms or anything: just a single path through the tokenstream\n return subs[0];\n } else {\n // multiple paths: this is really scary! is it slow?\n // maybe we should not do this and throw UOE?\n Automaton a = BasicOperations.union(Arrays.asList(subs));\n // TODO: we could call toLevenshteinAutomata() before det? \n // this only happens if you have multiple paths anyway (e.g. synonyms)\n BasicOperations.determinize(a);\n\n return a;\n }\n }\n}\n
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (revision f81056da25f3671b9807c4a51d6b985389fe916e)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (revision )
@@ -15,10 +15,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-import java.io.FileOutputStream;
+
import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
@@ -33,6 +31,7 @@
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.SpecialOperations;
+import org.apache.lucene.util.automaton.UTF32ToUTF8;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
@@ -177,13 +176,15 @@
// to be log weights or something ...
Automaton levA = toLevenshteinAutomata(lookupAutomaton);
+ Automaton utf8LevA = new UTF32ToUTF8().convert(levA);
+ BasicOperations.determinize(utf8LevA);
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
w.write(levA.toDot());
w.close();
System.out.println("Wrote LevA to out.dot");
*/
- return FSTUtil.intersectPrefixPaths(levA, fst);
+ return FSTUtil.intersectPrefixPaths(utf8LevA, fst);
}
Automaton toLevenshteinAutomata(Automaton automaton) {
@@ -203,7 +204,7 @@
// to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
- LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
+ LevenshteinAutomata lev = new LevenshteinAutomata(ints, Character.MAX_CODE_POINT, transpositions);
Automaton levAutomaton = lev.toAutomaton(maxEdits);
Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
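With this change the Levenshtein automaton is built over code points (alphaMax = Character.MAX_CODE_POINT) and only afterwards projected into UTF-8 byte space, so it can still be intersected with the suggester's byte-based FST. A minimal sketch of that pipeline, assuming the surrounding FuzzySuggester plumbing stays as in the hunks above (the class and method names here are illustrative only):

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.BasicOperations;
    import org.apache.lucene.util.automaton.LevenshteinAutomata;
    import org.apache.lucene.util.automaton.UTF32ToUTF8;

    public class CodePointLevenshteinSketch {
      static Automaton toUtf8LevenshteinAutomaton(String analyzedKey, int maxEdits, boolean transpositions) {
        // Code points of the analyzed lookup key:
        int[] cps = new int[analyzedKey.codePointCount(0, analyzedKey.length())];
        for (int i = 0, j = 0, cp; i < analyzedKey.length(); i += Character.charCount(cp)) {
          cps[j++] = cp = analyzedKey.codePointAt(i);
        }
        // Levenshtein automaton over the full Unicode alphabet:
        Automaton levA = new LevenshteinAutomata(cps, Character.MAX_CODE_POINT, transpositions).toAutomaton(maxEdits);
        // Project to UTF-8 byte labels so it lines up with the byte-based suggest FST:
        Automaton utf8LevA = new UTF32ToUTF8().convert(levA);
        BasicOperations.determinize(utf8LevA);
        return utf8LevA;
      }
    }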