Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP <+>package org.apache.lucene.search.suggest.analyzing;\n\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport java.io.File;\nimport java.io.IOException;\nimport java.io.InputStream;\nimport java.io.OutputStream;\nimport java.io.StringReader;\nimport java.util.ArrayList;\nimport java.util.Collections;\nimport java.util.Comparator;\nimport java.util.HashSet;\nimport java.util.List;\nimport java.util.Set;\n\nimport org.apache.lucene.analysis.Analyzer;\nimport org.apache.lucene.analysis.TokenStream;\nimport org.apache.lucene.analysis.TokenStreamToAutomaton;\nimport org.apache.lucene.search.spell.TermFreqIterator;\nimport org.apache.lucene.search.spell.TermFreqPayloadIterator;\nimport org.apache.lucene.search.suggest.Lookup;\nimport org.apache.lucene.search.suggest.Sort;\nimport org.apache.lucene.store.ByteArrayDataInput;\nimport org.apache.lucene.store.ByteArrayDataOutput;\nimport org.apache.lucene.store.DataInput;\nimport org.apache.lucene.store.DataOutput;\nimport org.apache.lucene.store.InputStreamDataInput;\nimport org.apache.lucene.store.OutputStreamDataOutput;\nimport org.apache.lucene.util.ArrayUtil;\nimport org.apache.lucene.util.BytesRef;\nimport org.apache.lucene.util.CharsRef;\nimport org.apache.lucene.util.IOUtils;\nimport org.apache.lucene.util.IntsRef;\nimport org.apache.lucene.util.UnicodeUtil;\nimport org.apache.lucene.util.automaton.Automaton;\nimport org.apache.lucene.util.automaton.BasicOperations;\nimport org.apache.lucene.util.automaton.SpecialOperations;\nimport org.apache.lucene.util.automaton.State;\nimport org.apache.lucene.util.automaton.Transition;\nimport org.apache.lucene.util.fst.Builder;\nimport org.apache.lucene.util.fst.ByteSequenceOutputs;\nimport org.apache.lucene.util.fst.FST.BytesReader;\nimport org.apache.lucene.util.fst.FST;\nimport org.apache.lucene.util.fst.PairOutputs.Pair;\nimport org.apache.lucene.util.fst.PairOutputs;\nimport org.apache.lucene.util.fst.PositiveIntOutputs;\nimport org.apache.lucene.util.fst.Util.MinResult;\nimport org.apache.lucene.util.fst.Util;\n\n/**\n * Suggester that first analyzes the surface form, adds the\n * analyzed form to a weighted FST, and then does the same\n * thing at lookup time. This means lookup is based on the\n * analyzed form while suggestions are still the surface\n * form(s).\n *\n *
<p>
\n * This can result in powerful suggester functionality. For\n * example, if you use an analyzer removing stop words, \n * then the partial text \"ghost chr...\" could see the\n * suggestion \"The Ghost of Christmas Past\". Note that\n * position increments MUST NOT be preserved for this example\n * to work, so you should call\n * {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}.\n *\n *
<p>
\n * If SynonymFilter is used to map wifi and wireless network to\n * hotspot then the partial text \"wirele...\" could suggest\n * \"wifi router\". Token normalization like stemmers, accent\n * removal, etc., would allow suggestions to ignore such\n * variations.\n *\n *
<p>
\n * When two matching suggestions have the same weight, they\n * are tie-broken by the analyzed form. If their analyzed\n * form is the same then the order is undefined.\n *\n *
<p>
\n * There are some limitations:\n *
<ul>\n *\n *   <li> A lookup from a query like \"net\" in English won't\n *        be any different than \"net \" (ie, user added a\n *        trailing space) because analyzers don't reflect\n *        when they've seen a token separator and when they\n *        haven't.\n *\n *   <li> If you're using {@code StopFilter}, and the user will\n *        type \"fast apple\", but so far all they've typed is\n *        \"fast a\", again because the analyzer doesn't convey whether\n *        it's seen a token separator after the \"a\",\n *        {@code StopFilter} will remove that \"a\" causing\n *        far more matches than you'd expect.\n *\n *   <li> Lookups with the empty string return no results\n *        instead of all results.\n * </ul>
\n * \n * @lucene.experimental\n */\npublic class AnalyzingSuggester extends Lookup {\n \n /**\n * FST: \n * input is the analyzed form, with a null byte between terms\n * weights are encoded as costs: (Integer.MAX_VALUE-weight)\n * surface is the original, unanalyzed form.\n */\n private FST> fst = null;\n \n /** \n * Analyzer that will be used for analyzing suggestions at\n * index time.\n */\n private final Analyzer indexAnalyzer;\n\n /** \n * Analyzer that will be used for analyzing suggestions at\n * query time.\n */\n private final Analyzer queryAnalyzer;\n \n /** \n * True if exact match suggestions should always be returned first.\n */\n private final boolean exactFirst;\n \n /** \n * True if separator between tokens should be preserved.\n */\n private final boolean preserveSep;\n\n /** Include this flag in the options parameter to {@link\n * #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)} to always\n * return the exact match first, regardless of score. This\n * has no performance impact but could result in\n * low-quality suggestions. */\n public static final int EXACT_FIRST = 1;\n\n /** Include this flag in the options parameter to {@link\n * #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)} to preserve\n * token separators when matching. */\n public static final int PRESERVE_SEP = 2;\n\n /** Represents the separation between tokens, if\n * PRESERVE_SEP was specified */\n private static final int SEP_LABEL = 0xff;\n\n /** Marks end of the analyzed input and start of dedup\n * byte. */\n private static final int END_BYTE = 0x0;\n\n /** Maximum number of dup surface forms (different surface\n * forms for the same analyzed form). */\n private final int maxSurfaceFormsPerAnalyzedForm;\n\n /** Maximum graph paths to index for a single analyzed\n * surface form. This only matters if your analyzer\n * makes lots of alternate paths (e.g. contains\n * SynonymFilter). */\n private final int maxGraphExpansions;\n\n /** Highest number of analyzed paths we saw for any single\n * input surface form. For analyzers that never create\n * graphs this will always be 1. */\n private int maxAnalyzedPathsForOneInput;\n\n private boolean hasPayloads;\n\n private static final int PAYLOAD_SEP = '\\u001f';\n\n /** Whether position holes should appear in the automaton. 
*/\n private boolean preservePositionIncrements;\n\n /**\n * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)\n * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |\n * PRESERVE_SEP, 256, -1)}\n */\n public AnalyzingSuggester(Analyzer analyzer) {\n this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1);\n }\n\n /**\n * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)\n * AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST |\n * PRESERVE_SEP, 256, -1)}\n */\n public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {\n this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1);\n }\n\n /**\n * Creates a new suggester.\n * \n * @param indexAnalyzer Analyzer that will be used for\n * analyzing suggestions while building the index.\n * @param queryAnalyzer Analyzer that will be used for\n * analyzing query text during lookup\n * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}\n * @param maxSurfaceFormsPerAnalyzedForm Maximum number of\n * surface forms to keep for a single analyzed form.\n * When there are too many surface forms we discard the\n * lowest weighted ones.\n * @param maxGraphExpansions Maximum number of graph paths\n * to expand from the analyzed form. Set this to -1 for\n * no limit.\n */\n public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions) {\n this.indexAnalyzer = indexAnalyzer;\n this.queryAnalyzer = queryAnalyzer;\n if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {\n throw new IllegalArgumentException(\"options should only contain EXACT_FIRST and PRESERVE_SEP; got \" + options);\n }\n this.exactFirst = (options & EXACT_FIRST) != 0;\n this.preserveSep = (options & PRESERVE_SEP) != 0;\n\n // NOTE: this is just an implementation limitation; if\n // somehow this is a problem we could fix it by using\n // more than one byte to disambiguate ... but 256 seems\n // like it should be way more then enough.\n if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {\n throw new IllegalArgumentException(\"maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: \" + maxSurfaceFormsPerAnalyzedForm + \")\");\n }\n this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;\n\n if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {\n throw new IllegalArgumentException(\"maxGraphExpansions must -1 (no limit) or > 0 (got: \" + maxGraphExpansions + \")\");\n }\n this.maxGraphExpansions = maxGraphExpansions;\n preservePositionIncrements = true;\n }\n\n /** Whether to take position holes (position increment > 1) into account when\n * building the automaton, true by default. */\n public void setPreservePositionIncrements(boolean preservePositionIncrements) {\n this.preservePositionIncrements = preservePositionIncrements;\n }\n\n /** Returns byte size of the underlying FST. */\n public long sizeInBytes() {\n return fst == null ? 
0 : fst.sizeInBytes();\n }\n\n private void copyDestTransitions(State from, State to, List transitions) {\n if (to.isAccept()) {\n from.setAccept(true);\n }\n for(Transition t : to.getTransitions()) {\n transitions.add(t);\n }\n }\n\n // Replaces SEP with epsilon or remaps them if\n // we were asked to preserve them:\n private void replaceSep(Automaton a) {\n\n State[] states = a.getNumberedStates();\n\n // Go in reverse topo sort so we know we only have to\n // make one pass:\n for(int stateNumber=states.length-1;stateNumber >=0;stateNumber--) {\n final State state = states[stateNumber];\n List newTransitions = new ArrayList();\n for(Transition t : state.getTransitions()) {\n assert t.getMin() == t.getMax();\n if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {\n if (preserveSep) {\n // Remap to SEP_LABEL:\n newTransitions.add(new Transition(SEP_LABEL, t.getDest()));\n } else {\n copyDestTransitions(state, t.getDest(), newTransitions);\n a.setDeterministic(false);\n }\n } else if (t.getMin() == TokenStreamToAutomaton.HOLE) {\n\n // Just remove the hole: there will then be two\n // SEP tokens next to each other, which will only\n // match another hole at search time. Note that\n // it will also match an empty-string token ... if\n // that's somehow a problem we can always map HOLE\n // to a dedicated byte (and escape it in the\n // input).\n copyDestTransitions(state, t.getDest(), newTransitions);\n a.setDeterministic(false);\n } else {\n newTransitions.add(t);\n }\n }\n state.setTransitions(newTransitions.toArray(new Transition[newTransitions.size()]));\n }\n }\n\n /** Just escapes the 0xff byte (which we still for SEP). */\n private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {\n\n final BytesRef spare = new BytesRef();\n\n @Override\n protected BytesRef changeToken(BytesRef in) {\n int upto = 0;\n for(int i=0;i {\n\n private final boolean hasPayloads;\n\n public AnalyzingComparator(boolean hasPayloads) {\n this.hasPayloads = hasPayloads;\n }\n\n private final ByteArrayDataInput readerA = new ByteArrayDataInput();\n private final ByteArrayDataInput readerB = new ByteArrayDataInput();\n private final BytesRef scratchA = new BytesRef();\n private final BytesRef scratchB = new BytesRef();\n\n @Override\n public int compare(BytesRef a, BytesRef b) {\n\n // First by analyzed form:\n readerA.reset(a.bytes, a.offset, a.length);\n scratchA.length = readerA.readShort();\n scratchA.bytes = a.bytes;\n scratchA.offset = readerA.getPosition();\n\n readerB.reset(b.bytes, b.offset, b.length);\n scratchB.bytes = b.bytes;\n scratchB.length = readerB.readShort();\n scratchB.offset = readerB.getPosition();\n\n int cmp = scratchA.compareTo(scratchB);\n if (cmp != 0) {\n return cmp;\n }\n\n // Next by cost:\n long aCost = readerA.readInt();\n long bCost = readerB.readInt();\n\n if (aCost < bCost) {\n return -1;\n } else if (aCost > bCost) {\n return 1;\n }\n\n // Finally by surface form:\n if (hasPayloads) {\n readerA.setPosition(readerA.getPosition() + scratchA.length);\n scratchA.length = readerA.readShort();\n scratchA.offset = readerA.getPosition();\n readerB.setPosition(readerB.getPosition() + scratchB.length);\n scratchB.length = readerB.readShort();\n scratchB.offset = readerB.getPosition();\n } else {\n scratchA.offset = readerA.getPosition();\n scratchA.length = a.length - scratchA.offset;\n scratchB.offset = readerB.getPosition();\n scratchB.length = b.length - scratchB.offset;\n }\n\n cmp = scratchA.compareTo(scratchB);\n if (cmp != 0) {\n return cmp;\n }\n\n 
return 0;\n }\n };\n\n @Override\n public void build(TermFreqIterator iterator) throws IOException {\n String prefix = getClass().getSimpleName();\n File directory = Sort.defaultTempDir();\n File tempInput = File.createTempFile(prefix, \".input\", directory);\n File tempSorted = File.createTempFile(prefix, \".sorted\", directory);\n\n TermFreqPayloadIterator payloads;\n if (iterator instanceof TermFreqPayloadIterator) {\n payloads = (TermFreqPayloadIterator) iterator;\n } else {\n payloads = null;\n }\n hasPayloads = payloads != null;\n\n Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);\n Sort.ByteSequencesReader reader = null;\n BytesRef scratch = new BytesRef();\n\n TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();\n\n boolean success = false;\n byte buffer[] = new byte[8];\n try {\n ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);\n BytesRef surfaceForm;\n\n while ((surfaceForm = iterator.next()) != null) {\n Set paths = toFiniteStrings(surfaceForm, ts2a);\n \n maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());\n\n for (IntsRef path : paths) {\n\n Util.toBytesRef(path, scratch);\n \n // length of the analyzed text (FST input)\n if (scratch.length > Short.MAX_VALUE-2) {\n throw new IllegalArgumentException(\"cannot handle analyzed forms > \" + (Short.MAX_VALUE-2) + \" in length (got \" + scratch.length + \")\");\n }\n short analyzedLength = (short) scratch.length;\n\n // compute the required length:\n // analyzed sequence + weight (4) + surface + analyzedLength (short)\n int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;\n\n BytesRef payload;\n\n if (hasPayloads) {\n if (surfaceForm.length > (Short.MAX_VALUE-2)) {\n throw new IllegalArgumentException(\"cannot handle surface form > \" + (Short.MAX_VALUE-2) + \" in length (got \" + surfaceForm.length + \")\");\n }\n payload = payloads.payload();\n // payload + surfaceLength (short)\n requiredLength += payload.length + 2;\n } else {\n payload = null;\n }\n \n buffer = ArrayUtil.grow(buffer, requiredLength);\n \n output.reset(buffer);\n\n output.writeShort(analyzedLength);\n\n output.writeBytes(scratch.bytes, scratch.offset, scratch.length);\n\n output.writeInt(encodeWeight(iterator.weight()));\n\n if (hasPayloads) {\n for(int i=0;i outputs = new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());\n Builder> builder = new Builder>(FST.INPUT_TYPE.BYTE1, outputs);\n\n // Build FST:\n BytesRef previousAnalyzed = null;\n BytesRef analyzed = new BytesRef();\n BytesRef surface = new BytesRef();\n IntsRef scratchInts = new IntsRef();\n ByteArrayDataInput input = new ByteArrayDataInput();\n\n // Used to remove duplicate surface forms (but we\n // still index the hightest-weight one). 
We clear\n // this when we see a new analyzed form, so it cannot\n // grow unbounded (at most 256 entries):\n Set seenSurfaceForms = new HashSet();\n\n int dedup = 0;\n while (reader.read(scratch)) {\n input.reset(scratch.bytes, scratch.offset, scratch.length);\n short analyzedLength = input.readShort();\n analyzed.grow(analyzedLength+2);\n input.readBytes(analyzed.bytes, 0, analyzedLength);\n analyzed.length = analyzedLength;\n\n long cost = input.readInt();\n\n surface.bytes = scratch.bytes;\n if (hasPayloads) {\n surface.length = input.readShort();\n surface.offset = input.getPosition();\n } else {\n surface.offset = input.getPosition();\n surface.length = scratch.length - surface.offset;\n }\n \n if (previousAnalyzed == null) {\n previousAnalyzed = new BytesRef();\n previousAnalyzed.copyBytes(analyzed);\n seenSurfaceForms.add(BytesRef.deepCopyOf(surface));\n } else if (analyzed.equals(previousAnalyzed)) {\n dedup++;\n if (dedup >= maxSurfaceFormsPerAnalyzedForm) {\n // More than maxSurfaceFormsPerAnalyzedForm\n // dups: skip the rest:\n continue;\n }\n if (seenSurfaceForms.contains(surface)) {\n continue;\n }\n seenSurfaceForms.add(BytesRef.deepCopyOf(surface));\n } else {\n dedup = 0;\n previousAnalyzed.copyBytes(analyzed);\n seenSurfaceForms.clear();\n seenSurfaceForms.add(BytesRef.deepCopyOf(surface));\n }\n\n // TODO: I think we can avoid the extra 2 bytes when\n // there is no dup (dedup==0), but we'd have to fix\n // the exactFirst logic ... which would be sort of\n // hairy because we'd need to special case the two\n // (dup/not dup)...\n\n // NOTE: must be byte 0 so we sort before whatever\n // is next\n analyzed.bytes[analyzed.offset+analyzed.length] = 0;\n analyzed.bytes[analyzed.offset+analyzed.length+1] = (byte) dedup;\n analyzed.length += 2;\n\n Util.toIntsRef(analyzed, scratchInts);\n //System.out.println(\"ADD: \" + scratchInts + \" -> \" + cost + \": \" + surface.utf8ToString());\n if (!hasPayloads) {\n builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface)));\n } else {\n int payloadOffset = input.getPosition() + surface.length;\n int payloadLength = scratch.length - payloadOffset;\n BytesRef br = new BytesRef(surface.length + 1 + payloadLength);\n System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);\n br.bytes[surface.length] = PAYLOAD_SEP;\n System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);\n br.length = br.bytes.length;\n builder.add(scratchInts, outputs.newPair(cost, br));\n }\n }\n fst = builder.finish();\n\n //Util.dotToFile(fst, \"/tmp/suggest.dot\");\n \n success = true;\n } finally {\n if (success) {\n IOUtils.close(reader, writer);\n } else {\n IOUtils.closeWhileHandlingException(reader, writer);\n }\n \n tempInput.delete();\n tempSorted.delete();\n }\n }\n\n @Override\n public boolean store(OutputStream output) throws IOException {\n DataOutput dataOut = new OutputStreamDataOutput(output);\n try {\n if (fst == null) {\n return false;\n }\n\n fst.save(dataOut);\n dataOut.writeVInt(maxAnalyzedPathsForOneInput);\n dataOut.writeByte((byte) (hasPayloads ? 
1 : 0));\n } finally {\n IOUtils.close(output);\n }\n return true;\n }\n\n @Override\n public boolean load(InputStream input) throws IOException {\n DataInput dataIn = new InputStreamDataInput(input);\n try {\n this.fst = new FST>(dataIn, new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));\n maxAnalyzedPathsForOneInput = dataIn.readVInt();\n hasPayloads = dataIn.readByte() == 1;\n } finally {\n IOUtils.close(input);\n }\n return true;\n }\n\n private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRef spare) {\n LookupResult result;\n if (hasPayloads) {\n int sepIndex = -1;\n for(int i=0;i= output2.length) {\n return false;\n }\n for(int i=0;i lookup(final CharSequence key, boolean onlyMorePopular, int num) {\n assert num > 0;\n\n if (onlyMorePopular) {\n throw new IllegalArgumentException(\"this suggester only works with onlyMorePopular=false\");\n }\n if (fst == null) {\n return Collections.emptyList();\n }\n\n //System.out.println(\"lookup key=\" + key + \" num=\" + num);\n final BytesRef utf8Key = new BytesRef(key);\n try {\n\n Automaton lookupAutomaton = toLookupAutomaton(key);\n\n final CharsRef spare = new CharsRef();\n\n //System.out.println(\" now intersect exactFirst=\" + exactFirst);\n \n // Intersect automaton w/ suggest wFST and get all\n // prefix starting nodes & their outputs:\n //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);\n\n //System.out.println(\" prefixPaths: \" + prefixPaths.size());\n\n BytesReader bytesReader = fst.getBytesReader();\n\n FST.Arc> scratchArc = new FST.Arc>();\n\n final List results = new ArrayList();\n\n List>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);\n\n if (exactFirst) {\n\n int count = 0;\n for (FSTUtil.Path> path : prefixPaths) {\n if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {\n // This node has END_BYTE arc leaving, meaning it's an\n // \"exact\" match:\n count++;\n }\n }\n\n // Searcher just to find the single exact only\n // match, if present:\n Util.TopNSearcher> searcher;\n searcher = new Util.TopNSearcher>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);\n\n // NOTE: we could almost get away with only using\n // the first start node. The only catch is if\n // maxSurfaceFormsPerAnalyzedForm had kicked in and\n // pruned our exact match from one of these nodes\n // ...:\n for (FSTUtil.Path> path : prefixPaths) {\n if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {\n // This node has END_BYTE arc leaving, meaning it's an\n // \"exact\" match:\n searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);\n }\n }\n\n MinResult> completions[] = searcher.search();\n\n // NOTE: this is rather inefficient: we enumerate\n // every matching \"exactly the same analyzed form\"\n // path, and then do linear scan to see if one of\n // these exactly matches the input. It should be\n // possible (though hairy) to do something similar\n // to getByOutput, since the surface form is encoded\n // into the FST output, so we more efficiently hone\n // in on the exact surface-form match. 
Still, I\n // suspect very little time is spent in this linear\n // seach: it's bounded by how many prefix start\n // nodes we have and the\n // maxSurfaceFormsPerAnalyzedForm:\n for(MinResult> completion : completions) {\n BytesRef output2 = completion.output.output2;\n if (sameSurfaceForm(utf8Key, output2)) {\n results.add(getLookupResult(completion.output.output1, output2, spare));\n break;\n }\n }\n\n if (results.size() == num) {\n // That was quick:\n return results;\n }\n }\n\n Util.TopNSearcher> searcher;\n searcher = new Util.TopNSearcher>(fst,\n num - results.size(),\n num * maxAnalyzedPathsForOneInput,\n weightComparator) {\n private final Set seen = new HashSet();\n\n @Override\n protected boolean acceptResult(IntsRef input, Pair output) {\n\n // Dedup: when the input analyzes to a graph we\n // can get duplicate surface forms:\n if (seen.contains(output.output2)) {\n return false;\n }\n seen.add(output.output2);\n \n if (!exactFirst) {\n return true;\n } else {\n // In exactFirst mode, don't accept any paths\n // matching the surface form since that will\n // create duplicate results:\n if (sameSurfaceForm(utf8Key, output.output2)) {\n // We found exact match, which means we should\n // have already found it in the first search:\n assert results.size() == 1;\n return false;\n } else {\n return true;\n }\n }\n }\n };\n\n prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);\n \n for (FSTUtil.Path> path : prefixPaths) {\n searcher.addStartPaths(path.fstNode, path.output, true, path.input);\n }\n\n MinResult> completions[] = searcher.search();\n\n for(MinResult> completion : completions) {\n\n LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);\n\n // TODO: for fuzzy case would be nice to return\n // how many edits were required\n\n //System.out.println(\" result=\" + result);\n results.add(result);\n\n if (results.size() == num) {\n // In the exactFirst=true case the search may\n // produce one extra path\n break;\n }\n }\n\n return results;\n } catch (IOException bogus) {\n throw new RuntimeException(bogus);\n }\n }\n\n /** Returns all prefix paths to initialize the search. 
*/\n protected List>> getFullPrefixPaths(List>> prefixPaths,\n Automaton lookupAutomaton,\n FST> fst)\n throws IOException {\n return prefixPaths;\n }\n \n final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {\n // Analyze surface form:\n TokenStream ts = indexAnalyzer.tokenStream(\"\", new StringReader(surfaceForm.utf8ToString()));\n\n // Create corresponding automaton: labels are bytes\n // from each analyzed token, with byte 0 used as\n // separator between tokens:\n Automaton automaton = ts2a.toAutomaton(ts);\n ts.close();\n\n replaceSep(automaton);\n\n assert SpecialOperations.isFinite(automaton);\n\n // Get all paths from the automaton (there can be\n // more than one path, eg if the analyzer created a\n // graph using SynFilter or WDF):\n\n // TODO: we could walk & add simultaneously, so we\n // don't have to alloc [possibly biggish]\n // intermediate HashSet in RAM:\n return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);\n }\n\n final Automaton toLookupAutomaton(final CharSequence key) throws IOException {\n // TODO: is there a Reader from a CharSequence?\n // Turn tokenstream into automaton:\n TokenStream ts = queryAnalyzer.tokenStream(\"\", new StringReader(key.toString()));\n Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);\n ts.close();\n\n // TODO: we could use the end offset to \"guess\"\n // whether the final token was a partial token; this\n // would only be a heuristic ... but maybe an OK one.\n // This way we could eg differentiate \"net\" from \"net \",\n // which we can't today...\n\n replaceSep(automaton);\n\n // TODO: we can optimize this somewhat by determinizing\n // while we convert\n BasicOperations.determinize(automaton);\n return automaton;\n }\n \n \n\n /**\n * Returns the weight associated with an input string,\n * or null if it does not exist.\n */\n public Object get(CharSequence key) {\n throw new UnsupportedOperationException();\n }\n \n /** cost -> weight */\n private static int decodeWeight(long encoded) {\n return (int)(Integer.MAX_VALUE - encoded);\n }\n \n /** weight -> cost */\n private static int encodeWeight(long value) {\n if (value < 0 || value > Integer.MAX_VALUE) {\n throw new UnsupportedOperationException(\"cannot encode value: \" + value);\n }\n return Integer.MAX_VALUE - (int)value;\n }\n \n static final Comparator> weightComparator = new Comparator> () {\n @Override\n public int compare(Pair left, Pair right) {\n return left.output1.compareTo(right.output1);\n }\n };\n}\n =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision f81056da25f3671b9807c4a51d6b985389fe916e) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision ) @@ -17,21 +17,9 @@ * limitations under the License. 
*/ -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.analysis.TokenStreamToUnicodeAutomaton; import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqPayloadIterator; import org.apache.lucene.search.suggest.Lookup; @@ -53,143 +41,172 @@ import org.apache.lucene.util.automaton.SpecialOperations; import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.UTF32ToUTF8; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.ByteSequenceOutputs; -import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.PairOutputs.Pair; +import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.PairOutputs; +import org.apache.lucene.util.fst.PairOutputs.Pair; import org.apache.lucene.util.fst.PositiveIntOutputs; -import org.apache.lucene.util.fst.Util.MinResult; import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.fst.Util.MinResult; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + /** * Suggester that first analyzes the surface form, adds the * analyzed form to a weighted FST, and then does the same * thing at lookup time. This means lookup is based on the * analyzed form while suggestions are still the surface * form(s). - * - *
<p>
+ *

+ *

* This can result in powerful suggester functionality. For - * example, if you use an analyzer removing stop words, + * example, if you use an analyzer removing stop words, * then the partial text "ghost chr..." could see the * suggestion "The Ghost of Christmas Past". Note that * position increments MUST NOT be preserved for this example * to work, so you should call * {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}. - * - *
<p>
+ *

+ *

* If SynonymFilter is used to map wifi and wireless network to * hotspot then the partial text "wirele..." could suggest * "wifi router". Token normalization like stemmers, accent * removal, etc., would allow suggestions to ignore such * variations. - * - *
<p>
+ *

+ *

* When two matching suggestions have the same weight, they * are tie-broken by the analyzed form. If their analyzed * form is the same then the order is undefined. - * - *
<p>
+ *

+ *

* There are some limitations: *

    - * + *

    - *

  • A lookup from a query like "net" in English won't + *
  • A lookup from a query like "net" in English won't - * be any different than "net " (ie, user added a + * be any different than "net " (ie, user added a - * trailing space) because analyzers don't reflect + * trailing space) because analyzers don't reflect - * when they've seen a token separator and when they + * when they've seen a token separator and when they - * haven't. + * haven't. - * + *

    - *

  • If you're using {@code StopFilter}, and the user will + *
  • If you're using {@code StopFilter}, and the user will - * type "fast apple", but so far all they've typed is + * type "fast apple", but so far all they've typed is - * "fast a", again because the analyzer doesn't convey whether + * "fast a", again because the analyzer doesn't convey whether - * it's seen a token separator after the "a", + * it's seen a token separator after the "a", - * {@code StopFilter} will remove that "a" causing + * {@code StopFilter} will remove that "a" causing - * far more matches than you'd expect. + * far more matches than you'd expect. - * + *

    - *

  • Lookups with the empty string return no results + *
  • Lookups with the empty string return no results - * instead of all results. + * instead of all results. *
- * + * * @lucene.experimental */ public class AnalyzingSuggester extends Lookup { - + /** - * FST: + * FST: - * input is the analyzed form, with a null byte between terms + * input is the analyzed form, with a null byte between terms - * weights are encoded as costs: (Integer.MAX_VALUE-weight) + * weights are encoded as costs: (Integer.MAX_VALUE-weight) - * surface is the original, unanalyzed form. + * surface is the original, unanalyzed form. */ - private FST> fst = null; + private FST> fst = null; - - /** + + /** * Analyzer that will be used for analyzing suggestions at * index time. */ private final Analyzer indexAnalyzer; - /** + /** * Analyzer that will be used for analyzing suggestions at * query time. */ private final Analyzer queryAnalyzer; - - /** + + /** * True if exact match suggestions should always be returned first. */ private final boolean exactFirst; - - /** + + /** * True if separator between tokens should be preserved. */ private final boolean preserveSep; - /** Include this flag in the options parameter to {@link + /** + * Include this flag in the options parameter to {@link - * #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)} to always + * #AnalyzingSuggester(Analyzer, Analyzer, int, int, int)} to always - * return the exact match first, regardless of score. This + * return the exact match first, regardless of score. This - * has no performance impact but could result in + * has no performance impact but could result in - * low-quality suggestions. */ + * low-quality suggestions. + */ public static final int EXACT_FIRST = 1; - /** Include this flag in the options parameter to {@link + /** + * Include this flag in the options parameter to {@link - * #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)} to preserve + * #AnalyzingSuggester(Analyzer, Analyzer, int, int, int)} to preserve - * token separators when matching. */ + * token separators when matching. + */ public static final int PRESERVE_SEP = 2; - /** Represents the separation between tokens, if - * PRESERVE_SEP was specified */ - private static final int SEP_LABEL = 0xff; + /** + * Represents the separation between tokens, if + * PRESERVE_SEP was specified + */ + private static final int SEP_LABEL = 0x10FFFF; - /** Marks end of the analyzed input and start of dedup - * byte. */ + /** + * Marks end of the analyzed input and start of dedup + * byte. + */ private static final int END_BYTE = 0x0; - /** Maximum number of dup surface forms (different surface - * forms for the same analyzed form). */ + /** + * Maximum number of dup surface forms (different surface + * forms for the same analyzed form). + */ private final int maxSurfaceFormsPerAnalyzedForm; - /** Maximum graph paths to index for a single analyzed + /** + * Maximum graph paths to index for a single analyzed - * surface form. This only matters if your analyzer + * surface form. This only matters if your analyzer - * makes lots of alternate paths (e.g. contains + * makes lots of alternate paths (e.g. contains - * SynonymFilter). */ + * SynonymFilter). + */ private final int maxGraphExpansions; - /** Highest number of analyzed paths we saw for any single + /** + * Highest number of analyzed paths we saw for any single - * input surface form. For analyzers that never create + * input surface form. For analyzers that never create - * graphs this will always be 1. */ + * graphs this will always be 1. 
+ */ private int maxAnalyzedPathsForOneInput; private boolean hasPayloads; private static final int PAYLOAD_SEP = '\u001f'; - /** Whether position holes should appear in the automaton. */ + /** + * Whether position holes should appear in the automaton. + */ private boolean preservePositionIncrements; /** - * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int) + * Calls {@link #AnalyzingSuggester(Analyzer, Analyzer, int, int, int) * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST | * PRESERVE_SEP, 256, -1)} */ @@ -198,7 +215,7 @@ } /** - * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int) + * Calls {@link #AnalyzingSuggester(Analyzer, Analyzer, int, int, int) * AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST | * PRESERVE_SEP, 256, -1)} */ @@ -208,19 +225,19 @@ /** * Creates a new suggester. - * + * - * @param indexAnalyzer Analyzer that will be used for + * @param indexAnalyzer Analyzer that will be used for - * analyzing suggestions while building the index. + * analyzing suggestions while building the index. - * @param queryAnalyzer Analyzer that will be used for + * @param queryAnalyzer Analyzer that will be used for - * analyzing query text during lookup + * analyzing query text during lookup - * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} + * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} * @param maxSurfaceFormsPerAnalyzedForm Maximum number of - * surface forms to keep for a single analyzed form. + * surface forms to keep for a single analyzed form. - * When there are too many surface forms we discard the + * When there are too many surface forms we discard the - * lowest weighted ones. + * lowest weighted ones. - * @param maxGraphExpansions Maximum number of graph paths + * @param maxGraphExpansions Maximum number of graph paths - * to expand from the analyzed form. Set this to -1 for + * to expand from the analyzed form. Set this to -1 for - * no limit. + * no limit. */ public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions) { this.indexAnalyzer = indexAnalyzer; @@ -247,13 +264,17 @@ preservePositionIncrements = true; } - /** Whether to take position holes (position increment > 1) into account when - * building the automaton, true by default. */ + /** + * Whether to take position holes (position increment > 1) into account when + * building the automaton, true by default. + */ public void setPreservePositionIncrements(boolean preservePositionIncrements) { this.preservePositionIncrements = preservePositionIncrements; } - /** Returns byte size of the underlying FST. */ + /** + * Returns byte size of the underlying FST. + */ public long sizeInBytes() { return fst == null ? 
0 : fst.sizeInBytes(); } @@ -262,7 +283,7 @@ if (to.isAccept()) { from.setAccept(true); } - for(Transition t : to.getTransitions()) { + for (Transition t : to.getTransitions()) { transitions.add(t); } } @@ -275,12 +296,12 @@ // Go in reverse topo sort so we know we only have to // make one pass: - for(int stateNumber=states.length-1;stateNumber >=0;stateNumber--) { + for (int stateNumber = states.length - 1; stateNumber >= 0; stateNumber--) { final State state = states[stateNumber]; List newTransitions = new ArrayList(); - for(Transition t : state.getTransitions()) { + for (Transition t : state.getTransitions()) { assert t.getMin() == t.getMax(); - if (t.getMin() == TokenStreamToAutomaton.POS_SEP) { + if (t.getMin() == TokenStreamToUnicodeAutomaton.POS_SEP) { if (preserveSep) { // Remap to SEP_LABEL: newTransitions.add(new Transition(SEP_LABEL, t.getDest())); @@ -288,7 +309,7 @@ copyDestTransitions(state, t.getDest(), newTransitions); a.setDeterministic(false); } - } else if (t.getMin() == TokenStreamToAutomaton.HOLE) { + } else if (t.getMin() == TokenStreamToUnicodeAutomaton.HOLE) { // Just remove the hole: there will then be two // SEP tokens next to each other, which will only @@ -307,25 +328,27 @@ } } - /** Just escapes the 0xff byte (which we still for SEP). */ - private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton { + /** + * Just escapes the 0xff byte (which we still for SEP). + */ + private static final class EscapingTokenStreamToUnicodeAutomaton extends TokenStreamToUnicodeAutomaton { final BytesRef spare = new BytesRef(); @Override protected BytesRef changeToken(BytesRef in) { int upto = 0; - for(int i=0;i { private final boolean hasPayloads; @@ -413,7 +436,7 @@ return 0; } - }; + } @Override public void build(TermFreqIterator iterator) throws IOException { @@ -434,7 +457,7 @@ Sort.ByteSequencesReader reader = null; BytesRef scratch = new BytesRef(); - TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton(); + TokenStreamToUnicodeAutomaton ts2ua = getTokenStreamToUnicodeAutomaton(); boolean success = false; byte buffer[] = new byte[8]; @@ -443,17 +466,17 @@ BytesRef surfaceForm; while ((surfaceForm = iterator.next()) != null) { - Set paths = toFiniteStrings(surfaceForm, ts2a); + Set paths = toFiniteStrings(surfaceForm, ts2ua); - + maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size()); for (IntsRef path : paths) { Util.toBytesRef(path, scratch); - + // length of the analyzed text (FST input) - if (scratch.length > Short.MAX_VALUE-2) { + if (scratch.length > Short.MAX_VALUE - 2) { - throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length + ")"); + throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length + ")"); } short analyzedLength = (short) scratch.length; @@ -464,8 +487,8 @@ BytesRef payload; if (hasPayloads) { - if (surfaceForm.length > (Short.MAX_VALUE-2)) { + if (surfaceForm.length > (Short.MAX_VALUE - 2)) { - throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")"); + throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")"); } payload = payloads.payload(); // payload + surfaceLength (short) @@ -473,9 +496,9 @@ } else { payload = null; } - + buffer = ArrayUtil.grow(buffer, requiredLength); - + 
output.reset(buffer); output.writeShort(analyzedLength); @@ -485,7 +508,7 @@ output.writeInt(encodeWeight(iterator.weight())); if (hasPayloads) { - for(int i=0;i outputs = new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); + PairOutputs outputs = new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); - Builder> builder = new Builder>(FST.INPUT_TYPE.BYTE1, outputs); + Builder> builder = new Builder>(FST.INPUT_TYPE.BYTE1, outputs); // Build FST: BytesRef previousAnalyzed = null; @@ -532,7 +555,7 @@ while (reader.read(scratch)) { input.reset(scratch.bytes, scratch.offset, scratch.length); short analyzedLength = input.readShort(); - analyzed.grow(analyzedLength+2); + analyzed.grow(analyzedLength + 2); input.readBytes(analyzed.bytes, 0, analyzedLength); analyzed.length = analyzedLength; @@ -546,7 +569,7 @@ surface.offset = input.getPosition(); surface.length = scratch.length - surface.offset; } - + if (previousAnalyzed == null) { previousAnalyzed = new BytesRef(); previousAnalyzed.copyBytes(analyzed); @@ -577,8 +600,8 @@ // NOTE: must be byte 0 so we sort before whatever // is next - analyzed.bytes[analyzed.offset+analyzed.length] = 0; + analyzed.bytes[analyzed.offset + analyzed.length] = 0; - analyzed.bytes[analyzed.offset+analyzed.length+1] = (byte) dedup; + analyzed.bytes[analyzed.offset + analyzed.length + 1] = (byte) dedup; analyzed.length += 2; Util.toIntsRef(analyzed, scratchInts); @@ -591,7 +614,7 @@ BytesRef br = new BytesRef(surface.length + 1 + payloadLength); System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length); br.bytes[surface.length] = PAYLOAD_SEP; - System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength); + System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength); br.length = br.bytes.length; builder.add(scratchInts, outputs.newPair(cost, br)); } @@ -599,7 +622,7 @@ fst = builder.finish(); //Util.dotToFile(fst, "/tmp/suggest.dot"); - + success = true; } finally { if (success) { @@ -607,7 +630,7 @@ } else { IOUtils.closeWhileHandlingException(reader, writer); } - + tempInput.delete(); tempSorted.delete(); } @@ -634,7 +657,7 @@ public boolean load(InputStream input) throws IOException { DataInput dataIn = new InputStreamDataInput(input); try { - this.fst = new FST>(dataIn, new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton())); + this.fst = new FST>(dataIn, new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton())); maxAnalyzedPathsForOneInput = dataIn.readVInt(); hasPayloads = dataIn.readByte() == 1; } finally { @@ -647,8 +670,8 @@ LookupResult result; if (hasPayloads) { int sepIndex = -1; - for(int i=0;i= output2.length) { return false; } - for(int i=0;i> searcher; + Util.TopNSearcher> searcher; - searcher = new Util.TopNSearcher>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator); + searcher = new Util.TopNSearcher>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator); // NOTE: we could almost get away with only using // the first start node. 
The only catch is if // maxSurfaceFormsPerAnalyzedForm had kicked in and // pruned our exact match from one of these nodes // ...: - for (FSTUtil.Path> path : prefixPaths) { + for (FSTUtil.Path> path : prefixPaths) { if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) { // This node has END_BYTE arc leaving, meaning it's an // "exact" match: @@ -752,7 +775,7 @@ } } - MinResult> completions[] = searcher.search(); + MinResult> completions[] = searcher.search(); // NOTE: this is rather inefficient: we enumerate // every matching "exactly the same analyzed form" @@ -766,7 +789,7 @@ // seach: it's bounded by how many prefix start // nodes we have and the // maxSurfaceFormsPerAnalyzedForm: - for(MinResult> completion : completions) { + for (MinResult> completion : completions) { BytesRef output2 = completion.output.output2; if (sameSurfaceForm(utf8Key, output2)) { results.add(getLookupResult(completion.output.output1, output2, spare)); @@ -780,15 +803,15 @@ } } - Util.TopNSearcher> searcher; + Util.TopNSearcher> searcher; - searcher = new Util.TopNSearcher>(fst, + searcher = new Util.TopNSearcher>(fst, - num - results.size(), - num * maxAnalyzedPathsForOneInput, - weightComparator) { + num - results.size(), + num * maxAnalyzedPathsForOneInput, + weightComparator) { private final Set seen = new HashSet(); @Override - protected boolean acceptResult(IntsRef input, Pair output) { + protected boolean acceptResult(IntsRef input, Pair output) { // Dedup: when the input analyzes to a graph we // can get duplicate surface forms: @@ -796,7 +819,7 @@ return false; } seen.add(output.output2); - + if (!exactFirst) { return true; } else { @@ -816,14 +839,14 @@ }; prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst); - + - for (FSTUtil.Path> path : prefixPaths) { + for (FSTUtil.Path> path : prefixPaths) { searcher.addStartPaths(path.fstNode, path.output, true, path.input); } - MinResult> completions[] = searcher.search(); + MinResult> completions[] = searcher.search(); - for(MinResult> completion : completions) { + for (MinResult> completion : completions) { LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare); @@ -846,26 +869,30 @@ } } - /** Returns all prefix paths to initialize the search. */ + /** + * Returns all prefix paths to initialize the search. 
+ */ - protected List>> getFullPrefixPaths(List>> prefixPaths, + protected List>> getFullPrefixPaths(List>> prefixPaths, - Automaton lookupAutomaton, + Automaton lookupAutomaton, - FST> fst) + FST> fst) - throws IOException { + throws IOException { return prefixPaths; } - + - final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException { + final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToUnicodeAutomaton ts2ua) throws IOException { - // Analyze surface form: + // Analyze surface form: TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString())); - // Create corresponding automaton: labels are bytes - // from each analyzed token, with byte 0 used as + // Create corresponding automaton: labels are Unicode code points + // from each analyzed token, with code point 0 used as // separator between tokens: - Automaton automaton = ts2a.toAutomaton(ts); + Automaton unicodeAutomaton = ts2ua.toAutomaton(ts); ts.close(); - replaceSep(automaton); + replaceSep(unicodeAutomaton); + Automaton automaton = new UTF32ToUTF8().convert(unicodeAutomaton); + assert SpecialOperations.isFinite(automaton); // Get all paths from the automaton (there can be @@ -882,7 +909,7 @@ // TODO: is there a Reader from a CharSequence? // Turn tokenstream into automaton: TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString())); - Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts); + Automaton unicodeAutomaton = (getTokenStreamToUnicodeAutomaton()).toAutomaton(ts); ts.close(); // TODO: we could use the end offset to "guess" @@ -891,16 +918,15 @@ // This way we could eg differentiate "net" from "net ", // which we can't today... - replaceSep(automaton); + replaceSep(unicodeAutomaton); // TODO: we can optimize this somewhat by determinizing // while we convert - BasicOperations.determinize(automaton); - return automaton; + BasicOperations.determinize(unicodeAutomaton); + return unicodeAutomaton; } - - + + - /** * Returns the weight associated with an input string, * or null if it does not exist. @@ -908,23 +934,27 @@ public Object get(CharSequence key) { throw new UnsupportedOperationException(); } - + - /** cost -> weight */ + /** + * cost -> weight + */ private static int decodeWeight(long encoded) { - return (int)(Integer.MAX_VALUE - encoded); + return (int) (Integer.MAX_VALUE - encoded); } - + - /** weight -> cost */ + /** + * weight -> cost + */ private static int encodeWeight(long value) { if (value < 0 || value > Integer.MAX_VALUE) { throw new UnsupportedOperationException("cannot encode value: " + value); } - return Integer.MAX_VALUE - (int)value; + return Integer.MAX_VALUE - (int) value; } - + - static final Comparator> weightComparator = new Comparator> () { + static final Comparator> weightComparator = new Comparator>() { @Override - public int compare(Pair left, Pair right) { + public int compare(Pair left, Pair right) { return left.output1.compareTo(right.output1); } }; Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP <+>package org.apache.lucene.search.suggest.analyzing;\n\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. 
See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport java.io.IOException;\nimport java.io.Reader;\nimport java.util.ArrayList;\nimport java.util.Arrays;\nimport java.util.Collections;\nimport java.util.Comparator;\nimport java.util.HashSet;\nimport java.util.List;\nimport java.util.Set;\nimport java.util.TreeSet;\n\nimport org.apache.lucene.analysis.Analyzer;\nimport org.apache.lucene.analysis.CannedTokenStream;\nimport org.apache.lucene.analysis.MockAnalyzer;\nimport org.apache.lucene.analysis.MockTokenFilter;\nimport org.apache.lucene.analysis.MockTokenizer;\nimport org.apache.lucene.analysis.Token;\nimport org.apache.lucene.analysis.TokenFilter;\nimport org.apache.lucene.analysis.TokenStream;\nimport org.apache.lucene.analysis.TokenStreamToAutomaton;\nimport org.apache.lucene.analysis.Tokenizer;\nimport org.apache.lucene.analysis.tokenattributes.CharTermAttribute;\nimport org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;\nimport org.apache.lucene.search.suggest.Lookup.LookupResult;\nimport org.apache.lucene.search.suggest.TermFreq;\nimport org.apache.lucene.search.suggest.TermFreqArrayIterator;\nimport org.apache.lucene.util.BytesRef;\nimport org.apache.lucene.util.IntsRef;\nimport org.apache.lucene.util.LuceneTestCase;\nimport org.apache.lucene.util._TestUtil;\nimport org.apache.lucene.util.automaton.Automaton;\nimport org.apache.lucene.util.automaton.State;\nimport org.apache.lucene.util.fst.Util;\n\npublic class FuzzySuggesterTest extends LuceneTestCase {\n \n public void testRandomEdits() throws IOException {\n List keys = new ArrayList();\n int numTerms = atLeast(100);\n for (int i = 0; i < numTerms; i++) {\n keys.add(new TermFreq(\"boo\" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));\n }\n keys.add(new TermFreq(\"foo bar boo far\", 12));\n FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));\n suggester.build(new TermFreqArrayIterator(keys));\n int numIters = atLeast(10);\n for (int i = 0; i < numIters; i++) {\n String addRandomEdit = addRandomEdit(\"foo bar boo\", FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX);\n List results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);\n assertEquals(addRandomEdit, 1, results.size());\n assertEquals(\"foo bar boo far\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F); \n }\n }\n \n /** this is basically the WFST test ported to KeywordAnalyzer. 
so it acts the same */\n public void testKeyword() throws Exception {\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"foo\", 50),\n new TermFreq(\"bar\", 10),\n new TermFreq(\"barbar\", 12),\n new TermFreq(\"barbara\", 6)\n };\n \n FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));\n suggester.build(new TermFreqArrayIterator(keys));\n \n List results = suggester.lookup(_TestUtil.stringToCharSequence(\"bariar\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n \n results = suggester.lookup(_TestUtil.stringToCharSequence(\"barbr\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n \n results = suggester.lookup(_TestUtil.stringToCharSequence(\"barbara\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbara\", results.get(0).key.toString());\n assertEquals(6, results.get(0).value, 0.01F);\n \n results = suggester.lookup(_TestUtil.stringToCharSequence(\"barbar\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"barbara\", results.get(1).key.toString());\n assertEquals(6, results.get(1).value, 0.01F);\n \n results = suggester.lookup(_TestUtil.stringToCharSequence(\"barbaa\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"barbara\", results.get(1).key.toString());\n assertEquals(6, results.get(1).value, 0.01F);\n \n // top N of 2, but only foo is available\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"f\", random()), false, 2);\n assertEquals(1, results.size());\n assertEquals(\"foo\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n \n // top N of 1 for 'bar': we return this even though\n // barbar is higher because exactFirst is enabled:\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"bar\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"bar\", results.get(0).key.toString());\n assertEquals(10, results.get(0).value, 0.01F);\n \n // top N Of 2 for 'b'\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"b\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"bar\", results.get(1).key.toString());\n assertEquals(10, results.get(1).value, 0.01F);\n \n // top N of 3 for 'ba'\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"ba\", random()), false, 3);\n assertEquals(3, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"bar\", results.get(1).key.toString());\n assertEquals(10, results.get(1).value, 0.01F);\n assertEquals(\"barbara\", results.get(2).key.toString());\n assertEquals(6, results.get(2).value, 0.01F);\n }\n \n /**\n * basic \"standardanalyzer\" test with stopword removal\n */\n public void testStandard() throws Exception {\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"the ghost of christmas past\", 50),\n };\n \n Analyzer standard = new MockAnalyzer(random(), 
MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);\n FuzzySuggester suggester = new FuzzySuggester(standard);\n suggester.setPreservePositionIncrements(false);\n suggester.build(new TermFreqArrayIterator(keys));\n \n List results = suggester.lookup(_TestUtil.stringToCharSequence(\"the ghost of chris\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"the ghost of christmas past\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n\n // omit the 'the' since its a stopword, its suggested anyway\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"ghost of chris\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"the ghost of christmas past\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n\n // omit the 'the' and 'of' since they are stopwords, its suggested anyway\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"ghost chris\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"the ghost of christmas past\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n }\n\n public void testNoSeps() throws Exception {\n TermFreq[] keys = new TermFreq[] {\n new TermFreq(\"ab cd\", 0),\n new TermFreq(\"abcd\", 1),\n };\n\n int options = 0;\n\n Analyzer a = new MockAnalyzer(random());\n FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3);\n suggester.build(new TermFreqArrayIterator(keys));\n // TODO: would be nice if \"ab \" would allow the test to\n // pass, and more generally if the analyzer can know\n // that the user's current query has ended at a word, \n // but, analyzers don't produce SEP tokens!\n List r = suggester.lookup(_TestUtil.stringToCharSequence(\"ab c\", random()), false, 2);\n assertEquals(2, r.size());\n\n // With no PRESERVE_SEPS specified, \"ab c\" should also\n // complete to \"abcd\", which has higher weight so should\n // appear first:\n assertEquals(\"abcd\", r.get(0).key.toString());\n }\n\n public void testGraphDups() throws Exception {\n\n final Analyzer analyzer = new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n int tokenStreamCounter = 0;\n final TokenStream[] tokenStreams = new TokenStream[] {\n new CannedTokenStream(new Token[] {\n token(\"wifi\",1,1),\n token(\"hotspot\",0,2),\n token(\"network\",1,1),\n token(\"is\",1,1),\n token(\"slow\",1,1)\n }),\n new CannedTokenStream(new Token[] {\n token(\"wi\",1,1),\n token(\"hotspot\",0,3),\n token(\"fi\",1,1),\n token(\"network\",1,1),\n token(\"is\",1,1),\n token(\"fast\",1,1)\n\n }),\n new CannedTokenStream(new Token[] {\n token(\"wifi\",1,1),\n token(\"hotspot\",0,2),\n token(\"network\",1,1)\n }),\n };\n\n @Override\n public TokenStream getTokenStream() {\n TokenStream result = tokenStreams[tokenStreamCounter];\n tokenStreamCounter++;\n return result;\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"wifi network is slow\", 50),\n new TermFreq(\"wi fi network is fast\", 10),\n };\n FuzzySuggester suggester = new FuzzySuggester(analyzer);\n suggester.build(new TermFreqArrayIterator(keys));\n \n List results = suggester.lookup(\"wifi network\", false, 10);\n if (VERBOSE) {\n 
System.out.println(\"Results: \" + results);\n }\n assertEquals(2, results.size());\n assertEquals(\"wifi network is slow\", results.get(0).key);\n assertEquals(50, results.get(0).value);\n assertEquals(\"wi fi network is fast\", results.get(1).key);\n assertEquals(10, results.get(1).value);\n }\n\n public void testEmpty() throws Exception {\n FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));\n suggester.build(new TermFreqArrayIterator(new TermFreq[0]));\n\n List result = suggester.lookup(\"a\", false, 20);\n assertTrue(result.isEmpty());\n }\n\n public void testInputPathRequired() throws Exception {\n\n // SynonymMap.Builder b = new SynonymMap.Builder(false);\n // b.add(new CharsRef(\"ab\"), new CharsRef(\"ba\"), true);\n // final SynonymMap map = b.build();\n\n // The Analyzer below mimics the functionality of the SynonymAnalyzer\n // using the above map, so that the suggest module does not need a dependency on the \n // synonym module \n\n final Analyzer analyzer = new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n int tokenStreamCounter = 0;\n final TokenStream[] tokenStreams = new TokenStream[] {\n new CannedTokenStream(new Token[] {\n token(\"ab\",1,1),\n token(\"ba\",0,1),\n token(\"xc\",1,1)\n }),\n new CannedTokenStream(new Token[] {\n token(\"ba\",1,1), \n token(\"xd\",1,1)\n }),\n new CannedTokenStream(new Token[] {\n token(\"ab\",1,1),\n token(\"ba\",0,1),\n token(\"x\",1,1)\n })\n };\n\n @Override\n public TokenStream getTokenStream() {\n TokenStream result = tokenStreams[tokenStreamCounter];\n tokenStreamCounter++;\n return result;\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"ab xc\", 50),\n new TermFreq(\"ba xd\", 50),\n };\n FuzzySuggester suggester = new FuzzySuggester(analyzer);\n suggester.build(new TermFreqArrayIterator(keys));\n List results = suggester.lookup(\"ab x\", false, 1);\n assertTrue(results.size() == 1);\n }\n\n private static Token token(String term, int posInc, int posLength) {\n final Token t = new Token(term, 0, 0);\n t.setPositionIncrement(posInc);\n t.setPositionLength(posLength);\n return t;\n }\n\n /*\n private void printTokens(final Analyzer analyzer, String input) throws IOException {\n System.out.println(\"Tokens for \" + input);\n TokenStream ts = analyzer.tokenStream(\"\", new StringReader(input));\n ts.reset();\n final TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);\n final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);\n final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);\n \n while(ts.incrementToken()) {\n termBytesAtt.fillBytesRef();\n System.out.println(String.format(\"%s,%s,%s\", termBytesAtt.getBytesRef().utf8ToString(), posIncAtt.getPositionIncrement(), posLengthAtt.getPositionLength())); \n }\n ts.end();\n ts.close();\n } \n */ \n\n private final Analyzer getUnusualAnalyzer() {\n return new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n\n int count;\n\n 
@Override\n public TokenStream getTokenStream() {\n // 4th time we are called, return tokens a b,\n // else just a:\n if (count++ != 3) {\n return new CannedTokenStream(new Token[] {\n token(\"a\", 1, 1),\n });\n } else {\n // After that \"a b\":\n return new CannedTokenStream(new Token[] {\n token(\"a\", 1, 1),\n token(\"b\", 1, 1),\n });\n }\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n }\n\n public void testExactFirst() throws Exception {\n\n Analyzer a = getUnusualAnalyzer();\n FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"x y\", 1),\n new TermFreq(\"x y z\", 3),\n new TermFreq(\"x\", 2),\n new TermFreq(\"z z z\", 20),\n }));\n\n //System.out.println(\"ALL: \" + suggester.lookup(\"x y\", false, 6));\n\n for(int topN=1;topN<6;topN++) {\n List results = suggester.lookup(\"x y\", false, topN);\n //System.out.println(\"topN=\" + topN + \" \" + results);\n\n assertEquals(Math.min(topN, 4), results.size());\n\n assertEquals(\"x y\", results.get(0).key);\n assertEquals(1, results.get(0).value);\n\n if (topN > 1) {\n assertEquals(\"z z z\", results.get(1).key);\n assertEquals(20, results.get(1).value);\n\n if (topN > 2) {\n assertEquals(\"x y z\", results.get(2).key);\n assertEquals(3, results.get(2).value);\n\n if (topN > 3) {\n assertEquals(\"x\", results.get(3).key);\n assertEquals(2, results.get(3).value);\n }\n }\n }\n }\n }\n\n public void testNonExactFirst() throws Exception {\n\n Analyzer a = getUnusualAnalyzer();\n FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);\n\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"x y\", 1),\n new TermFreq(\"x y z\", 3),\n new TermFreq(\"x\", 2),\n new TermFreq(\"z z z\", 20),\n }));\n\n for(int topN=1;topN<6;topN++) {\n List results = suggester.lookup(\"p\", false, topN);\n\n assertEquals(Math.min(topN, 4), results.size());\n\n assertEquals(\"z z z\", results.get(0).key);\n assertEquals(20, results.get(0).value);\n\n if (topN > 1) {\n assertEquals(\"x y z\", results.get(1).key);\n assertEquals(3, results.get(1).value);\n\n if (topN > 2) {\n assertEquals(\"x\", results.get(2).key);\n assertEquals(2, results.get(2).value);\n \n if (topN > 3) {\n assertEquals(\"x y\", results.get(3).key);\n assertEquals(1, results.get(3).value);\n }\n }\n }\n }\n }\n \n // Holds surface form separately:\n private static class TermFreq2 implements Comparable {\n public final String surfaceForm;\n public final String analyzedForm;\n public final long weight;\n\n public TermFreq2(String surfaceForm, String analyzedForm, long weight) {\n this.surfaceForm = surfaceForm;\n this.analyzedForm = analyzedForm;\n this.weight = weight;\n }\n\n @Override\n public int compareTo(TermFreq2 other) {\n int cmp = analyzedForm.compareTo(other.analyzedForm);\n if (cmp != 0) {\n return cmp;\n } else if (weight > other.weight) {\n return -1;\n } else if (weight < other.weight) {\n return 1;\n } else {\n assert false;\n return 0;\n }\n }\n }\n\n static boolean isStopChar(char ch, int numStopChars) {\n //System.out.println(\"IS? 
\" + ch + \": \" + (ch - 'a') + \": \" + ((ch - 'a') < numStopChars));\n return (ch - 'a') < numStopChars;\n }\n\n // Like StopFilter:\n private static class TokenEater extends TokenFilter {\n private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);\n private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);\n private final int numStopChars;\n private final boolean preserveHoles;\n private boolean first;\n\n public TokenEater(boolean preserveHoles, TokenStream in, int numStopChars) {\n super(in);\n this.preserveHoles = preserveHoles;\n this.numStopChars = numStopChars;\n }\n\n @Override\n public void reset() throws IOException {\n super.reset();\n first = true;\n }\n\n @Override\n public final boolean incrementToken() throws IOException {\n int skippedPositions = 0;\n while (input.incrementToken()) {\n if (termAtt.length() != 1 || !isStopChar(termAtt.charAt(0), numStopChars)) {\n int posInc = posIncrAtt.getPositionIncrement() + skippedPositions;\n if (first) {\n if (posInc == 0) {\n // first token having posinc=0 is illegal.\n posInc = 1;\n }\n first = false;\n }\n posIncrAtt.setPositionIncrement(posInc);\n //System.out.println(\"RETURN term=\" + termAtt + \" numStopChars=\" + numStopChars);\n return true;\n }\n if (preserveHoles) {\n skippedPositions += posIncrAtt.getPositionIncrement();\n }\n }\n\n return false;\n }\n }\n\n private static class MockTokenEatingAnalyzer extends Analyzer {\n private int numStopChars;\n private boolean preserveHoles;\n\n public MockTokenEatingAnalyzer(int numStopChars, boolean preserveHoles) {\n this.preserveHoles = preserveHoles;\n this.numStopChars = numStopChars;\n }\n\n @Override\n public TokenStreamComponents createComponents(String fieldName, Reader reader) {\n MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);\n tokenizer.setEnableChecks(true);\n TokenStream next;\n if (numStopChars != 0) {\n next = new TokenEater(preserveHoles, tokenizer, numStopChars);\n } else {\n next = tokenizer;\n }\n return new TokenStreamComponents(tokenizer, next);\n }\n }\n\n public void testRandom() throws Exception {\n\n int numQueries = atLeast(100);\n \n final List slowCompletor = new ArrayList();\n final TreeSet allPrefixes = new TreeSet();\n final Set seen = new HashSet();\n \n TermFreq[] keys = new TermFreq[numQueries];\n\n boolean preserveSep = random().nextBoolean();\n\n final int numStopChars = random().nextInt(10);\n final boolean preserveHoles = random().nextBoolean();\n\n if (VERBOSE) {\n System.out.println(\"TEST: \" + numQueries + \" words; preserveSep=\" + preserveSep + \" numStopChars=\" + numStopChars + \" preserveHoles=\" + preserveHoles);\n }\n \n for (int i = 0; i < numQueries; i++) {\n int numTokens = _TestUtil.nextInt(random(), 1, 4);\n String key;\n String analyzedKey;\n while(true) {\n key = \"\";\n analyzedKey = \"\";\n boolean lastRemoved = false;\n for(int token=0;token < numTokens;token++) {\n String s;\n while (true) {\n // TODO: would be nice to fix this slowCompletor/comparator to\n // use full range, but we might lose some coverage too...\n s = _TestUtil.randomSimpleString(random());\n if (s.length() > 0) {\n if (token > 0) {\n key += \" \";\n }\n if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length()-1) != ' ') {\n analyzedKey += \" \";\n }\n key += s;\n if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {\n if (preserveSep && preserveHoles) {\n analyzedKey 
+= '\\u0000';\n }\n lastRemoved = true;\n } else {\n analyzedKey += s;\n lastRemoved = false;\n }\n break;\n }\n }\n }\n\n analyzedKey = analyzedKey.replaceAll(\"(^| )\\u0000$\", \"\");\n\n if (preserveSep && lastRemoved) {\n analyzedKey += \" \";\n }\n\n // Don't add same surface form more than once:\n if (!seen.contains(key)) {\n seen.add(key);\n break;\n }\n }\n\n for (int j = 1; j < key.length(); j++) {\n allPrefixes.add(key.substring(0, j));\n }\n // we can probably do Integer.MAX_VALUE here, but why worry.\n int weight = random().nextInt(1<<24);\n keys[i] = new TermFreq(key, weight);\n\n slowCompletor.add(new TermFreq2(key, analyzedKey, weight));\n }\n\n if (VERBOSE) {\n // Don't just sort original list, to avoid VERBOSE\n // altering the test:\n List sorted = new ArrayList(slowCompletor);\n Collections.sort(sorted);\n for(TermFreq2 ent : sorted) {\n System.out.println(\" surface='\" + ent.surfaceForm + \" analyzed='\" + ent.analyzedForm + \"' weight=\" + ent.weight);\n }\n }\n\n Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);\n FuzzySuggester suggester = new FuzzySuggester(a, a,\n preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3);\n suggester.build(new TermFreqArrayIterator(keys));\n\n for (String prefix : allPrefixes) {\n\n if (VERBOSE) {\n System.out.println(\"\\nTEST: prefix=\" + prefix);\n }\n\n final int topN = _TestUtil.nextInt(random(), 1, 10);\n List r = suggester.lookup(_TestUtil.stringToCharSequence(prefix, random()), false, topN);\n\n // 2. go thru whole set to find suggestions:\n List matches = new ArrayList();\n\n // \"Analyze\" the key:\n String[] tokens = prefix.split(\" \");\n StringBuilder builder = new StringBuilder();\n boolean lastRemoved = false;\n for(int i=0;i 0 && !builder.toString().endsWith(\" \")) {\n builder.append(' ');\n }\n\n if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) {\n if (preserveSep && preserveHoles) {\n builder.append(\"\\u0000\");\n }\n lastRemoved = true;\n } else {\n builder.append(token);\n lastRemoved = false;\n }\n }\n\n String analyzedKey = builder.toString();\n\n // Remove trailing sep/holes (TokenStream.end() does\n // not tell us any trailing holes, yet ... there is an\n // issue open for this):\n while (true) {\n String s = analyzedKey.replaceAll(\"(^| )\\u0000$\", \"\");\n s = s.replaceAll(\"\\\\s+$\", \"\");\n if (s.equals(analyzedKey)) {\n break;\n }\n analyzedKey = s;\n }\n\n if (analyzedKey.length() == 0) {\n // Currently suggester can't suggest from the empty\n // string! You get no results, not all results...\n continue;\n }\n\n if (preserveSep && (prefix.endsWith(\" \") || lastRemoved)) {\n analyzedKey += \" \";\n }\n\n if (VERBOSE) {\n System.out.println(\" analyzed: \" + analyzedKey);\n }\n TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();\n\n // NOTE: not great that we ask the suggester to give\n // us the \"answer key\" (ie maybe we have a bug in\n // suggester.toLevA ...) ... but testRandom2() fixes\n // this:\n Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey));\n assertTrue(automaton.isDeterministic());\n // TODO: could be faster... 
but its slowCompletor for a reason\n BytesRef spare = new BytesRef();\n for (TermFreq2 e : slowCompletor) {\n spare.copyChars(e.analyzedForm);\n Set finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton);\n for (IntsRef intsRef : finiteStrings) {\n State p = automaton.getInitialState();\n BytesRef ref = Util.toBytesRef(intsRef, spare);\n boolean added = false;\n for (int i = ref.offset; i < ref.length; i++) {\n State q = p.step(ref.bytes[i] & 0xff);\n if (q == null) {\n break;\n } else if (q.isAccept()) {\n matches.add(new LookupResult(e.surfaceForm, e.weight));\n added = true;\n break;\n }\n p = q;\n }\n if (!added && p.isAccept()) {\n matches.add(new LookupResult(e.surfaceForm, e.weight));\n } \n }\n }\n\n assertTrue(numStopChars > 0 || matches.size() > 0);\n\n if (matches.size() > 1) {\n Collections.sort(matches, new Comparator() {\n @Override\n public int compare(LookupResult left, LookupResult right) {\n int cmp = Float.compare(right.value, left.value);\n if (cmp == 0) {\n return left.compareTo(right);\n } else {\n return cmp;\n }\n }\n });\n }\n\n if (matches.size() > topN) {\n matches = matches.subList(0, topN);\n }\n\n if (VERBOSE) {\n System.out.println(\" expected:\");\n for(LookupResult lr : matches) {\n System.out.println(\" key=\" + lr.key + \" weight=\" + lr.value);\n }\n\n System.out.println(\" actual:\");\n for(LookupResult lr : r) {\n System.out.println(\" key=\" + lr.key + \" weight=\" + lr.value);\n }\n }\n \n assertEquals(prefix + \" \" + topN, matches.size(), r.size());\n for(int hit=0;hit keys = Arrays.asList(new TermFreq[] {\n new TermFreq(\"a\", 40),\n new TermFreq(\"a \", 50),\n new TermFreq(\" a\", 60),\n });\n\n Collections.shuffle(keys, random());\n suggester.build(new TermFreqArrayIterator(keys));\n\n List results = suggester.lookup(\"a\", false, 5);\n assertEquals(2, results.size());\n assertEquals(\" a\", results.get(0).key);\n assertEquals(60, results.get(0).value);\n assertEquals(\"a \", results.get(1).key);\n assertEquals(50, results.get(1).value);\n }\n\n public void testEditSeps() throws Exception {\n Analyzer a = new MockAnalyzer(random());\n FuzzySuggester suggester = new FuzzySuggester(a, a, FuzzySuggester.PRESERVE_SEP, 2, -1, 2, true, 1, 3);\n\n List keys = Arrays.asList(new TermFreq[] {\n new TermFreq(\"foo bar\", 40),\n new TermFreq(\"foo bar baz\", 50),\n new TermFreq(\"barbaz\", 60),\n new TermFreq(\"barbazfoo\", 10),\n });\n\n Collections.shuffle(keys, random());\n suggester.build(new TermFreqArrayIterator(keys));\n\n assertEquals(\"[foo bar baz/50, foo bar/40]\", suggester.lookup(\"foobar\", false, 5).toString());\n assertEquals(\"[foo bar baz/50]\", suggester.lookup(\"foobarbaz\", false, 5).toString());\n assertEquals(\"[barbaz/60, barbazfoo/10]\", suggester.lookup(\"bar baz\", false, 5).toString());\n assertEquals(\"[barbazfoo/10]\", suggester.lookup(\"bar baz foo\", false, 5).toString());\n }\n \n @SuppressWarnings(\"fallthrough\")\n private static String addRandomEdit(String string, int prefixLength) {\n char[] input = string.toCharArray();\n StringBuilder builder = new StringBuilder();\n for (int i = 0; i < input.length; i++) {\n if (i >= prefixLength && random().nextBoolean() && i < input.length-1) {\n switch(random().nextInt(4)) {\n case 3:\n if (i < input.length-1) {\n // Transpose input[i] and input[1+i]:\n builder.append(input[i+1]);\n builder.append(input[i]);\n for(int j=i+2;j answers = new ArrayList();\n final Set seen = new HashSet();\n for(int i=0;i() {\n @Override\n public int compare(TermFreq a, TermFreq b) 
{\n return a.term.compareTo(b.term);\n }\n });\n if (VERBOSE) {\n System.out.println(\"\\nTEST: targets\");\n for(TermFreq tf : answers) {\n System.out.println(\" \" + tf.term.utf8ToString() + \" freq=\" + tf.v);\n }\n }\n\n Analyzer a = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);\n int maxEdits = random().nextBoolean() ? 1 : 2;\n int prefixLen = random().nextInt(4);\n boolean transpositions = random().nextBoolean();\n // TODO: test graph analyzers\n // TODO: test exactFirst / preserveSep permutations\n FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, prefixLen);\n\n if (VERBOSE) {\n System.out.println(\"TEST: maxEdits=\" + maxEdits + \" prefixLen=\" + prefixLen + \" transpositions=\" + transpositions + \" num=\" + NUM);\n }\n\n Collections.shuffle(answers, random());\n suggest.build(new TermFreqArrayIterator(answers.toArray(new TermFreq[answers.size()])));\n\n final int ITERS = atLeast(100);\n for(int iter=0;iter expected = slowFuzzyMatch(prefixLen, maxEdits, transpositions, answers, frag);\n if (VERBOSE) {\n System.out.println(\" expected: \" + expected.size());\n for(LookupResult c : expected) {\n System.out.println(\" \" + c);\n }\n }\n final List actual = suggest.lookup(frag, false, NUM);\n if (VERBOSE) {\n System.out.println(\" actual: \" + actual.size());\n for(LookupResult c : actual) {\n System.out.println(\" \" + c);\n }\n }\n\n Collections.sort(actual, new CompareByCostThenAlpha());\n\n final int limit = Math.min(expected.size(), actual.size());\n for(int ans=0;ans slowFuzzyMatch(int prefixLen, int maxEdits, boolean allowTransposition, List answers, String frag) {\n final List results = new ArrayList();\n final int fragLen = frag.length();\n for(TermFreq tf : answers) {\n //System.out.println(\" check s=\" + tf.term.utf8ToString());\n boolean prefixMatches = true;\n for(int i=0;i= fragLen-maxEdits) {\n // OK it's possible:\n //System.out.println(\" possible\");\n int d;\n final String s = tf.term.utf8ToString();\n if (fragLen == prefixLen) {\n d = 0;\n } else if (false && len < fragLen) {\n d = getDistance(frag, s, allowTransposition);\n } else {\n //System.out.println(\" try loop\");\n d = maxEdits + 1;\n //for(int ed=-maxEdits;ed<=maxEdits;ed++) {\n for(int ed=-maxEdits;ed<=maxEdits;ed++) {\n if (s.length() < fragLen - ed) {\n continue;\n }\n String check = s.substring(0, fragLen-ed);\n d = getDistance(frag, check, allowTransposition);\n //System.out.println(\" sub check s=\" + check + \" d=\" + d);\n if (d <= maxEdits) {\n break;\n }\n }\n }\n if (d <= maxEdits) {\n results.add(new LookupResult(tf.term.utf8ToString(), tf.v));\n }\n }\n }\n\n Collections.sort(results, new CompareByCostThenAlpha());\n }\n\n return results;\n }\n\n private static class CharSequenceComparator implements Comparator {\n\n @Override\n public int compare(CharSequence o1, CharSequence o2) {\n final int l1 = o1.length();\n final int l2 = o2.length();\n \n final int aStop = Math.min(l1, l2);\n for (int i = 0; i < aStop; i++) {\n int diff = o1.charAt(i) - o2.charAt(i);\n if (diff != 0) {\n return diff;\n }\n }\n // One is a prefix of the other, or, they are equal:\n return l1 - l2;\n }\n }\n\n private static final Comparator CHARSEQUENCE_COMPARATOR = new CharSequenceComparator();\n\n public class CompareByCostThenAlpha implements Comparator {\n @Override\n public int compare(LookupResult a, LookupResult b) {\n if (a.value > b.value) {\n return -1;\n } else if (a.value < b.value) {\n return 1;\n } else {\n final int c = 
CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);\n assert c != 0: \"term=\" + a.key;\n return c;\n }\n }\n }\n\n // NOTE: copied from\n // modules/suggest/src/java/org/apache/lucene/search/spell/LuceneLevenshteinDistance.java\n // and tweaked to return the edit distance not the float\n // lucene measure\n\n /* Finds unicode (code point) Levenstein (edit) distance\n * between two strings, including transpositions. */\n public int getDistance(String target, String other, boolean allowTransposition) {\n IntsRef targetPoints;\n IntsRef otherPoints;\n int n;\n int d[][]; // cost array\n \n // NOTE: if we cared, we could 3*m space instead of m*n space, similar to \n // what LevenshteinDistance does, except cycling thru a ring of three \n // horizontal cost arrays... but this comparator is never actually used by \n // DirectSpellChecker, its only used for merging results from multiple shards \n // in \"distributed spellcheck\", and its inefficient in other ways too...\n\n // cheaper to do this up front once\n targetPoints = toIntsRef(target);\n otherPoints = toIntsRef(other);\n n = targetPoints.length;\n final int m = otherPoints.length;\n d = new int[n+1][m+1];\n \n if (n == 0 || m == 0) {\n if (n == m) {\n return 0;\n }\n else {\n return Math.max(n, m);\n }\n } \n\n // indexes into strings s and t\n int i; // iterates through s\n int j; // iterates through t\n\n int t_j; // jth character of t\n\n int cost; // cost\n\n for (i = 0; i<=n; i++) {\n d[i][0] = i;\n }\n \n for (j = 0; j<=m; j++) {\n d[0][j] = j;\n }\n\n for (j = 1; j<=m; j++) {\n t_j = otherPoints.ints[j-1];\n\n for (i=1; i<=n; i++) {\n cost = targetPoints.ints[i-1]==t_j ? 0 : 1;\n // minimum of cell to the left+1, to the top+1, diagonally left and up +cost\n d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);\n // transposition\n if (allowTransposition && i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {\n d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);\n }\n }\n }\n \n return d[n][m];\n }\n \n private static IntsRef toIntsRef(String s) {\n IntsRef ref = new IntsRef(s.length()); // worst case\n int utf16Len = s.length();\n for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {\n cp = ref.ints[ref.length++] = Character.codePointAt(s, i);\n }\n return ref;\n }\n}\n =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (revision f81056da25f3671b9807c4a51d6b985389fe916e) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (revision ) @@ -17,17 +17,6 @@ * limitations under the License. 
*/ -import java.io.IOException; -import java.io.Reader; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.TreeSet; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CannedTokenStream; import org.apache.lucene.analysis.MockAnalyzer; @@ -36,7 +25,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.analysis.TokenStreamToUnicodeAutomaton; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -51,8 +40,19 @@ import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.fst.Util; +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + public class FuzzySuggesterTest extends LuceneTestCase { - + public void testRandomEdits() throws IOException { List keys = new ArrayList(); int numTerms = atLeast(100); @@ -68,64 +68,87 @@ List results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2); assertEquals(addRandomEdit, 1, results.size()); assertEquals("foo bar boo far", results.get(0).key.toString()); - assertEquals(12, results.get(0).value, 0.01F); + assertEquals(12, results.get(0).value, 0.01F); } } - + - /** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */ + public void testNonLatinRandomEdits() throws IOException { + List keys = new ArrayList(); + int numTerms = atLeast(100); + for (int i = 0; i < numTerms; i++) { + keys.add(new TermFreq("буу" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100))); + } + keys.add(new TermFreq("фуу бар буу \u00ff фар", 12)); + MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false); + FuzzySuggester suggester = new FuzzySuggester(analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS, + 0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH); + suggester.build(new TermFreqArrayIterator(keys)); + int numIters = atLeast(10); + for (int i = 0; i < numIters; i++) { + String addRandomEdit = addRandomEdit("фуу бар буу", 0); + List results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2); + assertEquals(addRandomEdit, 1, results.size()); + assertEquals("фуу бар буу \u00ff фар", results.get(0).key.toString()); + assertEquals(12, results.get(0).value, 0.01F); + } + } + + /** + * this is basically the WFST test ported to KeywordAnalyzer. 
so it acts the same + */ public void testKeyword() throws Exception { - TermFreq keys[] = new TermFreq[] { + TermFreq keys[] = new TermFreq[]{ new TermFreq("foo", 50), new TermFreq("bar", 10), new TermFreq("barbar", 12), new TermFreq("barbara", 6) }; - + FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); suggester.build(new TermFreqArrayIterator(keys)); - + List results = suggester.lookup(_TestUtil.stringToCharSequence("bariar", random()), false, 2); assertEquals(2, results.size()); assertEquals("barbar", results.get(0).key.toString()); assertEquals(12, results.get(0).value, 0.01F); - + results = suggester.lookup(_TestUtil.stringToCharSequence("barbr", random()), false, 2); assertEquals(2, results.size()); assertEquals("barbar", results.get(0).key.toString()); assertEquals(12, results.get(0).value, 0.01F); - + results = suggester.lookup(_TestUtil.stringToCharSequence("barbara", random()), false, 2); assertEquals(2, results.size()); assertEquals("barbara", results.get(0).key.toString()); assertEquals(6, results.get(0).value, 0.01F); - + results = suggester.lookup(_TestUtil.stringToCharSequence("barbar", random()), false, 2); assertEquals(2, results.size()); assertEquals("barbar", results.get(0).key.toString()); assertEquals(12, results.get(0).value, 0.01F); assertEquals("barbara", results.get(1).key.toString()); assertEquals(6, results.get(1).value, 0.01F); - + results = suggester.lookup(_TestUtil.stringToCharSequence("barbaa", random()), false, 2); assertEquals(2, results.size()); assertEquals("barbar", results.get(0).key.toString()); assertEquals(12, results.get(0).value, 0.01F); assertEquals("barbara", results.get(1).key.toString()); assertEquals(6, results.get(1).value, 0.01F); - + // top N of 2, but only foo is available results = suggester.lookup(_TestUtil.stringToCharSequence("f", random()), false, 2); assertEquals(1, results.size()); assertEquals("foo", results.get(0).key.toString()); assertEquals(50, results.get(0).value, 0.01F); - + // top N of 1 for 'bar': we return this even though // barbar is higher because exactFirst is enabled: results = suggester.lookup(_TestUtil.stringToCharSequence("bar", random()), false, 1); assertEquals(1, results.size()); assertEquals("bar", results.get(0).key.toString()); assertEquals(10, results.get(0).value, 0.01F); - + // top N Of 2 for 'b' results = suggester.lookup(_TestUtil.stringToCharSequence("b", random()), false, 2); assertEquals(2, results.size()); @@ -133,7 +156,7 @@ assertEquals(12, results.get(0).value, 0.01F); assertEquals("bar", results.get(1).key.toString()); assertEquals(10, results.get(1).value, 0.01F); - + // top N of 3 for 'ba' results = suggester.lookup(_TestUtil.stringToCharSequence("ba", random()), false, 3); assertEquals(3, results.size()); @@ -144,20 +167,20 @@ assertEquals("barbara", results.get(2).key.toString()); assertEquals(6, results.get(2).value, 0.01F); } - + /** * basic "standardanalyzer" test with stopword removal */ public void testStandard() throws Exception { - TermFreq keys[] = new TermFreq[] { + TermFreq keys[] = new TermFreq[]{ new TermFreq("the ghost of christmas past", 50), }; - + Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET); FuzzySuggester suggester = new FuzzySuggester(standard); suggester.setPreservePositionIncrements(false); suggester.build(new TermFreqArrayIterator(keys)); - + List results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1); 
assertEquals(1, results.size()); assertEquals("the ghost of christmas past", results.get(0).key.toString()); @@ -177,9 +200,9 @@ } public void testNoSeps() throws Exception { - TermFreq[] keys = new TermFreq[] { + TermFreq[] keys = new TermFreq[]{ - new TermFreq("ab cd", 0), - new TermFreq("abcd", 1), + new TermFreq("ab cd", 0), + new TermFreq("abcd", 1), }; int options = 0; @@ -206,30 +229,30 @@ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); - + return new TokenStreamComponents(tokenizer) { int tokenStreamCounter = 0; - final TokenStream[] tokenStreams = new TokenStream[] { + final TokenStream[] tokenStreams = new TokenStream[]{ - new CannedTokenStream(new Token[] { + new CannedTokenStream(new Token[]{ - token("wifi",1,1), + token("wifi", 1, 1), - token("hotspot",0,2), + token("hotspot", 0, 2), - token("network",1,1), + token("network", 1, 1), - token("is",1,1), + token("is", 1, 1), - token("slow",1,1) + token("slow", 1, 1) }), - new CannedTokenStream(new Token[] { + new CannedTokenStream(new Token[]{ - token("wi",1,1), + token("wi", 1, 1), - token("hotspot",0,3), + token("hotspot", 0, 3), - token("fi",1,1), + token("fi", 1, 1), - token("network",1,1), + token("network", 1, 1), - token("is",1,1), + token("is", 1, 1), - token("fast",1,1) + token("fast", 1, 1) }), - new CannedTokenStream(new Token[] { + new CannedTokenStream(new Token[]{ - token("wifi",1,1), + token("wifi", 1, 1), - token("hotspot",0,2), + token("hotspot", 0, 2), - token("network",1,1) + token("network", 1, 1) }), }; @@ -239,7 +262,7 @@ tokenStreamCounter++; return result; } - + @Override protected void setReader(final Reader reader) throws IOException { } @@ -247,13 +270,13 @@ } }; - TermFreq keys[] = new TermFreq[] { + TermFreq keys[] = new TermFreq[]{ new TermFreq("wifi network is slow", 50), new TermFreq("wi fi network is fast", 10), }; FuzzySuggester suggester = new FuzzySuggester(analyzer); suggester.build(new TermFreqArrayIterator(keys)); - + List results = suggester.lookup("wifi network", false, 10); if (VERBOSE) { System.out.println("Results: " + results); @@ -287,23 +310,23 @@ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); - + return new TokenStreamComponents(tokenizer) { int tokenStreamCounter = 0; - final TokenStream[] tokenStreams = new TokenStream[] { + final TokenStream[] tokenStreams = new TokenStream[]{ - new CannedTokenStream(new Token[] { + new CannedTokenStream(new Token[]{ - token("ab",1,1), + token("ab", 1, 1), - token("ba",0,1), + token("ba", 0, 1), - token("xc",1,1) + token("xc", 1, 1) }), - new CannedTokenStream(new Token[] { + new CannedTokenStream(new Token[]{ - token("ba",1,1), + token("ba", 1, 1), - token("xd",1,1) + token("xd", 1, 1) }), - new CannedTokenStream(new Token[] { + new CannedTokenStream(new Token[]{ - token("ab",1,1), + token("ab", 1, 1), - token("ba",0,1), + token("ba", 0, 1), - token("x",1,1) + token("x", 1, 1) }) }; @@ -313,7 +336,7 @@ tokenStreamCounter++; return result; } - + @Override protected void setReader(final Reader reader) throws IOException { } @@ -321,7 +344,7 @@ } }; - TermFreq keys[] = new TermFreq[] { + TermFreq keys[] = new TermFreq[]{ new TermFreq("ab xc", 50), new TermFreq("ba xd", 50), }; @@ -354,14 +377,14 @@ ts.end(); ts.close(); } - */ + */ private final Analyzer getUnusualAnalyzer() { return new Analyzer() { 
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); - + return new TokenStreamComponents(tokenizer) { int count; @@ -371,18 +394,18 @@ // 4th time we are called, return tokens a b, // else just a: if (count++ != 3) { - return new CannedTokenStream(new Token[] { + return new CannedTokenStream(new Token[]{ token("a", 1, 1), - }); + }); } else { // After that "a b": - return new CannedTokenStream(new Token[] { + return new CannedTokenStream(new Token[]{ token("a", 1, 1), token("b", 1, 1), - }); + }); } } - + @Override protected void setReader(final Reader reader) throws IOException { } @@ -395,16 +418,16 @@ Analyzer a = getUnusualAnalyzer(); FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3); - suggester.build(new TermFreqArrayIterator(new TermFreq[] { + suggester.build(new TermFreqArrayIterator(new TermFreq[]{ - new TermFreq("x y", 1), - new TermFreq("x y z", 3), - new TermFreq("x", 2), - new TermFreq("z z z", 20), - })); + new TermFreq("x y", 1), + new TermFreq("x y z", 3), + new TermFreq("x", 2), + new TermFreq("z z z", 20), + })); //System.out.println("ALL: " + suggester.lookup("x y", false, 6)); - for(int topN=1;topN<6;topN++) { + for (int topN = 1; topN < 6; topN++) { List results = suggester.lookup("x y", false, topN); //System.out.println("topN=" + topN + " " + results); @@ -435,14 +458,14 @@ Analyzer a = getUnusualAnalyzer(); FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3); - suggester.build(new TermFreqArrayIterator(new TermFreq[] { + suggester.build(new TermFreqArrayIterator(new TermFreq[]{ - new TermFreq("x y", 1), - new TermFreq("x y z", 3), - new TermFreq("x", 2), - new TermFreq("z z z", 20), - })); + new TermFreq("x y", 1), + new TermFreq("x y z", 3), + new TermFreq("x", 2), + new TermFreq("z z z", 20), + })); - for(int topN=1;topN<6;topN++) { + for (int topN = 1; topN < 6; topN++) { List results = suggester.lookup("p", false, topN); assertEquals(Math.min(topN, 4), results.size()); @@ -457,7 +480,7 @@ if (topN > 2) { assertEquals("x", results.get(2).key); assertEquals(2, results.get(2).value); - + if (topN > 3) { assertEquals("x y", results.get(3).key); assertEquals(1, results.get(3).value); @@ -466,7 +489,7 @@ } } } - + // Holds surface form separately: private static class TermFreq2 implements Comparable { public final String surfaceForm; @@ -572,11 +595,11 @@ public void testRandom() throws Exception { int numQueries = atLeast(100); - + final List slowCompletor = new ArrayList(); final TreeSet allPrefixes = new TreeSet(); final Set seen = new HashSet(); - + TermFreq[] keys = new TermFreq[numQueries]; boolean preserveSep = random().nextBoolean(); @@ -587,16 +610,16 @@ if (VERBOSE) { System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles); } - + for (int i = 0; i < numQueries; i++) { int numTokens = _TestUtil.nextInt(random(), 1, 4); String key; String analyzedKey; - while(true) { + while (true) { key = ""; analyzedKey = ""; boolean lastRemoved = false; - for(int token=0;token < numTokens;token++) { + for (int token = 0; token < numTokens; token++) { String s; while (true) { // TODO: would be nice to fix this slowCompletor/comparator to @@ -606,7 +629,7 @@ if (token > 0) { key += " "; } - if (preserveSep && 
analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length()-1) != ' ') { + if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length() - 1) != ' ') { analyzedKey += " "; } key += s; @@ -641,7 +664,7 @@ allPrefixes.add(key.substring(0, j)); } // we can probably do Integer.MAX_VALUE here, but why worry. - int weight = random().nextInt(1<<24); + int weight = random().nextInt(1 << 24); keys[i] = new TermFreq(key, weight); slowCompletor.add(new TermFreq2(key, analyzedKey, weight)); @@ -652,14 +675,14 @@ // altering the test: List sorted = new ArrayList(slowCompletor); Collections.sort(sorted); - for(TermFreq2 ent : sorted) { + for (TermFreq2 ent : sorted) { System.out.println(" surface='" + ent.surfaceForm + " analyzed='" + ent.analyzedForm + "' weight=" + ent.weight); } } Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles); FuzzySuggester suggester = new FuzzySuggester(a, a, - preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3); + preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3); suggester.build(new TermFreqArrayIterator(keys)); for (String prefix : allPrefixes) { @@ -678,7 +701,7 @@ String[] tokens = prefix.split(" "); StringBuilder builder = new StringBuilder(); boolean lastRemoved = false; - for(int i=0;i 0 && !builder.toString().endsWith(" ")) { builder.append(' '); @@ -722,7 +745,7 @@ if (VERBOSE) { System.out.println(" analyzed: " + analyzedKey); } - TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton(); + TokenStreamToUnicodeAutomaton tokenStreamToUnicodeAutomaton = suggester.getTokenStreamToUnicodeAutomaton(); // NOTE: not great that we ask the suggester to give // us the "answer key" (ie maybe we have a bug in @@ -734,7 +757,7 @@ BytesRef spare = new BytesRef(); for (TermFreq2 e : slowCompletor) { spare.copyChars(e.analyzedForm); - Set finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton); + Set finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToUnicodeAutomaton); for (IntsRef intsRef : finiteStrings) { State p = automaton.getInitialState(); BytesRef ref = Util.toBytesRef(intsRef, spare); @@ -752,7 +775,7 @@ } if (!added && p.isAccept()) { matches.add(new LookupResult(e.surfaceForm, e.weight)); - } + } } } @@ -760,16 +783,16 @@ if (matches.size() > 1) { Collections.sort(matches, new Comparator() { - @Override - public int compare(LookupResult left, LookupResult right) { - int cmp = Float.compare(right.value, left.value); - if (cmp == 0) { - return left.compareTo(right); - } else { - return cmp; - } - } - }); + @Override + public int compare(LookupResult left, LookupResult right) { + int cmp = Float.compare(right.value, left.value); + if (cmp == 0) { + return left.compareTo(right); + } else { + return cmp; + } + } + }); } if (matches.size() > topN) { @@ -778,18 +801,18 @@ if (VERBOSE) { System.out.println(" expected:"); - for(LookupResult lr : matches) { + for (LookupResult lr : matches) { System.out.println(" key=" + lr.key + " weight=" + lr.value); } System.out.println(" actual:"); - for(LookupResult lr : r) { + for (LookupResult lr : r) { System.out.println(" key=" + lr.key + " weight=" + lr.value); } } - + assertEquals(prefix + " " + topN, matches.size(), r.size()); - for(int hit=0;hit keys = Arrays.asList(new TermFreq[] { + List keys = Arrays.asList(new TermFreq[]{ new TermFreq("a", 40), new TermFreq("a ", 50), new TermFreq(" a", 60), - }); + }); Collections.shuffle(keys, random()); suggester.build(new 
TermFreqArrayIterator(keys)); @@ -822,12 +845,12 @@ Analyzer a = new MockAnalyzer(random()); FuzzySuggester suggester = new FuzzySuggester(a, a, FuzzySuggester.PRESERVE_SEP, 2, -1, 2, true, 1, 3); - List keys = Arrays.asList(new TermFreq[] { + List keys = Arrays.asList(new TermFreq[]{ new TermFreq("foo bar", 40), new TermFreq("foo bar baz", 50), new TermFreq("barbaz", 60), new TermFreq("barbazfoo", 10), - }); + }); Collections.shuffle(keys, random()); suggester.build(new TermFreqArrayIterator(keys)); @@ -837,20 +860,20 @@ assertEquals("[barbaz/60, barbazfoo/10]", suggester.lookup("bar baz", false, 5).toString()); assertEquals("[barbazfoo/10]", suggester.lookup("bar baz foo", false, 5).toString()); } - + @SuppressWarnings("fallthrough") private static String addRandomEdit(String string, int prefixLength) { char[] input = string.toCharArray(); StringBuilder builder = new StringBuilder(); for (int i = 0; i < input.length; i++) { - if (i >= prefixLength && random().nextBoolean() && i < input.length-1) { + if (i >= prefixLength && random().nextBoolean() && i < input.length - 1) { - switch(random().nextInt(4)) { + switch (random().nextInt(4)) { case 3: - if (i < input.length-1) { + if (i < input.length - 1) { // Transpose input[i] and input[1+i]: - builder.append(input[i+1]); + builder.append(input[i + 1]); builder.append(input[i]); - for(int j=i+2;j answers = new ArrayList(); final Set seen = new HashSet(); - for(int i=0;i() { - @Override - public int compare(TermFreq a, TermFreq b) { - return a.term.compareTo(b.term); - } - }); + @Override + public int compare(TermFreq a, TermFreq b) { + return a.term.compareTo(b.term); + } + }); if (VERBOSE) { System.out.println("\nTEST: targets"); - for(TermFreq tf : answers) { + for (TermFreq tf : answers) { System.out.println(" " + tf.term.utf8ToString() + " freq=" + tf.v); } } @@ -943,7 +966,7 @@ suggest.build(new TermFreqArrayIterator(answers.toArray(new TermFreq[answers.size()]))); final int ITERS = atLeast(100); - for(int iter=0;iter actual = suggest.lookup(frag, false, NUM); if (VERBOSE) { System.out.println(" actual: " + actual.size()); - for(LookupResult c : actual) { + for (LookupResult c : actual) { System.out.println(" " + c); } } @@ -966,13 +989,13 @@ Collections.sort(actual, new CompareByCostThenAlpha()); final int limit = Math.min(expected.size(), actual.size()); - for(int ans=0;ans slowFuzzyMatch(int prefixLen, int maxEdits, boolean allowTransposition, List answers, String frag) { final List results = new ArrayList(); final int fragLen = frag.length(); - for(TermFreq tf : answers) { + for (TermFreq tf : answers) { //System.out.println(" check s=" + tf.term.utf8ToString()); boolean prefixMatches = true; - for(int i=0;i= fragLen-maxEdits) { + if (len >= fragLen - maxEdits) { // OK it's possible: //System.out.println(" possible"); int d; @@ -1012,11 +1035,11 @@ //System.out.println(" try loop"); d = maxEdits + 1; //for(int ed=-maxEdits;ed<=maxEdits;ed++) { - for(int ed=-maxEdits;ed<=maxEdits;ed++) { + for (int ed = -maxEdits; ed <= maxEdits; ed++) { if (s.length() < fragLen - ed) { continue; } - String check = s.substring(0, fragLen-ed); + String check = s.substring(0, fragLen - ed); d = getDistance(frag, check, allowTransposition); //System.out.println(" sub check s=" + check + " d=" + d); if (d <= maxEdits) { @@ -1042,7 +1065,7 @@ public int compare(CharSequence o1, CharSequence o2) { final int l1 = o1.length(); final int l2 = o2.length(); - + final int aStop = Math.min(l1, l2); for (int i = 0; i < aStop; i++) { int diff = o1.charAt(i) - 
o2.charAt(i); @@ -1066,7 +1089,7 @@ return 1; } else { final int c = CHARSEQUENCE_COMPARATOR.compare(a.key, b.key); - assert c != 0: "term=" + a.key; + assert c != 0 : "term=" + a.key; return c; } } @@ -1084,7 +1107,7 @@ IntsRef otherPoints; int n; int d[][]; // cost array - + // NOTE: if we cared, we could 3*m space instead of m*n space, similar to // what LevenshteinDistance does, except cycling thru a ring of three // horizontal cost arrays... but this comparator is never actually used by @@ -1096,16 +1119,15 @@ otherPoints = toIntsRef(other); n = targetPoints.length; final int m = otherPoints.length; - d = new int[n+1][m+1]; + d = new int[n + 1][m + 1]; - + if (n == 0 || m == 0) { if (n == m) { return 0; - } - else { + } else { return Math.max(n, m); } - } + } // indexes into strings s and t int i; // iterates through s @@ -1115,31 +1137,31 @@ int cost; // cost - for (i = 0; i<=n; i++) { + for (i = 0; i <= n; i++) { d[i][0] = i; } - + - for (j = 0; j<=m; j++) { + for (j = 0; j <= m; j++) { d[0][j] = j; } - for (j = 1; j<=m; j++) { + for (j = 1; j <= m; j++) { - t_j = otherPoints.ints[j-1]; + t_j = otherPoints.ints[j - 1]; - for (i=1; i<=n; i++) { + for (i = 1; i <= n; i++) { - cost = targetPoints.ints[i-1]==t_j ? 0 : 1; + cost = targetPoints.ints[i - 1] == t_j ? 0 : 1; // minimum of cell to the left+1, to the top+1, diagonally left and up +cost - d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost); + d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost); // transposition - if (allowTransposition && i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) { + if (allowTransposition && i > 1 && j > 1 && targetPoints.ints[i - 1] == otherPoints.ints[j - 2] && targetPoints.ints[i - 2] == otherPoints.ints[j - 1]) { - d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost); + d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost); } } } - + return d[n][m]; } - + private static IntsRef toIntsRef(String s) { IntsRef ref = new IntsRef(s.length()); // worst case int utf16Len = s.length(); Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToUnicodeAutomaton.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToUnicodeAutomaton.java (revision ) +++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToUnicodeAutomaton.java (revision ) @@ -0,0 +1,257 @@ +package org.apache.lucene.analysis; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RollingBuffer; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Transition; + +import java.io.IOException; + +// TODO: maybe also toFST? then we can translate atts into FST outputs/weights + +/** + * Consumes a TokenStream and creates an {@link org.apache.lucene.util.automaton.Automaton} + * where the transition labels are Unicode code points from the {@link + * org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute}. Between tokens we insert + * POS_SEP and for holes we insert HOLE. + * + * @lucene.experimental + */ +public class TokenStreamToUnicodeAutomaton { + + private boolean preservePositionIncrements; + + /** + * Sole constructor. + */ + public TokenStreamToUnicodeAutomaton() { + this.preservePositionIncrements = true; + } + + /** + * Whether to generate holes in the automaton for missing positions, true by default. + */ + public void setPreservePositionIncrements(boolean enablePositionIncrements) { + this.preservePositionIncrements = enablePositionIncrements; + } + + private static class Position implements RollingBuffer.Resettable { + // Any tokens that ended at our position arrive to this state: + State arriving; + + // Any tokens that start at our position leave from this state: + State leaving; + + @Override + public void reset() { + arriving = null; + leaving = null; + } + } + + private static class Positions extends RollingBuffer { + @Override + protected Position newInstance() { + return new Position(); + } + } + + /** + * Subclass & implement this if you need to change the + * token (such as escaping certain bytes) before it's + * turned into a graph. + */ + protected BytesRef changeToken(BytesRef in) { + return in; + } + + /** + * We create a transition between two adjacent tokens. + */ + public static final int POS_SEP = 0x10FFFF; + + /** + * We add this arc to represent a hole. + */ + public static final int HOLE = POS_SEP - 1; + + /** + * Pulls the graph (including {@link + * org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute}) from the provided {@link + * org.apache.lucene.analysis.TokenStream}, and creates the corresponding + * automaton where arcs are Unicode code points from each term. 
+ */ + public Automaton toAutomaton(TokenStream in) throws IOException { + final Automaton a = new Automaton(); + boolean deterministic = true; + + final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class); + final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class); + final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class); + final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class); + + final BytesRef term = termBytesAtt.getBytesRef(); + + in.reset(); + + // Only temporarily holds states ahead of our current + // position: + + final RollingBuffer positions = new Positions(); + + int pos = -1; + Position posData = null; + int maxOffset = 0; + while (in.incrementToken()) { + int posInc = posIncAtt.getPositionIncrement(); + if (!preservePositionIncrements && posInc > 1) { + posInc = 1; + } + assert pos > -1 || posInc > 0; + + if (posInc > 0) { + + // New node: + pos += posInc; + + posData = positions.get(pos); + assert posData.leaving == null; + + if (posData.arriving == null) { + // No token ever arrived to this position + if (pos == 0) { + // OK: this is the first token + posData.leaving = a.getInitialState(); + } else { + // This means there's a hole (eg, StopFilter + // does this): + posData.leaving = new State(); + addHoles(a.getInitialState(), positions, pos); + } + } else { + posData.leaving = new State(); + posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving)); + if (posInc > 1) { + // A token spanned over a hole; add holes + // "under" it: + addHoles(a.getInitialState(), positions, pos); + } + } + positions.freeBefore(pos); + } else { + // note: this isn't necessarily true. its just that we aren't surely det. + // we could optimize this further (e.g. buffer and sort synonyms at a position) + // but thats probably overkill. this is cheap and dirty + deterministic = false; + } + + final int endPos = pos + posLengthAtt.getPositionLength(); + + termBytesAtt.fillBytesRef(); + final String utf16 = changeToken(term).utf8ToString(); + final int[] term2 = new int[utf16.codePointCount(0, utf16.length())]; + for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) + term2[j++] = cp = utf16.codePointAt(i); + + final Position endPosData = positions.get(endPos); + if (endPosData.arriving == null) { + endPosData.arriving = new State(); + } + + State state = posData.leaving; + for (int charIDX = 0; charIDX < term2.length; charIDX++) { + final State nextState = charIDX == term2.length - 1 ? endPosData.arriving : new State(); + state.addTransition(new Transition(term2[charIDX], nextState)); + state = nextState; + } + + maxOffset = Math.max(maxOffset, offsetAtt.endOffset()); + } + + in.end(); + State endState = null; + if (offsetAtt.endOffset() > maxOffset) { + endState = new State(); + endState.setAccept(true); + } + + pos++; + while (pos <= positions.getMaxPos()) { + posData = positions.get(pos); + if (posData.arriving != null) { + if (endState != null) { + posData.arriving.addTransition(new Transition(POS_SEP, endState)); + } else { + posData.arriving.setAccept(true); + } + } + pos++; + } + + //toDot(a); + a.setDeterministic(deterministic); + return a; + } + + // for debugging! 
+ /* + private static void toDot(Automaton a) throws IOException { + final String s = a.toDot(); + Writer w = new OutputStreamWriter(new FileOutputStream("/tmp/out.dot")); + w.write(s); + w.close(); + System.out.println("TEST: saved to /tmp/out.dot"); + } + */ + + private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) { + Position posData = positions.get(pos); + Position prevPosData = positions.get(pos - 1); + + while (posData.arriving == null || prevPosData.leaving == null) { + if (posData.arriving == null) { + posData.arriving = new State(); + posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving)); + } + if (prevPosData.leaving == null) { + if (pos == 1) { + prevPosData.leaving = startState; + } else { + prevPosData.leaving = new State(); + } + if (prevPosData.arriving != null) { + prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving)); + } + } + prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving)); + pos--; + if (pos <= 0) { + break; + } + posData = prevPosData; + prevPosData = positions.get(pos - 1); + } + } +} Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP <+>package org.apache.lucene.search.suggest.analyzing;\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\nimport java.io.FileOutputStream;\nimport java.io.IOException;\nimport java.io.OutputStreamWriter;\nimport java.io.Writer;\nimport java.util.Arrays;\nimport java.util.List;\nimport java.util.Set;\n\nimport org.apache.lucene.analysis.Analyzer;\nimport org.apache.lucene.analysis.TokenStream;\nimport org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs\nimport org.apache.lucene.util.BytesRef;\nimport org.apache.lucene.util.IntsRef;\nimport org.apache.lucene.util.automaton.Automaton;\nimport org.apache.lucene.util.automaton.BasicAutomata;\nimport org.apache.lucene.util.automaton.BasicOperations;\nimport org.apache.lucene.util.automaton.LevenshteinAutomata;\nimport org.apache.lucene.util.automaton.SpecialOperations;\nimport org.apache.lucene.util.fst.FST;\nimport org.apache.lucene.util.fst.PairOutputs.Pair;\n\n/**\n * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is\n * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though\n * you can explicitly choose classic Levenshtein by passing false\n * for the transpositions parameter.\n *
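For intuition on the transpositions parameter, a small sketch; the strings and distances are illustrative and not taken from the patch. With optimal string alignment a swap of two adjacent characters counts as a single edit, while classic Levenshtein needs two.

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

class TranspositionSketch {
  public static void main(String[] args) {
    int[] word = new int[] {'a', 'b', 'c'};
    // Automata accepting everything within one edit of "abc", over the code-point alphabet:
    Automaton osa = new LevenshteinAutomata(word, Character.MAX_CODE_POINT, true).toAutomaton(1);
    Automaton lev = new LevenshteinAutomata(word, Character.MAX_CODE_POINT, false).toAutomaton(1);
    // "acb" is one transposition away from "abc":
    System.out.println(new CharacterRunAutomaton(osa).run("acb")); // true: the swap is one edit
    System.out.println(new CharacterRunAutomaton(lev).run("acb")); // false: classic distance is 2
  }
}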

\n * At most, this query will match terms up to\n * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}\n * edits. Higher distances are not supported. Note that the\n * fuzzy distance is measured in \"byte space\" on the bytes\n * returned by the {@link TokenStream}'s {@link\n * TermToBytesRefAttribute}, usually UTF8. By default\n * the analyzed bytes must be at least 3 {@link\n * #DEFAULT_MIN_FUZZY_LENGTH} bytes before any edits are\n * considered. Furthermore, the first 1 {@link\n * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be\n * edited. We allow up to 1 {@link\n * #DEFAULT_MAX_EDITS} edit.\n *\n *
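To make those defaults concrete, a sketch with the values written out; it assumes indexAnalyzer and queryAnalyzer are in scope and that the suggester has already been built from term/weight pairs, and the lookup keys are purely illustrative:

// Equivalent to the two-analyzer convenience constructor, with the fuzzy defaults spelled out.
FuzzySuggester suggester = new FuzzySuggester(indexAnalyzer, queryAnalyzer,
    AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP,
    256,                                      // maxSurfaceFormsPerAnalyzedForm
    -1,                                       // maxGraphExpansions: no limit
    FuzzySuggester.DEFAULT_MAX_EDITS,         // at most 1 edit
    FuzzySuggester.DEFAULT_TRANSPOSITIONS,    // a transposition counts as a single edit
    FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX,  // the first byte must match exactly
    FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH); // keys shorter than 3 bytes allow no edits
// "ap"   -> shorter than 3 bytes, so it behaves like the exact AnalyzingSuggester
// "aple" -> 'a' is pinned, one edit is allowed, so suggestions starting with "apple" can match
List<Lookup.LookupResult> results = suggester.lookup("aple", false, 5);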

\n * NOTE: This suggester does not boost suggestions that\n * required no edits over suggestions that did require\n * edits. This is a known limitation.\n *\n *

\n * Note: complex query analyzers can have a significant impact on lookup\n * performance. To keep the prefix intersection cheap and lookups fast, it is\n * recommended not to use query-time analyzers that drop or inject terms (for\n * example, synonyms). At index time, complex analyzers can safely be used.\n *
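One way to follow that advice is to pass different analyzers for build time and lookup time; a sketch, where the concrete analyzer choices are merely examples and not something this patch prescribes:

// Rich analysis while building the suggester, cheap non-expanding analysis at lookup time.
Analyzer indexAnalyzer = new EnglishAnalyzer(Version.LUCENE_40);    // stopwords, stemming, ...
Analyzer queryAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_40); // neither drops nor injects terms
FuzzySuggester suggester = new FuzzySuggester(indexAnalyzer, queryAnalyzer);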

\n */\npublic final class FuzzySuggester extends AnalyzingSuggester {\n private final int maxEdits;\n private final boolean transpositions;\n private final int nonFuzzyPrefix;\n private final int minFuzzyLength;\n\n /**\n * The default minimum length of the key passed to {@link\n * #lookup} before any edits are allowed.\n */\n public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;\n\n /**\n * The default prefix length where edits are not allowed.\n */\n public static final int DEFAULT_NON_FUZZY_PREFIX = 1;\n \n /**\n * The default maximum number of edits for fuzzy\n * suggestions.\n */\n public static final int DEFAULT_MAX_EDITS = 1;\n \n /**\n * The default transposition value passed to {@link LevenshteinAutomata}\n */\n public static final boolean DEFAULT_TRANSPOSITIONS = true;\n\n /**\n * Creates a {@link FuzzySuggester} instance initialized with default values.\n * \n * @param analyzer the analyzer used for this suggester\n */\n public FuzzySuggester(Analyzer analyzer) {\n this(analyzer, analyzer);\n }\n \n /**\n * Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values.\n * \n * @param indexAnalyzer\n * Analyzer that will be used for analyzing suggestions while building the index.\n * @param queryAnalyzer\n * Analyzer that will be used for analyzing query text during lookup\n */\n public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {\n this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,\n DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);\n }\n\n /**\n * Creates a {@link FuzzySuggester} instance.\n * \n * @param indexAnalyzer Analyzer that will be used for\n * analyzing suggestions while building the index.\n * @param queryAnalyzer Analyzer that will be used for\n * analyzing query text during lookup\n * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}\n * @param maxSurfaceFormsPerAnalyzedForm Maximum number of\n * surface forms to keep for a single analyzed form.\n * When there are too many surface forms we discard the\n * lowest weighted ones.\n * @param maxGraphExpansions Maximum number of graph paths\n * to expand from the analyzed form. Set this to -1 for\n * no limit.\n * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} .\n * @param transpositions true if transpositions should be treated as a primitive \n * edit operation. 
If this is false, comparisons will implement the classic\n * Levenshtein algorithm.\n * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX})\n * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})\n */\n public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,\n int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,\n int maxEdits, boolean transpositions, int nonFuzzyPrefix,\n int minFuzzyLength) {\n super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);\n if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {\n throw new IllegalArgumentException(\"maxEdits must be between 0 and \" + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);\n }\n if (nonFuzzyPrefix < 0) {\n throw new IllegalArgumentException(\"nonFuzzyPrefix must not be < 0 (got \" + nonFuzzyPrefix + \")\");\n }\n if (minFuzzyLength < 0) {\n throw new IllegalArgumentException(\"minFuzzyLength must not be < 0 (got \" + minFuzzyLength + \")\");\n }\n \n this.maxEdits = maxEdits;\n this.transpositions = transpositions;\n this.nonFuzzyPrefix = nonFuzzyPrefix;\n this.minFuzzyLength = minFuzzyLength;\n }\n \n @Override\n protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,\n Automaton lookupAutomaton,\n FST<Pair<Long,BytesRef>> fst)\n throws IOException {\n\n // TODO: right now there's no penalty for fuzzy/edits,\n // ie a completion whose prefix matched exactly what the\n // user typed gets no boost over completions that\n // required an edit, which get no boost over completions\n // requiring two edits. I suspect a multiplicative\n // factor is appropriate (eg, say a fuzzy match must be at\n // least 2X better weight than the non-fuzzy match to\n // \"compete\") ... in which case I think the wFST needs\n // to be log weights or something ...\n\n Automaton levA = toLevenshteinAutomata(lookupAutomaton);\n /*\n Writer w = new OutputStreamWriter(new FileOutputStream(\"out.dot\"), \"UTF-8\");\n w.write(levA.toDot());\n w.close();\n System.out.println(\"Wrote LevA to out.dot\");\n */\n return FSTUtil.intersectPrefixPaths(levA, fst);\n }\n\n Automaton toLevenshteinAutomata(Automaton automaton) {\n final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);\n Automaton subs[] = new Automaton[ref.size()];\n int upto = 0;\n for (IntsRef path : ref) {\n if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {\n subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);\n upto++;\n } else {\n Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);\n int ints[] = new int[path.length-nonFuzzyPrefix];\n System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);\n // TODO: maybe add alphaMin to LevenshteinAutomata,\n // and pass 1 instead of 0? We probably don't want\n // to allow the trailing dedup bytes to be\n // edited... 
but then 0 byte is \"in general\" allowed\n // on input (but not in UTF8).\n LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);\n Automaton levAutomaton = lev.toAutomaton(maxEdits);\n Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));\n combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already\n subs[upto] = combined;\n upto++;\n }\n }\n\n if (subs.length == 0) {\n // automaton is empty, there is no accepted paths through it\n return BasicAutomata.makeEmpty(); // matches nothing\n } else if (subs.length == 1) {\n // no synonyms or anything: just a single path through the tokenstream\n return subs[0];\n } else {\n // multiple paths: this is really scary! is it slow?\n // maybe we should not do this and throw UOE?\n Automaton a = BasicOperations.union(Arrays.asList(subs));\n // TODO: we could call toLevenshteinAutomata() before det? \n // this only happens if you have multiple paths anyway (e.g. synonyms)\n BasicOperations.determinize(a);\n\n return a;\n }\n }\n}\n =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (revision f81056da25f3671b9807c4a51d6b985389fe916e) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (revision ) @@ -15,17 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.util.Arrays; -import java.util.List; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.automaton.Automaton; @@ -33,9 +26,15 @@ import org.apache.lucene.util.automaton.BasicOperations; import org.apache.lucene.util.automaton.LevenshteinAutomata; import org.apache.lucene.util.automaton.SpecialOperations; +import org.apache.lucene.util.automaton.UTF32ToUTF8; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PairOutputs.Pair; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Set; + /** * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though @@ -54,12 +53,12 @@ * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be * edited. We allow up to 1 (@link * #DEFAULT_MAX_EDITS} edit. - * + *

*

* NOTE: This suggester does not boost suggestions that * required no edits over suggestions that did require * edits. This is a known limitation. - * + *

*

* Note: complex query analyzers can have a significant impact on the lookup * performance. It's recommended to not use analyzers that drop or inject terms @@ -83,13 +82,13 @@ * The default prefix length where edits are not allowed. */ public static final int DEFAULT_NON_FUZZY_PREFIX = 1; - + /** * The default maximum number of edits for fuzzy * suggestions. */ public static final int DEFAULT_MAX_EDITS = 1; - + /** * The default transposition value passed to {@link LevenshteinAutomata} */ @@ -97,47 +96,45 @@ /** * Creates a {@link FuzzySuggester} instance initialized with default values. - * + * * @param analyzer the analyzer used for this suggester */ public FuzzySuggester(Analyzer analyzer) { this(analyzer, analyzer); } - + /** * Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values. - * + * - * @param indexAnalyzer - * Analyzer that will be used for analyzing suggestions while building the index. - * @param queryAnalyzer - * Analyzer that will be used for analyzing query text during lookup + * @param indexAnalyzer Analyzer that will be used for analyzing suggestions while building the index. + * @param queryAnalyzer Analyzer that will be used for analyzing query text during lookup */ public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) { this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS, - DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH); + DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH); } /** * Creates a {@link FuzzySuggester} instance. - * + * - * @param indexAnalyzer Analyzer that will be used for + * @param indexAnalyzer Analyzer that will be used for - * analyzing suggestions while building the index. + * analyzing suggestions while building the index. - * @param queryAnalyzer Analyzer that will be used for + * @param queryAnalyzer Analyzer that will be used for - * analyzing query text during lookup + * analyzing query text during lookup - * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} + * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} * @param maxSurfaceFormsPerAnalyzedForm Maximum number of - * surface forms to keep for a single analyzed form. + * surface forms to keep for a single analyzed form. - * When there are too many surface forms we discard the + * When there are too many surface forms we discard the - * lowest weighted ones. + * lowest weighted ones. - * @param maxGraphExpansions Maximum number of graph paths + * @param maxGraphExpansions Maximum number of graph paths - * to expand from the analyzed form. Set this to -1 for + * to expand from the analyzed form. Set this to -1 for - * no limit. + * no limit. - * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} . + * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} . - * @param transpositions true if transpositions should be treated as a primitive + * @param transpositions true if transpositions should be treated as a primitive - * edit operation. If this is false, comparisons will implement the classic + * edit operation. If this is false, comparisons will implement the classic - * Levenshtein algorithm. + * Levenshtein algorithm. 
- * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX} + * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX} - * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH}) + * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH}) */ public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, @@ -153,18 +150,18 @@ if (minFuzzyLength < 0) { throw new IllegalArgumentException("minFuzzyLength must not be >= 0 (got " + minFuzzyLength + ")"); } - + this.maxEdits = maxEdits; this.transpositions = transpositions; this.nonFuzzyPrefix = nonFuzzyPrefix; this.minFuzzyLength = minFuzzyLength; } - + @Override - protected List>> getFullPrefixPaths(List>> prefixPaths, + protected List>> getFullPrefixPaths(List>> prefixPaths, - Automaton lookupAutomaton, + Automaton lookupAutomaton, - FST> fst) + FST> fst) - throws IOException { + throws IOException { // TODO: right now there's no penalty for fuzzy/edits, // ie a completion whose prefix matched exactly what the @@ -177,13 +174,15 @@ // to be log weights or something ... Automaton levA = toLevenshteinAutomata(lookupAutomaton); + Automaton utf8LevA = new UTF32ToUTF8().convert(levA); + BasicOperations.determinize(utf8LevA); /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); w.write(levA.toDot()); w.close(); System.out.println("Wrote LevA to out.dot"); */ - return FSTUtil.intersectPrefixPaths(levA, fst); + return FSTUtil.intersectPrefixPaths(utf8LevA, fst); } Automaton toLevenshteinAutomata(Automaton automaton) { @@ -196,14 +195,14 @@ upto++; } else { Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix); - int ints[] = new int[path.length-nonFuzzyPrefix]; + int ints[] = new int[path.length - nonFuzzyPrefix]; - System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length); + System.arraycopy(path.ints, path.offset + nonFuzzyPrefix, ints, 0, ints.length); // TODO: maybe add alphaMin to LevenshteinAutomata, // and pass 1 instead of 0? We probably don't want // to allow the trailing dedup bytes to be // edited... but then 0 byte is "in general" allowed // on input (but not in UTF8). - LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions); + LevenshteinAutomata lev = new LevenshteinAutomata(ints, Character.MAX_CODE_POINT, transpositions); Automaton levAutomaton = lev.toAutomaton(maxEdits); Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton)); combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
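The substantive change in this hunk is that the Levenshtein automaton is now built over the full code-point alphabet (Character.MAX_CODE_POINT rather than 255) and only afterwards converted to UTF-8 bytes, so it can be intersected with the byte-labelled suggest FST. A standalone sketch of that conversion step, using an illustrative key rather than the patch's own variables:

// Edits are computed in code-point space, then the automaton is mapped to byte space.
String key = "aple"; // illustrative analyzed lookup key
int[] codePoints = new int[key.codePointCount(0, key.length())];
for (int cp, i = 0, j = 0; i < key.length(); i += Character.charCount(cp)) {
  codePoints[j++] = cp = key.codePointAt(i);
}
LevenshteinAutomata lev = new LevenshteinAutomata(codePoints, Character.MAX_CODE_POINT, true);
Automaton levA = lev.toAutomaton(1);                   // strings within 1 edit, code-point arcs
Automaton utf8LevA = new UTF32ToUTF8().convert(levA);  // same language, arcs are now UTF-8 bytes
BasicOperations.determinize(utf8LevA);                 // determinize before intersecting with the FST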