Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP <+>package org.apache.lucene.search.suggest.analyzing;\n\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport java.io.File;\nimport java.io.IOException;\nimport java.io.InputStream;\nimport java.io.OutputStream;\nimport java.io.StringReader;\nimport java.util.ArrayList;\nimport java.util.Collections;\nimport java.util.Comparator;\nimport java.util.HashSet;\nimport java.util.List;\nimport java.util.Set;\n\nimport org.apache.lucene.analysis.Analyzer;\nimport org.apache.lucene.analysis.TokenStream;\nimport org.apache.lucene.analysis.TokenStreamToAutomaton;\nimport org.apache.lucene.search.spell.TermFreqIterator;\nimport org.apache.lucene.search.spell.TermFreqPayloadIterator;\nimport org.apache.lucene.search.suggest.Lookup;\nimport org.apache.lucene.search.suggest.Sort;\nimport org.apache.lucene.store.ByteArrayDataInput;\nimport org.apache.lucene.store.ByteArrayDataOutput;\nimport org.apache.lucene.store.DataInput;\nimport 
org.apache.lucene.store.DataOutput;\nimport org.apache.lucene.store.InputStreamDataInput;\nimport org.apache.lucene.store.OutputStreamDataOutput;\nimport org.apache.lucene.util.ArrayUtil;\nimport org.apache.lucene.util.BytesRef;\nimport org.apache.lucene.util.CharsRef;\nimport org.apache.lucene.util.IOUtils;\nimport org.apache.lucene.util.IntsRef;\nimport org.apache.lucene.util.UnicodeUtil;\nimport org.apache.lucene.util.automaton.Automaton;\nimport org.apache.lucene.util.automaton.BasicOperations;\nimport org.apache.lucene.util.automaton.SpecialOperations;\nimport org.apache.lucene.util.automaton.State;\nimport org.apache.lucene.util.automaton.Transition;\nimport org.apache.lucene.util.fst.Builder;\nimport org.apache.lucene.util.fst.ByteSequenceOutputs;\nimport org.apache.lucene.util.fst.FST.BytesReader;\nimport org.apache.lucene.util.fst.FST;\nimport org.apache.lucene.util.fst.PairOutputs.Pair;\nimport org.apache.lucene.util.fst.PairOutputs;\nimport org.apache.lucene.util.fst.PositiveIntOutputs;\nimport org.apache.lucene.util.fst.Util.MinResult;\nimport org.apache.lucene.util.fst.Util;\n\n/**\n * Suggester that first analyzes the surface form, adds the\n * analyzed form to a weighted FST, and then does the same\n * thing at lookup time. This means lookup is based on the\n * analyzed form while suggestions are still the surface\n * form(s).\n *\n *
\n * This can result in powerful suggester functionality. For\n * example, if you use an analyzer removing stop words, \n * then the partial text \"ghost chr...\" could see the\n * suggestion \"The Ghost of Christmas Past\". Note that\n * position increments MUST NOT be preserved for this example\n * to work, so you should call\n * {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}.\n *\n *
\n * If SynonymFilter is used to map wifi and wireless network to\n * hotspot then the partial text \"wirele...\" could suggest\n * \"wifi router\". Token normalization like stemmers, accent\n * removal, etc., would allow suggestions to ignore such\n * variations.\n *\n *
\n * When two matching suggestions have the same weight, they\n * are tie-broken by the analyzed form. If their analyzed\n * form is the same then the order is undefined.\n *\n *
\n * There are some limitations:\n *
true by default. */\n public void setPreservePositionIncrements(boolean preservePositionIncrements) {\n this.preservePositionIncrements = preservePositionIncrements;\n }\n\n /** Returns byte size of the underlying FST. */\n public long sizeInBytes() {\n return fst == null ? 0 : fst.sizeInBytes();\n }\n\n private void copyDestTransitions(State from, State to, List+ *
+ * * This can result in powerful suggester functionality. For - * example, if you use an analyzer removing stop words, + * example, if you use an analyzer removing stop words, * then the partial text "ghost chr..." could see the * suggestion "The Ghost of Christmas Past". Note that * position increments MUST NOT be preserved for this example * to work, so you should call * {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}. - * - *+ *
+ * * If SynonymFilter is used to map wifi and wireless network to * hotspot then the partial text "wirele..." could suggest * "wifi router". Token normalization like stemmers, accent * removal, etc., would allow suggestions to ignore such * variations. - * - *+ *
+ * * When two matching suggestions have the same weight, they * are tie-broken by the analyzed form. If their analyzed * form is the same then the order is undefined. - * - *+ *
+ * * There are some limitations: *true by default. */
+ /**
+ * Whether to take position holes (position increment > 1) into account when
+ * building the automaton, true by default.
+ */
public void setPreservePositionIncrements(boolean preservePositionIncrements) {
this.preservePositionIncrements = preservePositionIncrements;
}
- /** Returns byte size of the underlying FST. */
+ /**
+ * Returns byte size of the underlying FST.
+ */
public long sizeInBytes() {
return fst == null ? 0 : fst.sizeInBytes();
}
@@ -262,7 +283,7 @@
if (to.isAccept()) {
from.setAccept(true);
}
- for(Transition t : to.getTransitions()) {
+ for (Transition t : to.getTransitions()) {
transitions.add(t);
}
}
@@ -275,12 +296,12 @@
// Go in reverse topo sort so we know we only have to
// make one pass:
- for(int stateNumber=states.length-1;stateNumber >=0;stateNumber--) {
+ for (int stateNumber = states.length - 1; stateNumber >= 0; stateNumber--) {
final State state = states[stateNumber];
Listtrue by default.
+ */
+ public void setPreservePositionIncrements(boolean enablePositionIncrements) {
+ this.preservePositionIncrements = enablePositionIncrements;
+ }
+
+ private static class Position implements RollingBuffer.Resettable {
+ // Any tokens that ended at our position arrive to this state:
+ State arriving;
+
+ // Any tokens that start at our position leave from this state:
+ State leaving;
+
+ @Override
+ public void reset() {
+ arriving = null;
+ leaving = null;
+ }
+ }
+
+ private static class Positions extends RollingBufferfalse\n * for the transpositions parameter.\n * \n * At most, this query will match terms up to\n * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}\n * edits. Higher distances are not supported. Note that the\n * fuzzy distance is measured in \"byte space\" on the bytes\n * returned by the {@link TokenStream}'s {@link\n * TermToBytesRefAttribute}, usually UTF8. By default\n * the analyzed bytes must be at least 3 {@link\n * #DEFAULT_MIN_FUZZY_LENGTH} bytes before any edits are\n * considered. Furthermore, the first 1 {@link\n * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be\n * edited. We allow up to 1 (@link\n * #DEFAULT_MAX_EDITS} edit.\n *\n *
\n * NOTE: This suggester does not boost suggestions that\n * required no edits over suggestions that did require\n * edits. This is a known limitation.\n *\n *
\n * Note: complex query analyzers can have a significant impact on the lookup\n * performance. It's recommended to not use analyzers that drop or inject terms\n * like synonyms to keep the complexity of the prefix intersection low for good\n * lookup performance. At index time, complex analyzers can safely be used.\n *
\n */\npublic final class FuzzySuggester extends AnalyzingSuggester {\n private final int maxEdits;\n private final boolean transpositions;\n private final int nonFuzzyPrefix;\n private final int minFuzzyLength;\n\n /**\n * The default minimum length of the key passed to {@link\n * #lookup} before any edits are allowed.\n */\n public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;\n\n /**\n * The default prefix length where edits are not allowed.\n */\n public static final int DEFAULT_NON_FUZZY_PREFIX = 1;\n \n /**\n * The default maximum number of edits for fuzzy\n * suggestions.\n */\n public static final int DEFAULT_MAX_EDITS = 1;\n \n /**\n * The default transposition value passed to {@link LevenshteinAutomata}\n */\n public static final boolean DEFAULT_TRANSPOSITIONS = true;\n\n /**\n * Creates a {@link FuzzySuggester} instance initialized with default values.\n * \n * @param analyzer the analyzer used for this suggester\n */\n public FuzzySuggester(Analyzer analyzer) {\n this(analyzer, analyzer);\n }\n \n /**\n * Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values.\n * \n * @param indexAnalyzer\n * Analyzer that will be used for analyzing suggestions while building the index.\n * @param queryAnalyzer\n * Analyzer that will be used for analyzing query text during lookup\n */\n public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {\n this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,\n DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);\n }\n\n /**\n * Creates a {@link FuzzySuggester} instance.\n * \n * @param indexAnalyzer Analyzer that will be used for\n * analyzing suggestions while building the index.\n * @param queryAnalyzer Analyzer that will be used for\n * analyzing query text during lookup\n * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}\n * @param maxSurfaceFormsPerAnalyzedForm Maximum number of\n * 
surface forms to keep for a single analyzed form.\n * When there are too many surface forms we discard the\n * lowest weighted ones.\n * @param maxGraphExpansions Maximum number of graph paths\n * to expand from the analyzed form. Set this to -1 for\n * no limit.\n * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} .\n * @param transpositionstrue if transpositions should be treated as a primitive \n * edit operation. If this is false, comparisons will implement the classic\n * Levenshtein algorithm.\n * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}\n * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})\n */\n public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,\n int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,\n int maxEdits, boolean transpositions, int nonFuzzyPrefix,\n int minFuzzyLength) {\n super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);\n if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {\n throw new IllegalArgumentException(\"maxEdits must be between 0 and \" + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);\n }\n if (nonFuzzyPrefix < 0) {\n throw new IllegalArgumentException(\"nonFuzzyPrefix must not be >= 0 (got \" + nonFuzzyPrefix + \")\");\n }\n if (minFuzzyLength < 0) {\n throw new IllegalArgumentException(\"minFuzzyLength must not be >= 0 (got \" + minFuzzyLength + \")\");\n }\n \n this.maxEdits = maxEdits;\n this.transpositions = transpositions;\n this.nonFuzzyPrefix = nonFuzzyPrefix;\n this.minFuzzyLength = minFuzzyLength;\n }\n \n @Override\n protected List* NOTE: This suggester does not boost suggestions that * required no edits over suggestions that did require * edits. This is a known limitation. - * + *
*
* Note: complex query analyzers can have a significant impact on the lookup
* performance. It's recommended to not use analyzers that drop or inject terms
@@ -83,13 +82,13 @@
* The default prefix length where edits are not allowed.
*/
public static final int DEFAULT_NON_FUZZY_PREFIX = 1;
-
+
/**
* The default maximum number of edits for fuzzy
* suggestions.
*/
public static final int DEFAULT_MAX_EDITS = 1;
-
+
/**
* The default transposition value passed to {@link LevenshteinAutomata}
*/
@@ -97,47 +96,45 @@
/**
* Creates a {@link FuzzySuggester} instance initialized with default values.
- *
+ *
* @param analyzer the analyzer used for this suggester
*/
public FuzzySuggester(Analyzer analyzer) {
this(analyzer, analyzer);
}
-
+
/**
* Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values.
- *
+ *
- * @param indexAnalyzer
- * Analyzer that will be used for analyzing suggestions while building the index.
- * @param queryAnalyzer
- * Analyzer that will be used for analyzing query text during lookup
+ * @param indexAnalyzer Analyzer that will be used for analyzing suggestions while building the index.
+ * @param queryAnalyzer Analyzer that will be used for analyzing query text during lookup
*/
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,
- DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);
+ DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);
}
/**
* Creates a {@link FuzzySuggester} instance.
- *
+ *
- * @param indexAnalyzer Analyzer that will be used for
+ * @param indexAnalyzer Analyzer that will be used for
- * analyzing suggestions while building the index.
+ * analyzing suggestions while building the index.
- * @param queryAnalyzer Analyzer that will be used for
+ * @param queryAnalyzer Analyzer that will be used for
- * analyzing query text during lookup
+ * analyzing query text during lookup
- * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
+ * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
* @param maxSurfaceFormsPerAnalyzedForm Maximum number of
- * surface forms to keep for a single analyzed form.
+ * surface forms to keep for a single analyzed form.
- * When there are too many surface forms we discard the
+ * When there are too many surface forms we discard the
- * lowest weighted ones.
+ * lowest weighted ones.
- * @param maxGraphExpansions Maximum number of graph paths
+ * @param maxGraphExpansions Maximum number of graph paths
- * to expand from the analyzed form. Set this to -1 for
+ * to expand from the analyzed form. Set this to -1 for
- * no limit.
+ * no limit.
- * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} .
+ * @param maxEdits must be &gt;= 0 and &lt;= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.
- * @param transpositions true if transpositions should be treated as a primitive
+ * @param transpositions true if transpositions should be treated as a primitive
- * edit operation. If this is false, comparisons will implement the classic
+ * edit operation. If this is false, comparisons will implement the classic
- * Levenshtein algorithm.
+ * Levenshtein algorithm.
- * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}
+ * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX})
- * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
+ * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
*/
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
@@ -153,18 +150,18 @@
if (minFuzzyLength < 0) {
throw new IllegalArgumentException("minFuzzyLength must not be >= 0 (got " + minFuzzyLength + ")");
}
-
+
this.maxEdits = maxEdits;
this.transpositions = transpositions;
this.nonFuzzyPrefix = nonFuzzyPrefix;
this.minFuzzyLength = minFuzzyLength;
}
-
+
@Override
- protected List