false by default */
+ public void setUnicodeArcs(boolean unicodeArcs) {
+ this.unicodeArcs = unicodeArcs;
+ }
+
private static class Position implements RollingBuffer.Resettable {
// Any tokens that ended at our position arrive to this state:
State arriving;
@@ -80,15 +88,16 @@
}
/** We create transition between two adjacent tokens. */
- public static final int POS_SEP = 256;
+ public static final int POS_SEP = 0x001f;
/** We add this arc to represent a hole. */
- public static final int HOLE = 257;
+ public static final int HOLE = 0x001e;
/** Pulls the graph (including {@link
* PositionLengthAttribute}) from the provided {@link
* TokenStream}, and creates the corresponding
- * automaton where arcs are bytes from each term. */
+ * automaton where arcs are bytes (or Unicode code points
+ * if unicodeArcs = true) from each term. */
public Automaton toAutomaton(TokenStream in) throws IOException {
final Automaton a = new Automaton();
boolean deterministic = true;
@@ -156,16 +165,34 @@
final int endPos = pos + posLengthAtt.getPositionLength();
termBytesAtt.fillBytesRef();
- final BytesRef term2 = changeToken(term);
+ final BytesRef termUTF8 = changeToken(term);
+ int[] termUnicode = null;
final Position endPosData = positions.get(endPos);
if (endPosData.arriving == null) {
endPosData.arriving = new State();
}
State state = posData.leaving;
- for(int byteIDX=0;byteIDX
* NOTE: This suggester does not boost suggestions that
@@ -177,6 +178,10 @@
// to be log weights or something ...
Automaton levA = toLevenshteinAutomata(lookupAutomaton);
+ if (isFuzzyUnicodeAware()) {
+ levA = new UTF32ToUTF8().convert(levA);
+ BasicOperations.determinize(levA);
+ }
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
w.write(levA.toDot());
@@ -203,7 +208,7 @@
// to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
- LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
+ LevenshteinAutomata lev = new LevenshteinAutomata(ints, isFuzzyUnicodeAware() ? Character.MAX_CODE_POINT : 255, transpositions);
Automaton levAutomaton = lev.toAutomaton(maxEdits);
Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
Index: solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java
===================================================================
--- solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java (revision 1498685)
+++ solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java (working copy)
@@ -43,6 +43,13 @@
public static final String PRESERVE_SEP = "preserveSep";
/**
+ * If true, maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix in
+ * {@link FuzzyLookupFactory} will be measured in Unicode code points (actual letters)
+ * instead of bytes.
+ */
+ public static final String FUZZY_UNICODE_AWARE = "fuzzyUnicodeAware";
+
+ /**
* When multiple suggestions collide to the same analyzed form, this is the limit of
* how many unique surface forms we keep.
*/
@@ -91,6 +98,10 @@
? Boolean.valueOf(params.get(PRESERVE_SEP).toString())
: true;
+ boolean fuzzyUnicodeAware = params.get(FUZZY_UNICODE_AWARE) != null
+ ? Boolean.valueOf(params.get(FUZZY_UNICODE_AWARE).toString())
+ : false;
+
int flags = 0;
if (exactMatchFirst) {
flags |= AnalyzingSuggester.EXACT_FIRST;
@@ -98,6 +109,9 @@
if (preserveSep) {
flags |= AnalyzingSuggester.PRESERVE_SEP;
}
+ if (fuzzyUnicodeAware) {
+ flags |= AnalyzingSuggester.FUZZY_UNICODE_AWARE;
+ }
int maxSurfaceFormsPerAnalyzedForm = params.get(MAX_SURFACE_FORMS) != null
? Integer.parseInt(params.get(MAX_SURFACE_FORMS).toString())
Index: solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java
===================================================================
--- solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java (revision 1498685)
+++ solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java (working copy)
@@ -34,22 +34,26 @@
/**
* Maximum number of edits allowed, used by {@link LevenshteinAutomata#toAutomaton(int)}
+ * in bytes or Unicode code points (if the fuzzyUnicodeAware option is set to true).
*/
public static final String MAX_EDITS = "maxEdits";
/**
* If transpositions are allowed, Fuzzy suggestions will be computed based on a primitive
* edit operation. If it is false, it will be based on the classic Levenshtein algorithm.
+ * Transpositions are of bytes or Unicode code points (if the fuzzyUnicodeAware option is set to true).
*/
public static final String TRANSPOSITIONS = "transpositions";
/**
* Length of common (non-fuzzy) prefix for the suggestions
+ * in bytes or Unicode code points (if the fuzzyUnicodeAware option is set to true).
*/
public static final String NON_FUZZY_PREFIX = "nonFuzzyPrefix";
/**
* Minimum length of lookup key before any edits are allowed for the suggestions
+ * in bytes or Unicode code points (if the fuzzyUnicodeAware option is set to true).
*/
public static final String MIN_FUZZY_LENGTH = "minFuzzyLength";
@@ -80,6 +84,10 @@
? Boolean.valueOf(params.get(AnalyzingLookupFactory.PRESERVE_SEP).toString())
: true;
+ boolean fuzzyUnicodeAware = (params.get(AnalyzingLookupFactory.FUZZY_UNICODE_AWARE) != null)
+ ? Boolean.valueOf(params.get(AnalyzingLookupFactory.FUZZY_UNICODE_AWARE).toString())
+ : false;
+
int options = 0;
if (exactMatchFirst) {
options |= FuzzySuggester.EXACT_FIRST;
@@ -87,6 +95,9 @@
if (preserveSep) {
options |= FuzzySuggester.PRESERVE_SEP;
}
+ if (fuzzyUnicodeAware) {
+ options |= FuzzySuggester.FUZZY_UNICODE_AWARE;
+ }
int maxSurfaceFormsPerAnalyzedForm = (params.get(AnalyzingLookupFactory.MAX_SURFACE_FORMS) != null)
? Integer.parseInt(params.get(AnalyzingLookupFactory.MAX_SURFACE_FORMS).toString())