diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
index 4311ece..7369ff4 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
@@ -750,7 +750,7 @@ public class Automaton implements Cloneable {
   static public Automaton union(Collection<Automaton> l) {
     return BasicOperations.union(l);
   }
-  
+
   /**
    * See {@link BasicOperations#determinize(Automaton)}.
    */
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
index 7db891a..df144c4 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
@@ -30,6 +30,7 @@
 package org.apache.lucene.util.automaton;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 
 /**
@@ -239,4 +240,21 @@ final public class BasicAutomata {
     a.deterministic = true;
     return a;
   }
+
+  /**
+   * Returns a new (deterministic and minimal) automaton that accepts the union of the
+   * given set of strings, or the empty automaton if no strings are given. The input array
+   * is sorted in-place with {@link StringUnionOperations#LEXICOGRAPHIC_ORDER}, i.e. modified.
+   * @see StringUnionOperations
+   */
+  public static Automaton makeStringUnion(CharSequence... strings) {
+    if (strings.length == 0)
+      return makeEmpty();
+    Arrays.sort(strings, StringUnionOperations.LEXICOGRAPHIC_ORDER);
+    Automaton a = new Automaton();
+    a.initial = StringUnionOperations.build(strings);
+    a.setDeterministic(true);
+    a.reduce();
+    return a;
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringUnionOperations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringUnionOperations.java
new file mode 100644
index 0000000..a199a57
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringUnionOperations.java
@@ -0,0 +1,361 @@
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.IdentityHashMap;
+
+/**
+ * Operations for building minimal deterministic automata from sets of strings.
+ * The algorithm requires sorted input data, but is very fast (nearly linear with the input size).
+ *
+ * @author Dawid Weiss
+ */
+final public class StringUnionOperations {
+
+  /**
+   * Lexicographic order of input sequences: sequences are compared
+   * <code>char</code> by <code>char</code>; if one sequence is a prefix of the
+   * other, the shorter one sorts first.
+   */
+  public final static Comparator<CharSequence> LEXICOGRAPHIC_ORDER = new Comparator<CharSequence>() {
+    public int compare(CharSequence s1, CharSequence s2) {
+      final int lens1 = s1.length();
+      final int lens2 = s2.length();
+      final int max = Math.min(lens1, lens2);
+
+      for (int i = 0; i < max; i++) {
+        final char c1 = s1.charAt(i);
+        final char c2 = s2.charAt(i);
+        // chars are 16-bit and unsigned, so the int difference cannot overflow.
+        if (c1 != c2)
+          return c1 - c2;
+      }
+      return lens1 - lens2;
+    }
+  };
+
+  /**
+   * State with <code>char</code> labels on transitions.
+   */
+  final static class State {
+
+    /** An empty set of labels. */
+    private final static char[] NO_LABELS = new char[0];
+
+    /** An empty set of states. */
+    private final static State[] NO_STATES = new State[0];
+
+    /**
+     * Labels of outgoing transitions. Indexed identically to {@link #states}.
+     * Labels must be sorted lexicographically.
+     */
+    char[] labels = NO_LABELS;
+
+    /**
+     * States reachable from outgoing transitions.
+     * Indexed identically to {@link #labels}.
+     */
+    State[] states = NO_STATES;
+
+    /**
+     * <code>true</code> if this state corresponds to the end of at least one
+     * input sequence.
+     */
+    boolean is_final;
+
+    /**
+     * Returns the target state of a transition leaving this state and labeled
+     * with <code>label</code>. If no such transition exists, returns
+     * <code>null</code>.
+     */
+    public State getState(char label) {
+      // Relies on labels being kept sorted (see the labels field).
+      final int index = Arrays.binarySearch(labels, label);
+      return index >= 0 ? states[index] : null;
+    }
+
+    /**
+     * Returns an array of outgoing transition labels. The array is sorted in
+     * lexicographic order and indexes correspond to states returned from
+     * {@link #getStates()}. The internal array is returned without copying.
+     */
+    public char [] getTransitionLabels() {
+      return this.labels;
+    }
+
+    /**
+     * Returns an array of outgoing transitions from this state. The returned
+     * array must not be changed (it is the internal array, not a copy).
+     */
+    public State[] getStates() {
+      return this.states;
+    }
+
+    /**
+     * Two states are equal if:
+     * <ul>
+     * <li>they are both final or both non-final,</li>
+     * <li>they have an identical number of outgoing transitions, labeled with
+     * the same labels</li>
+     * <li>corresponding outgoing transitions lead to the same states (to states
+     * with an identical right-language).</li>
+     * </ul>
+     * Any argument that is not a <code>State</code> (including
+     * <code>null</code>) is not equal to this state.
+     */
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj)
+        return true;
+      // Honor the Object.equals contract instead of failing on the cast.
+      if (!(obj instanceof State))
+        return false;
+      final State other = (State) obj;
+      return is_final == other.is_final
+          && Arrays.equals(this.labels, other.labels)
+          && referenceEquals(this.states, other.states);
+    }
+
+    /**
+     * Return <code>true</code> if this state has any children (outgoing
+     * transitions).
+     */
+    public boolean hasChildren() {
+      return labels.length > 0;
+    }
+
+    /**
+     * Is this state a final state in the automaton?
+     */
+    public boolean isFinal() {
+      return is_final;
+    }
+
+    /**
+     * Compute the hash code of the <i>current</i> status of this state. The
+     * value changes when transitions are added or replaced or when finality
+     * changes.
+     */
+    @Override
+    public int hashCode() {
+      int hash = is_final ? 1 : 0;
+
+      hash ^= hash * 31 + this.labels.length;
+      for (char c : this.labels)
+        hash ^= hash * 31 + c;
+
+      /*
+       * Compare the right-language of this state using reference-identity of
+       * outgoing states. This is possible because states are interned (stored
+       * in registry) and traversed in post-order, so any outgoing transitions
+       * are already interned.
+       */
+      for (State s : this.states) {
+        hash ^= System.identityHashCode(s);
+      }
+
+      return hash;
+    }
+
+    /**
+     * Create a new outgoing transition labeled <code>label</code> and return
+     * the newly created target state for this transition. The transition is
+     * appended after all existing ones.
+     */
+    State newState(char label) {
+      assert Arrays.binarySearch(labels, label) < 0 : "State already has transition labeled: "
+          + label;
+
+      // Grow both parallel arrays by one slot and fill the new last slot.
+      labels = copyOf(labels, labels.length + 1);
+      states = copyOf(states, states.length + 1);
+
+      labels[labels.length - 1] = label;
+      return states[states.length - 1] = new State();
+    }
+
+    /**
+     * Return the most recent transition's target state.
+     */
+    State lastChild() {
+      assert hasChildren() : "No outgoing transitions.";
+      return states[states.length - 1];
+    }
+
+    /**
+     * Return the associated state if the most recent transition
+     * is labeled with <code>label</code>.
+     * Returns <code>null</code> when there are no transitions or the most
+     * recent one carries a different label.
+     */
+    State lastChild(char label) {
+      final int index = labels.length - 1;
+      State s = null;
+      if (index >= 0 && labels[index] == label) {
+        s = states[index];
+      }
+      // Sanity check: looking only at the last slot must agree with a full lookup.
+      assert s == getState(label);
+      return s;
+    }
+
+    /**
+     * Replace the last added outgoing transition's target state with the given
+     * state.
+     */
+    void replaceLastChild(State state) {
+      assert hasChildren() : "No outgoing transitions.";
+      states[states.length - 1] = state;
+    }
+
+    /**
+     * JDK1.5-replacement of {@link Arrays#copyOf(char[], int)}
+     */
+    private static char[] copyOf(char[] original, int newLength) {
+      char[] copy = new char[newLength];
+      System.arraycopy(original, 0, copy, 0, Math.min(original.length,
+          newLength));
+      return copy;
+    }
+
+    /**
+     * JDK1.5-replacement of {@link Arrays#copyOf(Object[], int)} for
+     * <code>State</code> arrays.
+     */
+    public static State[] copyOf(State[] original, int newLength) {
+      State[] copy = new State[newLength];
+      System.arraycopy(original, 0, copy, 0, Math.min(original.length, newLength));
+      return copy;
+    }
+
+    /**
+     * Compare two lists of objects for reference-equality: same length and
+     * the same object instance at every index.
+     */
+    private static boolean referenceEquals(Object[] a1, Object[] a2) {
+      if (a1.length != a2.length)
+        return false;
+
+      for (int i = 0; i < a1.length; i++)
+        if (a1[i] != a2[i])
+          return false;
+
+      return true;
+    }
+  }
+
+  /**
+   * "register" for state interning. Set to <code>null</code> once the
+   * automaton has been completed.
+   */
+  private HashMap<State, State> register = new HashMap<State, State>();
+
+  /**
+   * Root automaton state.
+   */
+  private State root = new State();
+
+  /**
+   * Previous sequence added to the automaton in {@link #add(CharSequence)}.
+   */
+  private StringBuilder previous;
+
+  /**
+   * Add another character sequence to this automaton. The sequence must be
+   * lexicographically larger or equal compared to any previous sequences
+   * added to this automaton (the input must be sorted).
+   * The preconditions (automaton not yet completed, non-empty input, sorted
+   * order) are checked with assertions only.
+   */
+  public void add(CharSequence current) {
+    assert register != null : "Automaton already built.";
+    assert current.length() > 0 : "Input sequences must not be empty.";
+    assert previous == null || LEXICOGRAPHIC_ORDER.compare(previous, current) <= 0 :
+      "Input must be sorted: " + previous + " >= " + current;
+    // setPrevious always returns true; wrapping it in an assert means the copy
+    // is only made when assertions are enabled.
+    assert setPrevious(current);
+
+    // Descend in the automaton (find matching prefix).
+    int pos = 0, max = current.length();
+    State next, state = root;
+    while (pos < max && (next = state.lastChild(current.charAt(pos))) != null) {
+      state = next;
+      pos++;
+    }
+
+    // Intern the previously added branch below the divergence point before
+    // a new suffix is attached.
+    if (state.hasChildren())
+      replaceOrRegister(state);
+
+    addSuffix(state, current, pos);
+  }
+
+  /**
+   * Finalize the automaton and return the root state. No more strings can be
+   * added to the builder after this call.
+   *
+   * @return Root automaton state.
+   * @throws IllegalStateException if this method has already been called.
+   */
+  public State complete() {
+    if (this.register == null)
+      throw new IllegalStateException();
+
+    if (root.hasChildren())
+      replaceOrRegister(root);
+
+    // Dropping the register marks the builder as completed.
+    register = null;
+    return root;
+  }
+
+  /**
+   * Internal recursive traversal for conversion. Each builder state is
+   * converted once; <code>visited</code> maps already converted builder
+   * states (by identity) to their converted counterparts so that shared
+   * states stay shared.
+   */
+  private static org.apache.lucene.util.automaton.State convert(State s,
+      IdentityHashMap<State, org.apache.lucene.util.automaton.State> visited) {
+    org.apache.lucene.util.automaton.State converted = visited.get(s);
+    if (converted != null)
+      return converted;
+
+    converted = new org.apache.lucene.util.automaton.State();
+    converted.setAccept(s.is_final);
+
+    // Record the mapping before recursing into the targets.
+    visited.put(s, converted);
+    int i = 0;
+    char [] labels = s.labels;
+    for (StringUnionOperations.State target : s.states) {
+      converted.addTransition(new Transition(labels[i++], convert(target, visited)));
+    }
+
+    return converted;
+  }
+
+  /**
+   * Build a minimal, deterministic automaton from a sorted list of strings.
+   * Every element of <code>input</code> is passed to
+   * {@link #add(CharSequence)} in array order, then the resulting state graph
+   * is converted and its root returned.
+   */
+  public static org.apache.lucene.util.automaton.State build(CharSequence[] input) {
+    final StringUnionOperations builder = new StringUnionOperations();
+
+    for (CharSequence chs : input)
+      builder.add(chs);
+
+    return convert(builder.complete(),
+        new IdentityHashMap<State, org.apache.lucene.util.automaton.State>());
+  }
+
+  /**
+   * Copy <code>current</code> into an internal buffer.
+   *
+   * @return always <code>true</code>, so the call can be placed in an assert.
+   */
+  private boolean setPrevious(CharSequence current) {
+    if (previous == null)
+      previous = new StringBuilder();
+
+    previous.setLength(0);
+    previous.append(current);
+
+    return true;
+  }
+
+  /**
+   * Replace last child of <code>state</code> with an already registered
+   * state or register the last child state. The last child's own last
+   * child is processed first (recursively), so states are handled bottom-up.
+   */
+  private void replaceOrRegister(State state) {
+    final State child = state.lastChild();
+
+    if (child.hasChildren())
+      replaceOrRegister(child);
+
+    final State registered = register.get(child);
+    if (registered != null) {
+      state.replaceLastChild(registered);
+    } else {
+      register.put(child, child);
+    }
+  }
+
+  /**
+   * Add a suffix of <code>current</code> starting at <code>fromIndex</code>
+   * (inclusive) to state <code>state</code>.
+   * The state reached after the last character is marked final; when
+   * <code>fromIndex</code> equals the length of <code>current</code>,
+   * <code>state</code> itself is marked final.
+   */
+  private void addSuffix(State state, CharSequence current, int fromIndex) {
+    final int len = current.length();
+    for (int i = fromIndex; i < len; i++) {
+      state = state.newState(current.charAt(i));
+    }
+    state.is_final = true;
+  }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringUnion.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringUnion.java
new file mode 100644
index 0000000..b7b2296
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringUnion.java
@@ -0,0 +1,83 @@
+package org.apache.lucene.util.automaton;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Random;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+/**
+ * Checks that {@link BasicAutomata#makeStringUnion(CharSequence...)} accepts
+ * the same language as a union of single-string automata that is minimized
+ * afterwards.
+ */
+public class TestStringUnion extends LuceneTestCase {
+  /** A pair of ints stored so that <code>min &lt;= max</code>. */
+  private final static class MinMax {
+    public final int min;
+    public final int max;
+
+    /** The two arguments may be given in either order. */
+    public MinMax(int min, int max) {
+      this.min = Math.min(min, max);
+      this.max = Math.max(min, max);
+    }
+
+    /** Returns <code>max - min</code>. */
+    public int range() {
+      return max - min;
+    }
+  }
+
+  /** Four fixed words sharing the suffix "rt". */
+  @Test
+  public void testSimpleFourWords() {
+    String[] input = {"art", "fart", "flirt", "start"};
+    Automaton a = BasicAutomata.makeStringUnion(input);
+    Automaton b = makeUnion(input);
+    assertTrue(BasicOperations.sameLanguage(a, b));
+  }
+
+  /** No input strings at all. */
+  @Test
+  public void testEmpty() {
+    String[] input = {};
+    Automaton a = BasicAutomata.makeStringUnion(input);
+    Automaton b = makeUnion(input);
+    assertTrue(BasicOperations.sameLanguage(a, b));
+  }
+
+  /**
+   * Random input. NOTE: despite the method name, 2000 strings are generated.
+   */
+  @Test
+  public void testRandom5000() {
+    String[] input = generateRandom(2000, new MinMax(2, 10), new MinMax('a', 'z'));
+    long start = System.currentTimeMillis();
+    Automaton a = BasicAutomata.makeStringUnion(input);
+    long daciuk = System.currentTimeMillis();
+    Automaton b = makeUnion(input);
+    long union = System.currentTimeMillis();
+    // Timing is informational only; keep regular test runs quiet.
+    // NOTE(review): assumes LuceneTestCase exposes VERBOSE in this branch - confirm.
+    if (VERBOSE) {
+      System.out.println("Daciuk: " + (daciuk - start) + ", union: " + (union - daciuk));
+    }
+    assertTrue(BasicOperations.sameLanguage(a, b));
+  }
+
+
+  /**
+   * Reference implementation: one automaton per input string, combined with
+   * {@link Automaton#union} and then minimized.
+   */
+  private Automaton makeUnion(String[] input) {
+    ArrayList<Automaton> automata = new ArrayList<Automaton>();
+    for (String s : input)
+      automata.add(BasicAutomata.makeString(s));
+    return Automaton.minimize(Automaton.union(automata));
+  }
+
+  /**
+   * Generate a sorted list of random sequences.
+   *
+   * @param count number of strings to generate (duplicates are possible)
+   * @param length bounds for the length of each string
+   * @param alphabet bounds for the character values used
+   */
+  private String[] generateRandom(int count, MinMax length, MinMax alphabet) {
+    // NOTE(review): 'random' is not declared in this class; presumably inherited
+    // from LuceneTestCase - confirm.
+    final Random rnd = random;
+    final String[] input = new String[count];
+    for (int i = 0; i < count; i++) {
+      input[i] = randomString(rnd, length, alphabet);
+    }
+    Arrays.sort(input);
+    return input;
+  }
+
+  /**
+   * Generate a random string whose length lies in
+   * <code>[length.min, length.max]</code> and whose characters lie in
+   * <code>[alphabet.min, alphabet.max]</code> (both bounds inclusive).
+   */
+  private static String randomString(Random rnd, MinMax length, MinMax alphabet) {
+    // range() + 1 makes the upper bound reachable and keeps the argument of
+    // nextInt positive even when min == max.
+    char[] chars = new char[length.min + rnd.nextInt(length.range() + 1)];
+    for (int i = 0; i < chars.length; i++) {
+      chars[i] = (char) (alphabet.min + rnd.nextInt(alphabet.range() + 1));
+    }
+    return new String(chars);
+  }
+}
\ No newline at end of file