diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java index 936d4ed..73406e4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java @@ -46,6 +46,7 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.SlicedTransitions; import org.apache.lucene.util.automaton.Transition; // TODO: @@ -861,11 +862,13 @@ public final class DirectPostingsFormat extends PostingsFormat { private final CompiledAutomaton compiledAutomaton; private int termOrd; private final BytesRef scratch = new BytesRef(); + private final SlicedTransitions slicedTransitions; private final class State { int changeOrd; int state; - Transition[] transitions; + int transitionsStart; + int transitionsEnd; int transitionUpto; int transitionMax; int transitionMin; @@ -879,11 +882,13 @@ public final class DirectPostingsFormat extends PostingsFormat { compiledAutomaton = compiled; termOrd = -1; states = new State[1]; + slicedTransitions = compiledAutomaton.slicedTransitions; states[0] = new State(); states[0].changeOrd = terms.length; states[0].state = runAutomaton.getInitialState(); - states[0].transitions = compiledAutomaton.sortedTransitions[states[0].state]; - states[0].transitionUpto = -1; + states[0].transitionsStart = slicedTransitions.from[states[0].state]; + states[0].transitionsEnd = slicedTransitions.from[states[0].state+1]; + states[0].transitionUpto = -3; states[0].transitionMax = -1; //System.out.println("IE.init startTerm=" + startTerm); @@ -902,10 +907,11 @@ public final class DirectPostingsFormat extends PostingsFormat { final int label = startTerm.bytes[startTerm.offset+i] & 0xFF; while (label > states[i].transitionMax) { - states[i].transitionUpto++; - assert states[i].transitionUpto < states[i].transitions.length; - states[i].transitionMin = states[i].transitions[states[i].transitionUpto].getMin(); - states[i].transitionMax = states[i].transitions[states[i].transitionUpto].getMax(); + states[i].transitionUpto += 3; + assert states[i].transitionsStart + states[i].transitionUpto < states[i].transitionsEnd; + int base = states[i].transitionsStart + states[i].transitionUpto; + states[i].transitionMin = slicedTransitions.transitions[base]; + states[i].transitionMax = slicedTransitions.transitions[base+1]; assert states[i].transitionMin >= 0; assert states[i].transitionMin <= 255; assert states[i].transitionMax >= 0; @@ -962,8 +968,9 @@ public final class DirectPostingsFormat extends PostingsFormat { stateUpto++; states[stateUpto].changeOrd = skips[skipOffset + skipUpto++]; states[stateUpto].state = nextState; - states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState]; - states[stateUpto].transitionUpto = -1; + states[stateUpto].transitionsStart = slicedTransitions.from[nextState]; + states[stateUpto].transitionsEnd = slicedTransitions.from[nextState+1]; + states[stateUpto].transitionUpto = -3; states[stateUpto].transitionMax = -1; //System.out.println(" push " + states[stateUpto].transitions.length + " trans"); @@ -1119,8 +1126,8 @@ public final class DirectPostingsFormat extends PostingsFormat { while (label > state.transitionMax) { //System.out.println(" label=" + label + " vs 
max=" + state.transitionMax + " transUpto=" + state.transitionUpto + " vs " + state.transitions.length); - state.transitionUpto++; - if (state.transitionUpto == state.transitions.length) { + state.transitionUpto+=3; + if (state.transitionUpto + state.transitionsStart == state.transitionsEnd) { // We've exhausted transitions leaving this // state; force pop+next/skip now: //System.out.println("forcepop: stateUpto=" + stateUpto); @@ -1139,9 +1146,10 @@ public final class DirectPostingsFormat extends PostingsFormat { } continue nextTerm; } - assert state.transitionUpto < state.transitions.length: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitions.length; - state.transitionMin = state.transitions[state.transitionUpto].getMin(); - state.transitionMax = state.transitions[state.transitionUpto].getMax(); + assert state.transitionUpto+state.transitionsStart < state.transitionsEnd: " state.transitionUpto=" + state.transitionUpto+state.transitionsStart + " vs " + state.transitionsEnd; + int base = state.transitionUpto+state.transitionsStart; + state.transitionMin = slicedTransitions.transitions[base]; + state.transitionMax = slicedTransitions.transitions[base+1]; assert state.transitionMin >= 0; assert state.transitionMin <= 255; assert state.transitionMax >= 0; @@ -1239,8 +1247,9 @@ public final class DirectPostingsFormat extends PostingsFormat { stateUpto++; states[stateUpto].state = nextState; states[stateUpto].changeOrd = skips[skipOffset + skipUpto++]; - states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState]; - states[stateUpto].transitionUpto = -1; + states[stateUpto].transitionsStart = slicedTransitions.from[states[nextState].state]; + states[stateUpto].transitionsEnd = slicedTransitions.from[states[nextState].state+1]; + states[stateUpto].transitionUpto = -3; states[stateUpto].transitionMax = -1; if (stateUpto == termLength) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java index 3360602..2624813 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java @@ -50,6 +50,7 @@ import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.SlicedTransitions; import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.FST; @@ -611,7 +612,9 @@ public class BlockTreeTermsReader extends FieldsProducer { int numFollowFloorBlocks; int nextFloorLabel; - Transition[] transitions; + SlicedTransitions slicedTransitions; + int transitionStart; + int transitionEnd; int curTransitionMax; int transitionIndex; @@ -645,17 +648,19 @@ public class BlockTreeTermsReader extends FieldsProducer { nextFloorLabel = 256; } // if (DEBUG) System.out.println(" nextFloorLabel=" + (char) nextFloorLabel); - } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[transitionIndex].getMin()); + } while (numFollowFloorBlocks != 0 && nextFloorLabel <= slicedTransitions.transitions[transitionIndex]); // getMin load(null); } public void setState(int state) { this.state = state; - transitionIndex = 0; - transitions = compiledAutomaton.sortedTransitions[state]; - if (transitions.length != 0) { - 
curTransitionMax = transitions[0].getMax(); + slicedTransitions = compiledAutomaton.slicedTransitions; + transitionStart = slicedTransitions.from[state]; + transitionEnd = slicedTransitions.from[state+1]; + transitionIndex = transitionStart; + if (transitionStart != transitionEnd) { + curTransitionMax = slicedTransitions.transitions[transitionIndex+1]; // getMax } else { curTransitionMax = -1; } @@ -665,7 +670,7 @@ public class BlockTreeTermsReader extends FieldsProducer { // if (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state)); - if (frameIndexData != null && transitions.length != 0) { + if (frameIndexData != null && transitionStart != transitionEnd) { // Floor frame if (floorData.length < frameIndexData.length) { this.floorData = new byte[ArrayUtil.oversize(frameIndexData.length, 1)]; @@ -684,7 +689,7 @@ public class BlockTreeTermsReader extends FieldsProducer { // first block in case it has empty suffix: if (!runAutomaton.isAccept(state)) { // Maybe skip floor blocks: - while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[0].getMin()) { + while (numFollowFloorBlocks != 0 && nextFloorLabel <= slicedTransitions.transitions[transitionIndex]/*getMin()*/) { fp = fpOrig + (floorDataReader.readVLong() >>> 1); numFollowFloorBlocks--; // if (DEBUG) System.out.println(" skip floor block! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks); @@ -1101,7 +1106,7 @@ public class BlockTreeTermsReader extends FieldsProducer { if (currentFrame.suffix != 0) { final int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff; while (label > currentFrame.curTransitionMax) { - if (currentFrame.transitionIndex >= currentFrame.transitions.length-1) { + if (currentFrame.transitionIndex >= currentFrame.transitionEnd-3) { // Stop processing this frame -- no further // matches are possible because we've moved // beyond what the max transition will allow @@ -1112,8 +1117,8 @@ public class BlockTreeTermsReader extends FieldsProducer { currentFrame.nextEnt = currentFrame.entCount; continue nextTerm; } - currentFrame.transitionIndex++; - currentFrame.curTransitionMax = currentFrame.transitions[currentFrame.transitionIndex].getMax(); + currentFrame.transitionIndex += 3; + currentFrame.curTransitionMax = currentFrame.slicedTransitions.transitions[currentFrame.transitionIndex+1]/*getMax()*/; //if (DEBUG) System.out.println(" next trans=" + currentFrame.transitions[currentFrame.transitionIndex]); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java index 980fce8..010e967 100644 --- a/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java @@ -25,7 +25,7 @@ import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.CompiledAutomaton; -import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.SlicedTransitions; /** * A FilteredTermsEnum that enumerates terms based upon what is accepted by a @@ -52,7 +52,7 @@ class AutomatonTermsEnum extends FilteredTermsEnum { // true if the automaton accepts a finite language 
private final boolean finite; // array of sorted transitions for each state, indexed by state number - private final Transition[][] allTransitions; +// private final Transition[][] allTransitions; // for path tracking: each long records gen when we last // visited the state; we use gens to avoid having to clear private final long[] visited; @@ -66,6 +66,7 @@ class AutomatonTermsEnum extends FilteredTermsEnum { private boolean linear = false; private final BytesRef linearUpperBound = new BytesRef(10); private final Comparator termComp; + private final SlicedTransitions slicedTransitions; /** * Construct an enumerator based upon an automaton, enumerating the specified @@ -81,7 +82,7 @@ class AutomatonTermsEnum extends FilteredTermsEnum { this.runAutomaton = compiled.runAutomaton; assert this.runAutomaton != null; this.commonSuffixRef = compiled.commonSuffixRef; - this.allTransitions = compiled.sortedTransitions; + this.slicedTransitions = compiled.slicedTransitions; // used for path tracking, where each bit is a numbered state. visited = new long[runAutomaton.getSize()]; @@ -142,11 +143,14 @@ class AutomatonTermsEnum extends FilteredTermsEnum { state = runAutomaton.step(state, seekBytesRef.bytes[i] & 0xff); assert state >= 0: "state=" + state; } - for (int i = 0; i < allTransitions[state].length; i++) { - Transition t = allTransitions[state][i]; - if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) && - (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) { - maxInterval = t.getMax(); + int start = slicedTransitions.from[state]; + int end = slicedTransitions.from[state+1]; + for (int i = start; i < end; i++) { + final int min = slicedTransitions.transitions[i]; + final int max = slicedTransitions.transitions[i+1]; + if (min <= (seekBytesRef.bytes[position] & 0xff) && + (seekBytesRef.bytes[position] & 0xff) <= max) { + maxInterval = max; break; } } @@ -254,19 +258,21 @@ class AutomatonTermsEnum extends FilteredTermsEnum { seekBytesRef.length = position; visited[state] = curGen; - Transition transitions[] = allTransitions[state]; + int start = slicedTransitions.from[state]; + int end = slicedTransitions.from[state+1]; // find the minimal path (lexicographic order) that is >= c - for (int i = 0; i < transitions.length; i++) { - Transition transition = transitions[i]; - if (transition.getMax() >= c) { - int nextChar = Math.max(c, transition.getMin()); + for (int i = start; i < end; i+=3) { + final int max = slicedTransitions.transitions[i+1]; + if (max >= c) { + final int min = slicedTransitions.transitions[i]; + int nextChar = Math.max(c, min); // append either the next sequential char, or the minimum transition seekBytesRef.grow(seekBytesRef.length + 1); seekBytesRef.length++; seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) nextChar; - state = transition.getDest().getNumber(); + state = slicedTransitions.transitions[i+2]; /* * as long as is possible, continue down the minimal path in * lexicographic order. if a loop or accept state is encountered, stop. @@ -278,13 +284,13 @@ class AutomatonTermsEnum extends FilteredTermsEnum { * so the below is ok, if it is not an accept state, * then there MUST be at least one transition. 
*/ - transition = allTransitions[state][0]; - state = transition.getDest().getNumber(); + int offset = slicedTransitions.from[state]; + state = slicedTransitions.transitions[offset+2]; // append the minimum transition seekBytesRef.grow(seekBytesRef.length + 1); seekBytesRef.length++; - seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.getMin(); + seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) slicedTransitions.transitions[offset]; // we found a loop, record it for faster enumeration if (!finite && !linear && visited[state] == curGen) { diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java index 4f52f36..92f3e90 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -34,6 +34,7 @@ import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.BasicAutomata; @@ -165,21 +166,15 @@ public class FuzzyTermsEnum extends TermsEnum { private List<CompiledAutomaton> initAutomata(int maxDistance) { final List<CompiledAutomaton> runAutomata = dfaAtt.automata(); //System.out.println("cached automata size: " + runAutomata.size()); + IntsRef prefix = realPrefixLength > 0 ? new IntsRef(termText, 0, realPrefixLength) : null; if (runAutomata.size() <= maxDistance && maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { LevenshteinAutomata builder = new LevenshteinAutomata(UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength), transpositions); for (int i = runAutomata.size(); i <= maxDistance; i++) { - Automaton a = builder.toAutomaton(i); - //System.out.println("compute automaton n=" + i); - // constant prefix - if (realPrefixLength > 0) { - Automaton prefix = BasicAutomata.makeString( - UnicodeUtil.newString(termText, 0, realPrefixLength)); - a = BasicOperations.concatenate(prefix, a); - } - runAutomata.add(new CompiledAutomaton(a, true, false)); + CompiledAutomaton automata = builder.toRunAutomaton(prefix, i); + runAutomata.add(automata); } } return runAutomata; diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java index f943489..392ba83 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java @@ -773,4 +773,32 @@ public class Automaton implements Cloneable { MinimizationOperations.minimize(a); return a; } + + public SlicedTransitions getSlicedTransitions() { + final State[] states = getNumberedStates(); + final int[] stateOffsets = new int[states.length+1]; + boolean[] accept = new boolean[states.length]; + int numTrans = 0; + for (State s : states) { + s.sortTransitions(Transition.CompareByMinMaxThenDest); + s.trimTransitionsArray(); + numTrans += s.transitionsArray.length; + assert s.transitionsArray != null; + } + int[] transitions = new int[numTrans*3]; + int offset = 0; + for (int i = 0; i < states.length; i++) { + State state = states[i]; + accept[i] = state.isAccept(); + stateOffsets[i] = offset; + Transition[] trans = state.transitionsArray; + for (int j = 0; j < trans.length; j++) { + transitions[offset++] = 
trans[j].getMin(); + transitions[offset++] = trans[j].getMax(); + transitions[offset++] = trans[j].to.number; + } + } + stateOffsets[stateOffsets.length-1] = offset; + return new SlicedTransitions(stateOffsets, transitions, states.length, accept); + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java index 8c8d68a..8f680f7 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java @@ -30,6 +30,11 @@ public class ByteRunAutomaton extends RunAutomaton { public ByteRunAutomaton(Automaton a, boolean utf8) { super(utf8 ? a : new UTF32ToUTF8().convert(a), 256, true); } + + // nocommit + ByteRunAutomaton(SlicedTransitions slices) { + super(slices, 256, true); + } /** * Returns true if the given byte array is accepted by this automaton diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java index d606116..9bcdcdd 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java @@ -72,7 +72,8 @@ public class CompiledAutomaton { * {@link #runAutomaton}. * Only valid for {@link AUTOMATON_TYPE#NORMAL}. */ - public final Transition[][] sortedTransitions; + //nocommit - document.... + public final SlicedTransitions slicedTransitions; /** * Shared common suffix accepted by the automaton. Only valid * for {@link AUTOMATON_TYPE#NORMAL}, and only when the @@ -102,7 +103,7 @@ public class CompiledAutomaton { term = null; commonSuffixRef = null; runAutomaton = null; - sortedTransitions = null; + slicedTransitions = null; this.finite = null; return; } else if (BasicOperations.isTotal(automaton)) { @@ -111,7 +112,7 @@ public class CompiledAutomaton { term = null; commonSuffixRef = null; runAutomaton = null; - sortedTransitions = null; + slicedTransitions = null; this.finite = null; return; } else { @@ -136,7 +137,7 @@ public class CompiledAutomaton { term = new BytesRef(singleton); commonSuffixRef = null; runAutomaton = null; - sortedTransitions = null; + slicedTransitions = null; this.finite = null; return; } else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate( @@ -146,7 +147,7 @@ public class CompiledAutomaton { term = new BytesRef(commonPrefix); commonSuffixRef = null; runAutomaton = null; - sortedTransitions = null; + slicedTransitions = null; this.finite = null; return; } @@ -167,7 +168,16 @@ public class CompiledAutomaton { commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8); } runAutomaton = new ByteRunAutomaton(utf8, true); - sortedTransitions = utf8.getSortedTransitions(); + slicedTransitions = utf8.getSlicedTransitions(); + } + + CompiledAutomaton(SlicedTransitions transitions, ByteRunAutomaton runAutomaton) { + this.runAutomaton = runAutomaton; + slicedTransitions = transitions; + commonSuffixRef = null; + this.finite = true; + type = AUTOMATON_TYPE.NORMAL; + term = null; } //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; @@ -176,21 +186,23 @@ public class CompiledAutomaton { // Find biggest transition that's < label // TODO: use binary search here - Transition maxTransition = null; - for (Transition transition : sortedTransitions[state]) { - if (transition.min < leadLabel) { - maxTransition = 
transition; - } + int maxTransition = -1; + int offset = slicedTransitions.from[state]; + int end = slicedTransitions.from[state+1]; + for (int i = offset; i < end; i+=3) { + if (slicedTransitions.transitions[i] < leadLabel) { + maxTransition = i; + } } - assert maxTransition != null; + assert maxTransition >= 0 ; // Append floorLabel final int floorLabel; - if (maxTransition.max > leadLabel-1) { + if (slicedTransitions.transitions[maxTransition+1] > leadLabel-1) { floorLabel = leadLabel-1; } else { - floorLabel = maxTransition.max; + floorLabel = slicedTransitions.transitions[maxTransition+1]; } if (idx >= term.bytes.length) { term.grow(1+idx); @@ -198,13 +210,14 @@ public class CompiledAutomaton { //if (DEBUG) System.out.println(" add floorLabel=" + (char) floorLabel + " idx=" + idx); term.bytes[idx] = (byte) floorLabel; - state = maxTransition.to.getNumber(); + state = slicedTransitions.transitions[maxTransition+2]; idx++; // Push down to last accept state while (true) { - Transition[] transitions = sortedTransitions[state]; - if (transitions.length == 0) { + offset = slicedTransitions.from[state]; + end = slicedTransitions.from[state+1]; + if (offset == end) { assert runAutomaton.isAccept(state); term.length = idx; //if (DEBUG) System.out.println(" return " + term.utf8ToString()); @@ -212,14 +225,15 @@ public class CompiledAutomaton { } else { // We are pushing "top" -- so get last label of // last transition: - assert transitions.length != 0; - Transition lastTransition = transitions[transitions.length-1]; + assert offset < end : "offset: " + offset + " end: " + end + " state: " + state + " numStates: " + slicedTransitions.numStates; + int lastMax = slicedTransitions.transitions[end-2]; + int lastTo = slicedTransitions.transitions[end-1]; if (idx >= term.bytes.length) { term.grow(1+idx); } //if (DEBUG) System.out.println(" push maxLabel=" + (char) lastTransition.max + " idx=" + idx); - term.bytes[idx] = (byte) lastTransition.max; - state = lastTransition.to.getNumber(); + term.bytes[idx] = (byte) lastMax; + state = lastTo; idx++; } } @@ -300,13 +314,14 @@ public class CompiledAutomaton { // Pop back to a state that has a transition // <= our label: while (true) { - Transition[] transitions = sortedTransitions[state]; - if (transitions.length == 0) { + int offset = slicedTransitions.from[state]; + int end = slicedTransitions.from[state+1]; + if (offset == end) { assert runAutomaton.isAccept(state); output.length = idx; //if (DEBUG) System.out.println(" return " + output.utf8ToString()); return output; - } else if (label-1 < transitions[0].min) { + } else if (label-1 < slicedTransitions.transitions[offset]) { if (runAutomaton.isAccept(state)) { output.length = idx; diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java index 92384c4..5e75192 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java @@ -21,6 +21,11 @@ import java.util.Iterator; import java.util.SortedSet; import java.util.TreeSet; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.IntBlockPool; +import org.apache.lucene.util.SorterTemplate; +import org.apache.lucene.util.IntsRef; + /** * Class to construct DFAs that match a word within some edit distance. *

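Note on the hunks so far: every consumer (DirectPostingsFormat, BlockTreeTermsReader, AutomatonTermsEnum, CompiledAutomaton) switches from a per-state Transition[] to a slice of one flat int[] holding (min, max, dest) triplets, with from[state] and from[state+1] bounding the slice. The sketch below illustrates that lookup as described above; the from/transitions layout mirrors SlicedTransitions, while the class and method names are illustrative only and not part of the patch.

    // Minimal sketch, assuming the (min, max, dest) triplet layout produced by
    // Automaton.getSlicedTransitions(); names other than from/transitions are made up.
    final class TransitionSliceExample {
      // Finds the first triplet of `state` whose max >= label, mirroring the
      // "while (label > transitionMax) advance the cursor by 3" loops in the hunks above.
      static int firstTransitionAtLeast(int[] from, int[] transitions, int state, int label) {
        final int end = from[state + 1];
        for (int i = from[state]; i < end; i += 3) {   // each triplet is { min, max, dest }
          if (transitions[i + 1] >= label) {
            return i;
          }
        }
        return -1; // this state has no transition reaching `label` or anything above it
      }

      public static void main(String[] args) {
        // State 0 has two ranges, [a-c] -> state 1 and [x-z] -> state 0; state 1 has none.
        int[] from = {0, 6, 6};
        int[] transitions = {'a', 'c', 1, 'x', 'z', 0};
        int t = firstTransitionAtLeast(from, transitions, 0, 'd');
        System.out.println("triplet index " + t + ", min=" + (char) transitions[t]
            + ", max=" + (char) transitions[t + 1] + ", dest=" + transitions[t + 2]);
      }
    }

Because each state's triplets stay sorted by min, the enumerators above can advance with a simple i += 3 cursor (transitionUpto) instead of chasing Transition objects.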
@@ -42,7 +47,7 @@ public class LevenshteinAutomata { /* the ranges outside of alphabet */ final int rangeLower[]; final int rangeUpper[]; - int numRanges = 0; + final int numRanges; ParametricDescription descriptions[]; @@ -81,6 +86,7 @@ public class LevenshteinAutomata { // calculate the unicode range intervals that exclude the alphabet // these are the ranges for all unicode characters not in the alphabet int lower = 0; + int numRanges = 0; for (int i = 0; i < alphabet.length; i++) { int higher = alphabet[i]; if (higher > lower) { @@ -96,7 +102,7 @@ public class LevenshteinAutomata { rangeUpper[numRanges] = alphaMax; numRanges++; } - + this.numRanges = numRanges; descriptions = new ParametricDescription[] { null, /* for n=0, we do not need to go through the trouble */ withTranspositions ? new Lev1TParametricDescription(word.length) : new Lev1ParametricDescription(word.length), @@ -165,7 +171,7 @@ public class LevenshteinAutomata { for (int r = 0; r < numRanges; r++) states[k].addTransition(new Transition(rangeLower[r], rangeUpper[r], states[dest])); } - + Automaton a = new Automaton(states[0]); a.setDeterministic(true); // we create some useless unconnected states, and its a net-win overall to remove these, @@ -179,6 +185,288 @@ public class LevenshteinAutomata { return a; } + + public CompiledAutomaton toRunAutomaton(IntsRef prefix, int n) { + if (n == 0) { + if (prefix == null || prefix.length == 0) { + return new CompiledAutomaton(BasicAutomata.makeString(word, 0, word.length), true, false); + } + return new CompiledAutomaton(BasicAutomata.makeString(prefix.ints, prefix.offset, prefix.length). + concatenate(BasicAutomata.makeString(word, 0, word.length)), true, false); + } + + if (n >= descriptions.length) + return null; + final int[] transitionBuffer = new int[(alphabet.length + numRanges) * 3]; + int transitionUpTo = 0; + final int range = 2*n+1; + ParametricDescription description = descriptions[n]; + // the number of states is based on the length of the word and n + SliceTransitionBuilder builder = new SliceTransitionBuilder(description.size()); + // create all states, and mark as accept states if appropriate + final int numInitStates = description.size(); + // create transitions from state to state + for (int k = 0; k < numInitStates; k++) { + final int xpos = description.getPosition(k); + if (xpos < 0) + continue; + final int end = xpos + Math.min(word.length - xpos, range); + + for (int x = 0; x < alphabet.length; x++) { + final int ch = alphabet[x]; + // get the characteristic vector at this position wrt ch + final int cvec = getVector(ch, xpos, end); + int dest = description.transition(k, xpos, cvec); + if (dest >= 0) { + transitionBuffer[transitionUpTo++] = ch; + transitionBuffer[transitionUpTo++] = ch; + transitionBuffer[transitionUpTo++] = dest; + } + } + // add transitions for all other chars in unicode + // by definition, their characteristic vectors are always 0, + // because they do not exist in the input string. 
+ int dest = description.transition(k, xpos, 0); // by definition + if (dest >= 0) { + for (int r = 0; r < numRanges; r++) { + transitionBuffer[transitionUpTo++] = rangeLower[r]; + transitionBuffer[transitionUpTo++] = rangeUpper[r]; + transitionBuffer[transitionUpTo++] = dest; + } + } + builder.addBuffer(k, transitionBuffer, transitionUpTo); + transitionUpTo = 0; + + } + final SlicedTransitions slicedTransitions = builder.toSlicedTransitions(prefix, description); + final ByteRunAutomaton runAutomaton = new ByteRunAutomaton(slicedTransitions); + assert assertAutomatons(slicedTransitions, prefix, builder, description) : " prefix: " + prefix + " n: " + n; + return new CompiledAutomaton(slicedTransitions, runAutomaton); + } + + public static boolean assertAutomatons(SlicedTransitions transitions, IntsRef prefix, SliceTransitionBuilder builder, ParametricDescription description) { + boolean[] isAccept = new boolean[builder.numStates]; + int limit = description.size(); + for (int i = 0; i < limit; i++) { + isAccept[i] = description.isAccept(i); + } + Automaton automaton = builder.toAutomaton(isAccept); + automaton.setDeterministic(true); + if (prefix != null) { + automaton = new UTF32ToUTF8().convert(BasicAutomata.makeString(prefix.ints, prefix.offset, prefix.length)).concatenate(automaton); + automaton.setDeterministic(true); + } + return BasicOperations.sameLanguage(transitions.toAutomaton(), automaton); + } + + private static class SliceTransitionBuilder { + final UTF32ToUTF8.Transitions utf8Transitions = new UTF32ToUTF8.Transitions(); + final UTF32ToUTF8 utf32ToUTF8 = new UTF32ToUTF8(); + final IntBlockPool pool = new IntBlockPool(); + final IntBlockPool.SliceWriter writer = new IntBlockPool.SliceWriter(pool); + int[] stateStart; + int[] stateEnd; + int numStates; + int numTransitions; + + public SliceTransitionBuilder(int numUTF32States) { + stateStart = new int[numUTF32States]; + stateEnd = new int[numUTF32States]; + numStates = numUTF32States; + writer.startNewSlice();// ignore first slice to safe a fill + } + + public void addBuffer(int start, int[] transitionBuffer, int transitionUpTo) { + // TODO can we a.reduce() this on the fly? + /* nocommit + * - do we need the slice writer or can we sort and serialized into fixed length directly. + * - we know that the states transition here are fixed make use of it. 
+ * + */ + for (int i = 0; i < transitionUpTo; i++) { + int min = transitionBuffer[i++]; + int max = transitionBuffer[i++]; + int transDest = transitionBuffer[i]; + utf8Transitions.reset(numStates); + utf32ToUTF8.convertOneEdge(start, transDest, min, max, utf8Transitions); + stateStart = ArrayUtil.grow(stateStart, utf8Transitions.addedStates + utf8Transitions.stateCounter); + stateEnd = ArrayUtil.grow(stateEnd, utf8Transitions.addedStates + utf8Transitions.stateCounter); + for (int j = 0; j < utf8Transitions.offset; j++) { + int fromState = utf8Transitions.transitions[j++]; + int toState = utf8Transitions.transitions[j++]; + int minT = utf8Transitions.transitions[j++]; + int maxT = utf8Transitions.transitions[j]; + if (stateStart[fromState] <= 0) { + stateEnd[fromState] = stateStart[fromState] = writer.startNewSlice(); + } else { + writer.reset(stateEnd[fromState]); + } + writer.writeInt(minT); + writer.writeInt(maxT); + writer.writeInt(toState); + numTransitions++; + stateEnd[fromState] = writer.getCurrentOffset(); + } + numStates += utf8Transitions.addedStates; + } + } + + static int insertSingletonPrefix(int[] from, int[] transitions, SlicedTransitions singleton) { + int state = 0; + int transIndex = 0; + for (int i = 0; i < singleton.numStates-1; i++) { + from[i] = i*3; + int offset = singleton.from[state]; + transitions[transIndex++] = singleton.transitions[offset++]; + transitions[transIndex++] = singleton.transitions[offset++]; + transitions[transIndex++] = i+1; + state = singleton.transitions[offset]; + } + assert singleton.accept[state]; + return singleton.numStates-1; + } + + public SlicedTransitions toSlicedTransitions(IntsRef prefix, ParametricDescription description) { + // TODO do this on the fly + int numPrefixStates = 0; + int numPrefixTransitions = 0; + int fromOffset = 0; + final int[] from; + final int[] transitions; + if (prefix == null || prefix.length == 0) { + from = new int[numStates + 1]; + transitions = new int[numTransitions * 3]; + } else { + SlicedTransitions singleton = new UTF32ToUTF8() + .convert( + BasicAutomata.makeString(prefix.ints, prefix.offset, + prefix.length)).getSlicedTransitions(); + numPrefixStates = singleton.numStates-1; + numPrefixTransitions = singleton.transitions.length; + from = new int[numPrefixStates+ numStates + 1]; + transitions = new int[singleton.transitions.length + numTransitions * 3]; + fromOffset = insertSingletonPrefix(from, transitions, singleton); + } + IntBlockPool.SliceReader reader = new IntBlockPool.SliceReader(pool); + int transIndex = numPrefixTransitions; + int descLimit = description.size(); + boolean[] accept = new boolean[numPrefixStates + numStates]; + + for (int i = 0; i < numStates; i++) { + if (i < descLimit) { + accept[numPrefixStates + i] = description.isAccept(i); + } + reader.reset(stateStart[i], stateEnd[i]); + from[fromOffset + i] = transIndex; + assert transIndex%3 == 0; + int numTrans = 0; + int startOffset = transIndex; + while (!reader.endOfSlice()) { + numTrans++; + transitions[transIndex++] = reader.readInt(); + transitions[transIndex++] = reader.readInt(); + transitions[transIndex++] = numPrefixStates + reader.readInt(); + assert transitions[transIndex-1] < from.length; + // nocommit we should reduce the transitions on the fly here we have them sorted already and can apply the reduce alg directly + } + assert assertTransitions(transitions, numPrefixStates + numStates); + assert from[fromOffset+i] % 3 == 0; + sortTransitions(transitions, startOffset, numTrans); // nocommit should we do that during 
building + } + from[from.length - 1] = transIndex; + assert assertTransitions(transitions, numPrefixStates + numStates); + + assert transitions.length == from[from.length-1] : " " + from[from.length-1] + " " + transitions.length; + return new SlicedTransitions(from, transitions, numPrefixStates+numStates, accept); + } + + private static boolean assertTransitions(int[] transitions, int numStates) { + for (int i = 0; i < transitions.length; i+=3) { + assert transitions[i+2] < numStates : "tansition to state: " + transitions[i+2] + " numStates: " + numStates; + } + return true; + } + + public Automaton toAutomaton(boolean[] accept) { + State[] states = new State[numStates]; + IntBlockPool.SliceReader reader = new IntBlockPool.SliceReader(pool); + for (int i = 0; i < states.length; i++) { + states[i] = new State(); + if (i < accept.length) { + states[i].setAccept(accept[i]); + } + states[i].number = i; + + } + for (int i = 0; i < numStates; i++) { + State s = states[i]; + reader.reset(stateStart[i], stateEnd[i]); + while(!reader.endOfSlice()) { + s.addTransition(new Transition(reader.readInt(), reader.readInt(), states[reader.readInt()])); + } + } + Automaton automaton = new Automaton(states[0]); + automaton.setNumberedStates(states); + return automaton; + } + private final int[] pivot = new int[3]; + + private void sortTransitions(final int[] transitions, final int start, final int num) { + if (num==1) { + return;// sorted only one transition! + } + + new SorterTemplate() { + + @Override + protected void swap(int i, int j) { + for (int k = 0; k < 3; k++) { + int tmp = transitions[start+3*i+k]; + transitions[start+3*i+k] = transitions[start+3*j+k]; + transitions[start+3*j+k] = tmp; + } + } + + @Override + protected void setPivot(int i) { + for (int k = 0; k < 3; k++) { + pivot[k] = transitions[start+3*i+k]; + } + + } + + @Override + protected int comparePivot(int j) { + int rOffset = start + 3 * j; + if (pivot[0] < transitions[rOffset]) return -1; + if (pivot[0] > transitions[rOffset]) return 1; + if (pivot[1] > transitions[rOffset + 1]) return -1; + if (pivot[1] < transitions[rOffset + 1]) return 1; + if (pivot[2] != transitions[rOffset + 2]) { + if (pivot[2] < transitions[rOffset + 2]) return -1; + if (pivot[2] > transitions[rOffset + 2]) return 1; + } + return 0; + } + + @Override + protected int compare(int i, int j) { + int lOffset = start+3*i; + int rOffset = start+3*j; + if (transitions[lOffset] < transitions[rOffset]) return -1; + if (transitions[lOffset] > transitions[rOffset]) return 1; + if (transitions[lOffset+1] > transitions[rOffset+1]) return -1; + if (transitions[lOffset+1] < transitions[rOffset+1]) return 1; + if (transitions[lOffset+2] != transitions[rOffset+2]) { + if (transitions[lOffset+2] < transitions[rOffset+2]) return -1; + if (transitions[lOffset+2] > transitions[rOffset+2]) return 1; + } + return 0; + } + }.quickSort(0, num-1); + } + } + /** * Get the characteristic vector X(x, V) * where V is substring(pos, end) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java index 2d9c0d0..14edcf0 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java @@ -110,6 +110,39 @@ public abstract class RunAutomaton { final int getCharClass(int c) { return SpecialOperations.findIndex(c, points); } + + //nocommit + protected RunAutomaton(SlicedTransitions slices, int 
maxInterval, boolean tableize) { + this.maxInterval = maxInterval; + points = slices.getPoints(); + initial = 0; + size = slices.numStates; + accept = slices.accept; + transitions = new int[size * points.length]; + for (int n = 0; n < size * points.length; n++) + transitions[n] = -1; + for (int i = 0; i < size; i++) { + for (int c = 0; c < points.length; c++) { + int state = slices.step(i, points[c]); + if (state >=0 ) transitions[i * points.length + c] = state; + } + } + /* + * Set alphabet table for optimal run performance. + */ + if (tableize) { + classmap = new int[maxInterval + 1]; + int i = 0; + for (int j = 0; j <= maxInterval; j++) { + if (i + 1 < points.length && j == points[i + 1]) + i++; + classmap[j] = i; + } + } else { + classmap = null; + } + + } /** * Constructs a new RunAutomaton from a deterministic diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/SlicedTransitions.java b/lucene/core/src/java/org/apache/lucene/util/automaton/SlicedTransitions.java new file mode 100644 index 0000000..2ae0625 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/SlicedTransitions.java @@ -0,0 +1,99 @@ +package org.apache.lucene.util.automaton; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//nocommit - find a better name - we could extract a MinimalAutomaton interface and impl this here.... +public class SlicedTransitions { + + public final int[] from; + public final int[] transitions; + public final int numStates; + public final boolean[] accept; + + public SlicedTransitions(int[] from, int[] transitions, int numStates, boolean[] accept) { + this.accept = accept; + this.from = from; + this.transitions = transitions; + this.numStates = numStates; + } + + public SlicedTransitions(int[] from, int[] transitions, int numStates) { + this(from, transitions, numStates, null); + } + + public int[] getPoints() { + // nocommit maybe we can precompute this? 
+ Set pointset = new HashSet(); + pointset.add(Character.MIN_CODE_POINT); + for (int i = 0; i < numStates; i++) { + int end = from[i+1]; + for (int j=from[i];j= 0; + int end = from[state+1]; + for (int i=from[state];i 0) { - assertTrue(automata[n-1].subsetOf(automata[n])); - assertTrue(automata[n-1].subsetOf(tautomata[n])); - assertTrue(tautomata[n-1].subsetOf(automata[n])); - assertTrue(tautomata[n-1].subsetOf(tautomata[n])); - assertNotSame(automata[n-1], automata[n]); - } - // check that Lev(N) is a subset of LevT(N) - assertTrue(automata[n].subsetOf(tautomata[n])); - // special checks for specific n - switch(n) { - case 0: - // easy, matches the string itself - assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString(s), automata[0])); - assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString(s), tautomata[0])); - break; - case 1: - // generate a lev1 naively, and check the accepted lang is the same. - assertTrue(BasicOperations.sameLanguage(naiveLev1(s), automata[1])); - assertTrue(BasicOperations.sameLanguage(naiveLev1T(s), tautomata[1])); - break; - default: - assertBruteForce(s, automata[n], n); - assertBruteForceT(s, tautomata[n], n); - break; + for (int i = 0; i < 2; i++) { + LevenshteinAutomata builder = new LevenshteinAutomata(s, false); + LevenshteinAutomata tbuilder = new LevenshteinAutomata(s, true); + Automaton automata[] = new Automaton[maxDistance + 1]; + Automaton tautomata[] = new Automaton[maxDistance + 1]; + for (int n = 0; n < automata.length; n++) { + automata[n] = i == 0 ? builder.toAutomaton(n) : builder.toRunAutomaton(null, n).slicedTransitions.toAutomaton() ; + tautomata[n] = i == 0 ? tbuilder.toAutomaton(n) : tbuilder.toRunAutomaton(null, n).slicedTransitions.toAutomaton(); + assertNotNull(automata[n]); + assertNotNull(tautomata[n]); + assertTrue(automata[n].isDeterministic()); + assertTrue(tautomata[n].isDeterministic()); + assertTrue(SpecialOperations.isFinite(automata[n])); + assertTrue(SpecialOperations.isFinite(tautomata[n])); + AutomatonTestUtil.assertNoDetachedStates(automata[n]); + AutomatonTestUtil.assertNoDetachedStates(tautomata[n]); + // check that the dfa for n-1 accepts a subset of the dfa for n + if (n > 0) { + assertTrue(automata[n-1].subsetOf(automata[n])); + assertTrue(automata[n-1].subsetOf(tautomata[n])); + assertTrue(tautomata[n-1].subsetOf(automata[n])); + assertTrue(tautomata[n-1].subsetOf(tautomata[n])); + assertNotSame(automata[n-1], automata[n]); + } + // check that Lev(N) is a subset of LevT(N) + assertTrue(automata[n].subsetOf(tautomata[n])); + // special checks for specific n + switch(n) { + case 0: + // easy, matches the string itself + assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString(s), automata[0])); + assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString(s), tautomata[0])); + break; + case 1: + // generate a lev1 naively, and check the accepted lang is the same. + assertTrue("string: " + s + " i: " + i, BasicOperations.sameLanguage(i == 0 ? naiveLev1(s) : new UTF32ToUTF8().convert(naiveLev1(s)), automata[1])); + assertTrue("string: " + s + " i: " + i, BasicOperations.sameLanguage(i == 0 ? naiveLev1T(s) : new UTF32ToUTF8().convert(naiveLev1T(s)), tautomata[1])); + break; + default: + assertBruteForce(s, automata[n], n); + assertBruteForceT(s, tautomata[n], n); + break; + } } } }
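To tie the new pieces together, here is a compact sketch of a run over the flat representation: a step() that scans the current state's (min, max, dest) triplets for one whose range contains the label, and a run() that threads a string through it starting at state 0 (the patch's RunAutomaton(SlicedTransitions, ...) constructor likewise fixes initial = 0). The from/transitions/accept fields mirror SlicedTransitions; everything else is illustrative, not the patch's exact code.

    // Minimal sketch, assuming the SlicedTransitions field layout shown above.
    final class SlicedRunExample {
      final int[] from;         // from[s]..from[s+1] delimit state s's triplets
      final int[] transitions;  // flat (min, max, dest) triplets, sorted by min per state
      final boolean[] accept;   // accept[s] == true if state s is an accept state

      SlicedRunExample(int[] from, int[] transitions, boolean[] accept) {
        this.from = from;
        this.transitions = transitions;
        this.accept = accept;
      }

      // Analogous to SlicedTransitions.step(state, label): linear scan of the slice.
      int step(int state, int label) {
        final int end = from[state + 1];
        for (int i = from[state]; i < end; i += 3) {
          if (transitions[i] <= label && label <= transitions[i + 1]) {
            return transitions[i + 2]; // dest
          }
        }
        return -1; // no outgoing transition accepts this label
      }

      // Runs a whole string from state 0.
      boolean run(String input) {
        int state = 0;
        for (int i = 0; i < input.length(); i++) {
          state = step(state, input.charAt(i));
          if (state == -1) {
            return false;
          }
        }
        return accept[state];
      }

      public static void main(String[] args) {
        // A three-state automaton accepting exactly "ab": 0 -[a]-> 1 -[b]-> 2 (accept).
        int[] from = {0, 3, 6, 6};
        int[] transitions = {'a', 'a', 1, 'b', 'b', 2};
        boolean[] accept = {false, false, true};
        SlicedRunExample a = new SlicedRunExample(from, transitions, accept);
        System.out.println(a.run("ab") + " " + a.run("ax")); // true false
      }
    }

Since the triplets of a state are sorted by min, the linear scan in step() could also become a binary search over triplet starts, the same idea as the "TODO: use binary search here" note in CompiledAutomaton above; the flat arrays are presumably the point of the patch, trading per-Transition objects for cache-friendly int slices.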