diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java index 3225fdf..5127862 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java @@ -83,7 +83,7 @@ public class Builder { @SuppressWarnings("unchecked") final UnCompiledNode[] f = (UnCompiledNode[]) new UnCompiledNode[10]; frontier = f; for(int idx=0;idx<frontier.length;idx++) { - frontier[idx] = new UnCompiledNode(this); + frontier[idx] = new UnCompiledNode(this, idx); } } @@ -201,7 +201,7 @@ public class Builder { // undecided on whether to prune it. later, it // will be either compiled or pruned, so we must // allocate a new node: - frontier[idx] = new UnCompiledNode(this); + frontier[idx] = new UnCompiledNode(this, idx); } } } @@ -292,7 +292,7 @@ public class Builder { new UnCompiledNode[ArrayUtil.oversize(input.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(frontier, 0, next, 0, frontier.length); for(int idx=frontier.length;idx<next.length;idx++) { - next[idx] = new UnCompiledNode(this); + next[idx] = new UnCompiledNode(this, idx); } frontier = next; } @@ -424,12 +424,22 @@ public class Builder { boolean isFinal; int inputCount; + /** This node's depth, starting from the automaton root. */ + final int depth; + + /** + * @param depth + * The node's depth starting from the automaton root. Needed for + * LUCENE-2934 (node expansion based on conditions other than the + * fanout size). + */ @SuppressWarnings("unchecked") - public UnCompiledNode(Builder owner) { + public UnCompiledNode(Builder owner, int depth) { this.owner = owner; arcs = (Arc[]) new Arc[1]; arcs[0] = new Arc(); output = owner.NO_OUTPUT; + this.depth = depth; } public boolean isCompiled() { @@ -441,6 +451,9 @@ public class Builder { isFinal = false; output = owner.NO_OUTPUT; inputCount = 0; + + // We don't clear the depth here because it never changes + // for nodes on the frontier (even when reused).
} public T getLastOutput(int labelToMatch) { diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java index 0b366b4..60dc55c 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java @@ -25,6 +25,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.automaton.fst.Builder.UnCompiledNode; /** Represents an FST using a compact byte[] format. *
<p>
The format is similar to what's used by Morfologik @@ -47,11 +48,21 @@ public class FST { // this when number of arcs is > NUM_ARCS_ARRAY: private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6; - // If the node has >= this number of arcs, the arcs are - // stored as a fixed array. Fixed array consumes more RAM - // but enables binary search on the arcs (instead of - // linear scan) on lookup by arc label: - private final static int NUM_ARCS_FIXED_ARRAY = 10; + /** + * @see #shouldExpand(UnCompiledNode) + */ + final static int FIXED_ARRAY_SHALLOW_DISTANCE = 3; // 0 => only root node. + + /** + * @see #shouldExpand(UnCompiledNode) + */ + final static int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5; + + /** + * @see #shouldExpand(UnCompiledNode) + */ + final static int FIXED_ARRAY_NUM_ARCS_DEEP = 10; + private int[] bytesPerArc = new int[0]; // Increment version to change it @@ -315,7 +326,7 @@ public class FST { int startAddress = writer.posWrite; //System.out.println(" startAddr=" + startAddress); - final boolean doFixedArray = node.numArcs >= NUM_ARCS_FIXED_ARRAY; + final boolean doFixedArray = shouldExpand(node); final int fixedArrayStart; if (doFixedArray) { if (bytesPerArc.length < node.numArcs) { @@ -518,6 +529,23 @@ public class FST { return readNextArc(arc); } + /** + * Checks if arc's target state is in expanded (or vector) format. + * + * @return Returns true if arc points to a state in an + * expanded array format. + */ + boolean isExpandedTarget(Arc follow) throws IOException { + if (follow.isFinal()) { + return false; + } else { + final BytesReader in = getBytesReader(follow.target); + final byte b = in.readByte(); + + return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0; + } + } + /** In-place read; returns the arc. 
*/ public Arc readNextArc(Arc arc) throws IOException { if (arc.label == -1) { @@ -712,6 +740,26 @@ public class FST { public int getArcWithOutputCount() { return arcWithOutputCount; } + + /** + * Nodes will be expanded if their depth (distance from the root node) is + * &lt;= this value and their number of arcs is &gt;= + * {@link #FIXED_ARRAY_NUM_ARCS_SHALLOW}. + * + *
<p>
+ * Fixed array consumes more RAM but enables binary search on the arcs + * (instead of a linear scan) on lookup by arc label. + * + * @return true if node should be stored in an + * expanded (array) form. + * + * @see #FIXED_ARRAY_NUM_ARCS_DEEP + * @see Builder.UnCompiledNode#depth + */ + private boolean shouldExpand(UnCompiledNode node) { + return (node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.numArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW) || + node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP; + } // Non-static: writes to FST's byte[] class BytesWriter extends DataOutput { diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java index a10c4bc..6699ac6 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java @@ -189,6 +189,8 @@ public final class Util { */ public static void toDot(FST fst, Writer out, boolean sameRank, boolean labelStates) throws IOException { + final String expandedNodeColor = "blue"; + // This is the start arc in the automaton (from the epsilon state to the first state // with outgoing transitions. final FST.Arc startArc = fst.getFirstArc(new FST.Arc()); @@ -219,7 +221,9 @@ public final class Util { } emitDotState(out, "initial", "point", "white", ""); - emitDotState(out, Integer.toString(startArc.target), stateShape, null, ""); + emitDotState(out, Integer.toString(startArc.target), stateShape, + fst.isExpandedTarget(startArc) ? expandedNodeColor : null, + ""); out.write(" initial -> " + startArc.target + "\n"); final T NO_OUTPUT = fst.outputs.getNoOutput(); @@ -243,7 +247,9 @@ public final class Util { while (true) { // Emit the unseen state and add it to the queue for the next level. 
if (arc.target >= 0 && !seen.get(arc.target)) { - emitDotState(out, Integer.toString(arc.target), stateShape, null, + final boolean isExpanded = fst.isExpandedTarget(arc); + emitDotState(out, Integer.toString(arc.target), stateShape, + isExpanded ? expandedNodeColor : null, labelStates ? Integer.toString(arc.target) : ""); seen.set(arc.target); nextLevelQueue.add(new FST.Arc().copyFrom(arc)); @@ -285,10 +291,10 @@ public final class Util { } sameLevelStates.clear(); } - + // Emit terminating state (always there anyway). out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n"); - out.write(" {rank=sink; -1 } "); + out.write(" {rank=sink; -1 }\n"); out.write("}\n"); out.flush(); diff --git a/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java b/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java index f979481..5f6c589 100644 --- a/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java +++ b/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java @@ -56,6 +56,7 @@ import org.apache.lucene.util.LineFileDocs; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.automaton.fst.FST.Arc; public class TestFSTs extends LuceneTestCase { @@ -1322,4 +1323,85 @@ public class TestFSTs extends LuceneTestCase { assertEquals(b, seekResult.input); assertEquals(42, (long) seekResult.output); } + + /** + * Test state expansion (array format) on close-to-root states. Creates + * synthetic input that has one expanded state on each level. 
+ * + * @see "https://issues.apache.org/jira/browse/LUCENE-2933" + */ + public void testExpandedCloseToRoot() throws Exception { + class SyntheticData { + FST compile(String[] lines) throws IOException { + final NoOutputs outputs = NoOutputs.getSingleton(); + final Object nothing = outputs.getNoOutput(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + + int line = 0; + final BytesRef term = new BytesRef(); + while (line < lines.length) { + String w = lines[line++]; + if (w == null) { + break; + } + term.copy(w); + b.add(term, nothing); + } + + return b.finish(); + } + + void generate(ArrayList out, StringBuilder b, char from, char to, + int depth) { + if (depth == 0 || from == to) { + String seq = b.toString() + "_" + out.size() + "_end"; + out.add(seq); + } else { + for (char c = from; c <= to; c++) { + b.append(c); + generate(out, b, from, c == to ? to : from, depth - 1); + b.deleteCharAt(b.length() - 1); + } + } + } + + public int verifyStateAndBelow(FST fst, Arc arc, int depth) + throws IOException { + if (fst.targetHasArcs(arc)) { + int childCount = 0; + for (arc = fst.readFirstTargetArc(arc, arc);; + arc = fst.readNextArc(arc), childCount++) + { + boolean expanded = fst.isExpandedTarget(arc); + int children = verifyStateAndBelow(fst, new FST.Arc().copyFrom(arc), depth + 1); + + assertEquals( + expanded, + (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && + children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) || + children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP); + if (arc.isLast()) break; + } + + return childCount; + } + return 0; + } + } + + // Sanity check. 
+ assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP); + assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0); + + SyntheticData s = new SyntheticData(); + + ArrayList out = new ArrayList(); + StringBuilder b = new StringBuilder(); + s.generate(out, b, 'a', 'i', 10); + String[] input = out.toArray(new String[out.size()]); + Arrays.sort(input); + FST fst = s.compile(input); + FST.Arc arc = fst.getFirstArc(new FST.Arc()); + s.verifyStateAndBelow(fst, arc, 1); + } }