Index: lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java --- lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java Thu Dec 02 11:30:31 2010 -0500 +++ lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java Fri Dec 03 05:33:14 2010 -0500 @@ -31,15 +31,16 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.fst.Builder; +import org.apache.lucene.util.automaton.fst.FSTEnum; +import org.apache.lucene.util.automaton.fst.FST; +import org.apache.lucene.util.automaton.fst.PositiveIntOutputs; +import org.apache.lucene.util.automaton.fst.PairOutputs; import java.io.IOException; import java.util.Comparator; import java.util.Map; -import java.util.Set; import java.util.HashMap; -import java.util.TreeMap; -import java.util.SortedMap; -import java.util.Iterator; class SimpleTextFieldsReader extends FieldsProducer { @@ -116,73 +117,40 @@ private class SimpleTextTermsEnum extends TermsEnum { private final IndexInput in; private final boolean omitTF; - private BytesRef current; private int docFreq; private long docsStart; private boolean ended; - private final TreeMap allTerms; - private Iterator> iter; + private final FSTEnum fstEnum; - public SimpleTextTermsEnum(TreeMap allTerms, boolean omitTF) throws IOException { + public SimpleTextTermsEnum(FST fst, boolean omitTF) throws IOException { this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); - this.allTerms = allTerms; + //this.allTerms = allTerms; this.omitTF = omitTF; - iter = allTerms.entrySet().iterator(); + fstEnum = new FSTEnum(fst); } public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException { - - final SortedMap tailMap = allTerms.tailMap(text); - if (tailMap.isEmpty()) { - current = null; + fstEnum.reset(); + //System.out.println("seek to text=" + text.utf8ToString()); + 
FSTEnum.InputOutput result = fstEnum.advance(text); + if (result == null) { + //System.out.println(" end"); return SeekStatus.END; } else { - current = tailMap.firstKey(); - final TermData td = tailMap.get(current); - docsStart = td.docsStart; - docFreq = td.docFreq; - iter = tailMap.entrySet().iterator(); - assert iter.hasNext(); - iter.next(); - if (current.equals(text)) { + //System.out.println(" got text=" + term.utf8ToString()); + PairOutputs.Pair pair = (PairOutputs.Pair) result.output; + docsStart = (Long) pair.output1; + docFreq = (int) ((Long) pair.output2).longValue(); + + if (result.input.equals(text)) { + //System.out.println(" match docsStart=" + docsStart); return SeekStatus.FOUND; } else { + //System.out.println(" not match docsStart=" + docsStart); return SeekStatus.NOT_FOUND; } } - - /* - if (current != null) { - final int cmp = current.compareTo(text); - if (cmp == 0) { - return SeekStatus.FOUND; - } else if (cmp > 0) { - ended = false; - in.seek(fieldStart); - } - } else { - ended = false; - in.seek(fieldStart); - } - - // Naive!! This just scans... 
would be better to do - // up-front scan to build in-RAM index - BytesRef b; - while((b = next()) != null) { - final int cmp = b.compareTo(text); - if (cmp == 0) { - ended = false; - return SeekStatus.FOUND; - } else if (cmp > 0) { - ended = false; - return SeekStatus.NOT_FOUND; - } - } - current = null; - ended = true; - return SeekStatus.END; - */ } @Override @@ -192,56 +160,20 @@ @Override public BytesRef next() throws IOException { assert !ended; - - if (iter.hasNext()) { - Map.Entry ent = iter.next(); - current = ent.getKey(); - TermData td = ent.getValue(); - docFreq = td.docFreq; - docsStart = td.docsStart; - return current; + FSTEnum.InputOutput result = fstEnum.next(); + if (result != null) { + PairOutputs.Pair pair = (PairOutputs.Pair) result.output; + docsStart = (Long) pair.output1; + docFreq = (int) ((Long) pair.output2).longValue(); + return result.input; } else { - current = null; return null; } - - /* - readLine(in, scratch); - if (scratch.equals(END) || scratch.startsWith(FIELD)) { - ended = true; - current = null; - return null; - } else { - assert scratch.startsWith(TERM): "got " + scratch.utf8ToString(); - docsStart = in.getFilePointer(); - final int len = scratch.length - TERM.length; - if (len > scratch2.length) { - scratch2.grow(len); - } - System.arraycopy(scratch.bytes, TERM.length, scratch2.bytes, 0, len); - scratch2.length = len; - current = scratch2; - docFreq = 0; - long lineStart = 0; - while(true) { - lineStart = in.getFilePointer(); - readLine(in, scratch); - if (scratch.equals(END) || scratch.startsWith(FIELD) || scratch.startsWith(TERM)) { - break; - } - if (scratch.startsWith(DOC)) { - docFreq++; - } - } - in.seek(lineStart); - return current; - } - */ } @Override public BytesRef term() { - return current; + return fstEnum.current().input; } @Override @@ -512,10 +444,11 @@ private final String field; private final long termsStart; private final boolean omitTF; + private FST fst; // NOTE: horribly, horribly RAM consuming, but then 
// SimpleText should never be used in production - private final TreeMap allTerms = new TreeMap(); + //private final TreeMap allTerms = new TreeMap(); private final BytesRef scratch = new BytesRef(10); @@ -527,6 +460,8 @@ } private void loadTerms() throws IOException { + PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(); + Builder b = new Builder(0, 0, true, new PairOutputs(posIntOutputs, posIntOutputs)); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRef lastTerm = new BytesRef(10); @@ -536,16 +471,18 @@ readLine(in, scratch); if (scratch.equals(END) || scratch.startsWith(FIELD)) { if (lastDocsStart != -1) { - allTerms.put(new BytesRef(lastTerm), - new TermData(lastDocsStart, docFreq)); + //allTerms.put(new BytesRef(lastTerm), + //new TermData(lastDocsStart, docFreq)); + b.add(lastTerm, new PairOutputs.Pair(lastDocsStart, Long.valueOf(docFreq))); } break; } else if (scratch.startsWith(DOC)) { docFreq++; } else if (scratch.startsWith(TERM)) { if (lastDocsStart != -1) { - allTerms.put(new BytesRef(lastTerm), - new TermData(lastDocsStart, docFreq)); + //allTerms.put(new BytesRef(lastTerm), + //new TermData(lastDocsStart, docFreq)); + b.add(lastTerm, new PairOutputs.Pair(lastDocsStart, Long.valueOf(docFreq))); } lastDocsStart = in.getFilePointer(); final int len = scratch.length - TERM.length; @@ -557,11 +494,23 @@ docFreq = 0; } } + fst = b.finish(); + /* + PrintStream ps = new PrintStream("out.dot"); + fst.toDot(ps); + ps.close(); + System.out.println("SAVED out.dot"); + */ + //System.out.println("FST " + fst.sizeInBytes()); } @Override public TermsEnum iterator() throws IOException { - return new SimpleTextTermsEnum(allTerms, omitTF); + if (fst != null) { + return new SimpleTextTermsEnum(fst, omitTF); + } else { + return TermsEnum.EMPTY; + } } @Override Index: lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java Fri Dec 03 05:33:14 2010 -0500 @@ -0,0 +1,505 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * Builds a compact FST (maps a BytesRef term to an arbitrary + * output) from pre-sorted terms with outputs (the FST + * becomes an FSA if you use NoOutputs). The FST is written + * on-the-fly into a compact serialized format byte array, which can + * be saved to / loaded from a Directory or used directly + * for traversal. The FST is always finite (no cycles). + * + *

NOTE: The algorithm is described at + * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698

+ * + * If your outputs are ByteSequenceOutput then the final FST + * will be minimal, but if you use PositiveIntOutput then + * it's only "near minimal". For example, aa/0, aab/1, bbb/2 + * will produce 6 states when a 5 state fst is also + * possible. + */ + +// nocommit fix generics +@SuppressWarnings({"unchecked"}) +public class Builder { + private final NodeHash dedupHash; + private final FST fst; + private final Object NO_OUTPUT; + + // simplistic pruning: we prune node (and all following + // nodes) if less than this number of terms go through it: + private final int minSuffixCount1; + + // better pruning: we prune node (and all following + // nodes) if the prior node has less than this number of + // terms go through it: + private final int minSuffixCount2; + + private final BytesRef lastTerm = new BytesRef(); + + // current "frontier" + private Node[] frontier; + + private FrozenNode[] spareFrozenNodes = new FrozenNode[10]; + private int numSpareFrozenNodes; + private Node[] spareNodes; + private int numSpareNodes; + + public Builder(int minSuffixCount1, int minSuffixCount2, boolean doMinSuffix, Outputs outputs) { + this.minSuffixCount1 = minSuffixCount1; + this.minSuffixCount2 = minSuffixCount2; + fst = new FST(outputs); + if (doMinSuffix) { + dedupHash = new NodeHash(fst); + } else { + dedupHash = null; + } + NO_OUTPUT = outputs.getNoOutput(); + + //spareNodes = (Node[]) Array.newInstance(Node.class, 10); + spareNodes = new Node[10]; + + //frontier = (Node[]) Array.newInstance(Node.class, 10); + frontier = new Node[10]; + for(int idx=0;idx 0) { + //System.out.println(" getNode spare=" + spareNodes[numSpareNodes-1]); + Node n = spareNodes[--numSpareNodes]; + assert n.free; + n.free = false; + return n; + } else { + //System.out.println("new"); + return new Node(); + } + } + + private void recycle(FrozenNode n) { + assert !n.free; + n.free = true; + if (n.address == -2) { + //System.out.println(" node"); + if (numSpareNodes == spareNodes.length) { + 
//final Node[] next = (Node[]) Array.newInstance(Node.class, ArrayUtil.oversize(1+numSpareNodes, RamUsageEstimator.NUM_BYTES_OBJ_REF)); + final Node[] next = new Node[ArrayUtil.oversize(1+numSpareNodes, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; + System.arraycopy(spareNodes, 0, next, 0, numSpareNodes); + spareNodes = next; + } + spareNodes[numSpareNodes++] = (Node) n; + } else { + //System.out.println(" frozen"); + if (numSpareFrozenNodes == spareFrozenNodes.length) { + final FrozenNode[] next = new FrozenNode[ArrayUtil.oversize(1+numSpareFrozenNodes, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; + System.arraycopy(spareFrozenNodes, 0, next, 0, numSpareFrozenNodes); + spareFrozenNodes = next; + } + spareFrozenNodes[numSpareFrozenNodes++] = n; + } + } + + private FrozenNode getFrozenNode() { + if (numSpareFrozenNodes > 0) { + FrozenNode n = spareFrozenNodes[--numSpareFrozenNodes]; + assert n.free; + n.free = false; + return n; + } else { + return new FrozenNode(); + } + } + + private FrozenNode freezeNode(Node n) { + + final int address; + if (dedupHash != null) { + if (n.numArcs == 0) { + address = fst.addNode(n); + } else { + address = dedupHash.add(n); + } + } else { + address = fst.addNode(n); + } + assert address != -2; + + n.clear(); + + final FrozenNode fn = getFrozenNode(); + fn.address = address; + return fn; + } + + private void freezePrevTail(int prefixLenPlus1) { + assert prefixLenPlus1 >= 1; + //System.out.println(" freezeTail " + prefixLenPlus1); + for(int idx=lastTerm.length; idx >= prefixLenPlus1; idx--) { + boolean doPrune = false; + boolean doFreeze = false; + + final Node node = frontier[idx]; + final Node parent = frontier[idx-1]; + + if (node.termCount < minSuffixCount1) { + doPrune = true; + doFreeze = true; + } else if (idx > prefixLenPlus1) { + // prune if parent's termCount is less than suffixMinCount2 + if (parent.termCount < minSuffixCount2 || minSuffixCount2 == 1 && parent.termCount == 1) { + // my parent, about to be frozen, doesn't make the cut, 
so + // I'm definitely pruned + + // if pruneCount2 is 1, we keep only up + // until the 'distinguished edge', ie we keep only the + // 'divergent' part of the FST. if my parent, about to be + // frozen, has termCount 1 then we are already past the + // distinguished edge. NOTE: this only works if + // the FST outputs are not "compressible" (simple + // ords ARE compressible). + doPrune = true; + } else { + // my parent, about to be frozen, does make the cut, so + // I'm definitely not pruned + doPrune = false; + } + doFreeze = true; + } else { + // if pruning is disabled (count is 0) we can always + // freeze current node + doFreeze = minSuffixCount2 == 0; + } + + //System.out.println(" label=" + ((char) lastTerm.bytes[lastTerm.offset+idx-1]) + " idx=" + idx + " termCount=" + frontier[idx].termCount + " doFreeze=" + doFreeze + " doPrune=" + doPrune); + + if (node.termCount < minSuffixCount2 || minSuffixCount2 == 1 && node.termCount == 1) { + // drop all arcs + for(int arcIdx=0;arcIdx 0: "terms are added out of order lastTerm=" + lastTerm.utf8ToString() + " vs term=" + term.utf8ToString(); + assert validOutput(output); + + //System.out.println("\nadd: " + term); + if (term.length == 0) { + // empty term: only allowed as first term. 
we have + // to special case this because the packed FST + // format cannot represent the empty term since + // 'finalness' is stored on the incoming arc, not on + // the node + frontier[0].termCount++; + fst.setEmptyOutput(output); + return; + } + + // compare shared prefix length + int pos1 = 0; + int pos2 = term.offset; + final int pos1Stop = Math.min(lastTerm.length, term.length); + while(true) { + //System.out.println(" incr " + pos1); + frontier[pos1].termCount++; + if (pos1 >= pos1Stop || lastTerm.bytes[pos1] != term.bytes[pos2]) { + break; + } + pos1++; + pos2++; + } + final int prefixLenPlus1 = pos1+1; + + if (frontier.length < term.length+1) { + final Node[] next = new Node[ArrayUtil.oversize(term.length+1, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; + //final Node[] next = (Node[]) Array.newInstance(Node.class, ArrayUtil.oversize(term.length+1, RamUsageEstimator.NUM_BYTES_OBJ_REF)); + System.arraycopy(frontier, 0, next, 0, frontier.length); + for(int idx=frontier.length;idx 0; + assert arcs[numArcs-1].label == labelToMatch; + return arcs[numArcs-1].output; + } + + public void addArc(int label, FrozenNode target) { + assert label >= 0; + assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs; + if (numArcs == arcs.length) { + final Arc[] newArcs = new Arc[ArrayUtil.oversize(arcs.length+1, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; + //final Arc[] newArcs = (Arc[]) Array.newInstance(Arc.class, ArrayUtil.oversize(arcs.length+1, RamUsageEstimator.NUM_BYTES_OBJ_REF)); + + System.arraycopy(arcs, 0, newArcs, 0, numArcs); + for(int arcIdx=numArcs;arcIdx 0; + final Arc arc = arcs[numArcs-1]; + assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch; + arc.target = target; + //assert target.address != -2; + arc.nextFinalOutput = nextFinalOutput; + arc.isFinal = isFinal; + } + + public void deleteLast(int label, Node target) { + assert numArcs > 0; + assert 
label == arcs[numArcs-1].label; + assert target == arcs[numArcs-1].target; + numArcs--; + } + + public void setLastOutput(int labelToMatch, Object newOutput) { + assert validOutput(newOutput); + assert numArcs > 0; + final Arc arc = arcs[numArcs-1]; + assert arc.label == labelToMatch; + arc.output = newOutput; + } + + // pushes an output prefix forward onto all arcs + public void prependOutput(Object outputPrefix) { + assert validOutput(outputPrefix); + + for(int arcIdx=0;arcIdx { + + private final static BytesRef NO_OUTPUT = new BytesRef(); + + private ByteSequenceOutputs() { + } + + public static ByteSequenceOutputs getSingleton() { + return new ByteSequenceOutputs(); + } + + @Override + public BytesRef common(BytesRef output1, BytesRef output2) { + assert output1 != null; + assert output2 != null; + + int pos1 = output1.offset; + int pos2 = output2.offset; + int stopAt1 = pos1 + Math.min(output1.length, output2.length); + while(pos1 < stopAt1) { + if (output1.bytes[pos1] != output2.bytes[pos2]) { + break; + } + pos1++; + pos2++; + } + + if (pos1 == output1.offset) { + // no common prefix + return NO_OUTPUT; + } else if (pos1 == output1.offset + output1.length) { + // output1 is a prefix of output2 + return output1; + } else if (pos2 == output2.offset + output2.length) { + // output2 is a prefix of output1 + return output2; + } else { + return new BytesRef(output1.bytes, output1.offset, pos1-output1.offset); + } + } + + @Override + public BytesRef subtract(BytesRef output, BytesRef inc) { + assert output != null; + assert inc != null; + if (inc == NO_OUTPUT) { + // no prefix removed + return output; + } else if (inc.length == output.length) { + // entire output removed + return NO_OUTPUT; + } else { + assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length; + assert inc.length > 0; + return new BytesRef(output.bytes, output.offset + inc.length, output.length-inc.length); + } + } + + @Override + public BytesRef 
add(BytesRef prefix, BytesRef output) { + assert prefix != null; + assert output != null; + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + assert prefix.length > 0; + assert output.length > 0; + BytesRef result = new BytesRef(prefix.length + output.length); + System.arraycopy(prefix.bytes, prefix.offset, result.bytes, 0, prefix.length); + System.arraycopy(output.bytes, output.offset, result.bytes, prefix.length, output.length); + result.length = prefix.length + output.length; + return result; + } + } + + @Override + public void write(BytesRef prefix, FST fst) { + assert prefix != null; + fst.writeVInt(prefix.length); + fst.write(prefix); + } + + @Override + public BytesRef read(FST fst, PosRef pos) { + final int len = fst.readVInt(pos); + if (len == 0) { + return NO_OUTPUT; + } else { + final BytesRef output = new BytesRef(len); + fst.read(pos, output.bytes, 0, len); + output.length = len; + return output; + } + } + + @Override + public BytesRef getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(BytesRef output) { + return output.utf8ToString(); + } +} Index: lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java Fri Dec 03 05:33:14 2010 -0500 @@ -0,0 +1,574 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.PrintStream; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.HashSet; + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.BytesRef; + +// nocommit fix generics +/** Represents an FST using a compact byte[] format. + *

The format is similar to what's used by Morfologik + * (http://sourceforge.net/projects/morfologik). + */ +@SuppressWarnings({"unchecked"}) +public class FST { + private final static int BIT_FINAL_ARC = 1 << 0; + private final static int BIT_LAST_ARC = 1 << 1; + private final static int BIT_TARGET_NEXT = 1 << 2; + private final static int BIT_STOP_NODE = 1 << 3; + private final static int BIT_ARC_HAS_OUTPUT = 1 << 4; + private final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; + + // Increment version to change it + private final static String FILE_FORMAT_NAME = "FST"; + private final static int VERSION_START = 0; + private final static int VERSION_CURRENT = VERSION_START; + + // Never serialized; just used to represent the virtual + // final node w/ no arcs: + private final static int FINAL_END_NODE = -1; + + // Never serialized; just used to represent the virtual + // non-final node w/ no arcs: + private final static int NON_FINAL_END_NODE = 0; + + // if non-null, this FST accepts the empty string and + // produces this output + private Object emptyOutput; + private byte[] emptyOutputBytes; + + private byte[] bytes; + int byteUpto = 0; + + private int startNode = -1; + + public final Outputs outputs; + + private int lastFrozenNode; + private int pos; + + private final Object NO_OUTPUT; + + public int nodeCount; + public int arcCount; + public int arcWithOutputCount; + + public final static class Arc { + int label; // really a "unsigned" byte + int target; + byte flags; + Object output; + Object nextFinalOutput; + int nextArc; + + public boolean flag(int flag) { + return FST.flag(flags, flag); + } + + public boolean isLast() { + return flag(BIT_LAST_ARC); + } + + public boolean isFinal() { + return flag(BIT_FINAL_ARC); + } + }; + + private static boolean flag(int flags, int bit) { + return (flags & bit) != 0; + } + + // make a new empty FST, for building + public FST(Outputs outputs) { + this.outputs = outputs; + bytes = new byte[128]; + NO_OUTPUT = 
outputs.getNoOutput(); + + // pad: ensure no node gets address 0 which is reserved to mean + // the stop state w/ no arcs + pos = 1; + + emptyOutput = null; + } + + // create an existing FST + public FST(IndexInput in, Outputs outputs) throws IOException { + this.outputs = outputs; + CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_START); + if (in.readByte() == 1) { + // accepts empty string + int numBytes = in.readVInt(); + // messy + bytes = new byte[numBytes]; + in.readBytes(bytes, 0, numBytes); + PosRef posRef = PosRef.get(); + posRef.pos = numBytes-1; + emptyOutput = outputs.read(this, posRef); + assert posRef.pos == -1: "pos=" + posRef.pos; + } else { + emptyOutput = null; + } + startNode = in.readVInt(); + nodeCount = in.readVInt(); + arcCount = in.readVInt(); + arcWithOutputCount = in.readVInt(); + + bytes = new byte[in.readVInt()]; + in.readBytes(bytes, 0, bytes.length); + NO_OUTPUT = outputs.getNoOutput(); + } + + public void writeVInt(int v) { + assert v >= 0: "got v=" + v; + while(v > 0x7F) { + write((byte) (0x80 | (v & 0x7F))); + v = v >> 7; + } + write((byte) v); + } + + public int readVInt(PosRef pos) { + byte b = bytes[pos.pos--]; + int value = b & 0x7F; + int shift = 7; + while((b & 0x80) != 0) { + b = bytes[pos.pos--]; + value |= (b & 0x7F) << shift; + shift += 7; + } + return value; + } + + public void writeVLong(long v) { + assert v >= 0: "got v=" + v; + while(v > 0x7FL) { + write((byte) (0x80 | (v & 0x7FL))); + v = v >> 7; + } + write((byte) v); + } + + public long readVLong(PosRef pos) { + byte b = bytes[pos.pos--]; + long value = b & 0x7FL; + int shift = 7; + while((b & 0x80) != 0) { + b = bytes[pos.pos--]; + value |= (b & 0x7FL) << shift; + shift += 7; + } + return value; + } + + public void read(PosRef pos, byte[] bytesOut, int offset, int len) { + int upto = 0; + while(upto < len) { + bytesOut[offset+upto] = bytes[pos.pos--]; + upto++; + } + } + + public void write(byte b) { + if (bytes.length == pos) { + bytes = 
ArrayUtil.grow(bytes); + } + bytes[pos++] = b; + } + + public void write(BytesRef br) { + final int size = pos + br.length; + if (size > bytes.length) { + bytes = ArrayUtil.grow(bytes, size); + } + System.arraycopy(br.bytes, br.offset, bytes, pos, br.length); + pos += br.length; + } + + /** Returns bytes used to represent the FST */ + public int sizeInBytes() { + return bytes.length; + } + + void finish(int startNode) { + if (this.startNode != -1) { + throw new IllegalStateException("already finished"); + } + byte[] finalBytes = new byte[pos]; + System.arraycopy(bytes, 0, finalBytes, 0, pos); + bytes = finalBytes; + this.startNode = startNode; + } + + public void setEmptyOutput(Object v) { + if (emptyOutput != null) { + throw new IllegalStateException("empty output is already set"); + } + emptyOutput = v; + + // messy!! + final int posSave = pos; + outputs.write(emptyOutput, this); + emptyOutputBytes = new byte[pos-posSave]; + + // reverse + final int stopAt = (pos - posSave)/2; + int upto = 0; + while(upto < stopAt) { + final byte b = bytes[posSave + upto]; + bytes[posSave+upto] = bytes[pos-upto-1]; + bytes[pos-upto-1] = b; + upto++; + } + System.arraycopy(bytes, posSave, emptyOutputBytes, 0, pos-posSave); + pos = posSave; + } + + @SuppressWarnings({"unchecked"}) + public void save(IndexOutput out) throws IOException { + if (startNode == -1) { + throw new IllegalStateException("call finish first"); + } + CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT); + if (emptyOutput != null) { + out.writeByte((byte) 1); + out.writeVInt(emptyOutputBytes.length); + out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length); + } else { + out.writeByte((byte) 0); + } + out.writeVInt(startNode); + out.writeVInt(nodeCount); + out.writeVInt(arcCount); + out.writeVInt(arcWithOutputCount); + out.writeVInt(bytes.length); + out.writeBytes(bytes, 0, bytes.length); + } + + // returns true if the node at this address has any + // outgoing arcs + public boolean hasArcs(int 
address) { + return address != FINAL_END_NODE && address != NON_FINAL_END_NODE; + } + + public int getStartNode() { + if (startNode == -1) { + throw new IllegalStateException("call finish first"); + } + return startNode; + } + + // returns null if this FST does not accept the empty + // string, else, the output for the empty string + public Object getEmptyOutput() { + return emptyOutput; + } + + // serializes new node by appending its bytes to the end + // of the current byte[] + int addNode(Builder.Node node) { + //System.out.println("addNode pos=" + pos); + if (node.numArcs == 0) { + if (node.isFinal) { + return FINAL_END_NODE; + } else { + return NON_FINAL_END_NODE; + } + } + + nodeCount++; + arcCount += node.numArcs; + + final Object NO_OUTPUT = outputs.getNoOutput(); + + int startAddress = pos; + final int lastArc = node.numArcs-1; + + for(int arcIdx=0;arcIdx /x/tmp/out.png + */ + public void toDot(PrintStream out) { + + final List queue = new ArrayList(); + queue.add(startNode); + + final Set seen = new HashSet(); + seen.add(startNode); + + out.println("digraph FST {"); + out.println(" rankdir = LR;"); + out.println(" " + startNode + " [shape=circle label=" + startNode + "];"); + out.println(" initial [shape=plaintext label=\"\"];"); + if (emptyOutput != null) { + out.println(" initial -> " + startNode + " [arrowhead=tee label=\"(" + outputs.outputToString(emptyOutput) + ")\"];"); + } else { + out.println(" initial -> " + startNode); + } + + final Arc arc = new Arc(); + + while(queue.size() != 0) { + Integer node = queue.get(queue.size()-1); + queue.remove(queue.size()-1); + + if (node == FINAL_END_NODE || node == NON_FINAL_END_NODE) { + continue; + } + + // scan all arcs + readArc(node, arc); + while(true) { + if (!seen.contains(arc.target)) { + out.println(" " + arc.target + " [label=" + arc.target + "];"); + seen.add(arc.target); + queue.add(arc.target); + } + String outs; + if (arc.output != NO_OUTPUT) { + outs = "/" + outputs.outputToString(arc.output); 
+ } else { + outs = ""; + } + if (arc.isFinal() && arc.nextFinalOutput != NO_OUTPUT) { + outs += " (" + outputs.outputToString(arc.nextFinalOutput) + ")"; + } + out.print(" " + node + " -> " + arc.target + " [label=\"" + ((char) (arc.label&0xFF)) + outs + "\""); + if (arc.isFinal()) { + out.print(" arrowhead=tee"); + } + if (arc.flag(BIT_TARGET_NEXT)) { + out.print(" color=blue"); + } + out.println("];"); + + if (arc.isLast()) { + break; + } else { + readArc(arc.nextArc, arc); + } + } + } + out.println("}"); + } + + public int getNodeCount() { + // 1+ in order to count the -1 implicit final node + return 1+nodeCount; + } + + public int getArcCount() { + return arcCount; + } + + public int getArcWithOutputCount() { + return arcWithOutputCount; + } +} Index: lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java Fri Dec 03 05:33:14 2010 -0500 @@ -0,0 +1,314 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; + +/** Can next() and advance() through the terms in an FST */ + +// nocommit fix generics +@SuppressWarnings({"unchecked"}) +public class FSTEnum { + private final FST fst; + + private BytesRef current = new BytesRef(10); + private FST.Arc[] arcs = new FST.Arc[10]; + // outputs are cumulative + private Object[] output = new Object[10]; + + private boolean lastFinal; + private boolean didEmpty; + private final Object NO_OUTPUT; + private final InputOutput result = new InputOutput(); + + public static class InputOutput { + public BytesRef input; + public Object output; + } + + public FSTEnum(FST fst) { + this.fst = fst; + result.input = current; + NO_OUTPUT = fst.outputs.getNoOutput(); + } + + public void reset() { + lastFinal = false; + didEmpty = false; + current.length = 0; + result.output = NO_OUTPUT; + } + + /** NOTE: target must be >= where we are already + * positioned */ + public InputOutput advance(BytesRef target) { + + assert target.compareTo(current) >= 0; + + //System.out.println(" advance len=" + target.length + " curlen=" + current.length); + + // special case empty string + if (current.length == 0) { + if (target.length == 0) { + final Object output = fst.getEmptyOutput(); + if (output != null) { + if (!didEmpty) { + current.length = 0; + lastFinal = true; + result.output = output; + didEmpty = true; + } + return result; + } else { + return next(); + } + } + + if (fst.noNodes()) { + return null; + } + } + + // TODO: possibly caller could/should provide common + // prefix length? 
ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + + // what prefix does target share w/ current + int idx = 0; + while (idx < current.length && idx < target.length) { + if (current.bytes[idx] != target.bytes[target.offset + idx]) { + break; + } + idx++; + } + + //System.out.println(" shared " + idx); + + FST.Arc arc; + if (current.length == 0) { + // new enum (no seek/next yet) + arc = fst.readArc(fst.getStartNode(), getArc(0)); + //System.out.println(" new enum"); + } else if (idx < current.length) { + // roll back to shared point + lastFinal = false; + current.length = idx; + arc = arcs[idx]; + if (arc.isLast()) { + if (idx == 0) { + return null; + } else { + return next(); + } + } + arc = fst.readArc(arc.nextArc, arc); + } else if (idx == target.length) { + // degenerate case -- seek to term we are already on + assert target.equals(current); + return result; + } else { + // current is a full prefix of target + if (lastFinal) { + arc = fst.readArc(arcs[current.length-1].target, getArc(current.length)); + } else { + return next(); + } + } + + lastFinal = false; + + assert arc == arcs[current.length]; + int targetLabel = target.bytes[target.offset+current.length] & 0xFF; + + while(true) { + //System.out.println(" cycle len=" + current.length + " target=" + ((char) targetLabel) + " vs " + ((char) arc.label)); + if (arc.label == targetLabel) { + grow(); + current.bytes[current.length] = (byte) arc.label; + appendOutput(arc.output); + current.length++; + grow(); + if (current.length == target.length) { + result.output = output[current.length-1]; + if (arc.isFinal()) { + // target is exact match + if (fst.hasArcs(arc.target)) { + // target is also a proper prefix of other terms + lastFinal = true; + appendFinalOutput(arc.nextFinalOutput); + } + } else { + // target is not a match but is a prefix of + // other terms + current.length--; + push(); + } + return result; + } else if (!fst.hasArcs(arc.target)) { + // we only 
match a prefix of the target + return next(); + } else { + targetLabel = target.bytes[target.offset+current.length] & 0xFF; + arc = fst.readArc(arc.target, getArc(current.length)); + } + } else if (arc.label > targetLabel) { + // we are now past the target + push(); + return result; + } else if (arc.isLast()) { + if (current.length == 0) { + return null; + } + return next(); + } else { + arc = fst.readArc(arc.nextArc, getArc(current.length)); + } + } + } + + public InputOutput current() { + return result; + } + + public InputOutput next() { + //System.out.println(" enum.next"); + + if (current.length == 0) { + final Object output = fst.getEmptyOutput(); + if (output != null) { + if (!didEmpty) { + current.length = 0; + lastFinal = true; + result.output = output; + didEmpty = true; + return result; + } else { + lastFinal = false; + } + } + if (fst.noNodes()) { + return null; + } + fst.readArc(fst.getStartNode(), getArc(0)); + push(); + } else if (lastFinal) { + lastFinal = false; + assert current.length > 0; + // resume pushing + fst.readArc(arcs[current.length-1].target, getArc(current.length)); + push(); + } else { + //System.out.println(" pop/push"); + pop(); + if (current.length == 0) { + // enum done + return null; + } else { + current.length--; + fst.readArc(arcs[current.length].nextArc, arcs[current.length]); + push(); + } + } + + return result; + } + + private void grow() { + if (current.bytes.length == current.length) { + current.bytes = ArrayUtil.grow(current.bytes, 1+current.bytes.length); + } + + if (arcs.length == current.length) { + final FST.Arc[] next = new FST.Arc[ArrayUtil.oversize(current.length+1, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; + System.arraycopy(arcs, 0, next, 0, arcs.length); + arcs = next; + } + + if (output.length == current.length) { + final Object[] next = new Object[ArrayUtil.oversize(current.length+1, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; + System.arraycopy(output, 0, next, 0, output.length); + output = next; + } + } + + private 
void appendOutput(Object addedOutput) { + Object newOutput; + if (current.length == 0) { + newOutput = addedOutput; + } else if (addedOutput == NO_OUTPUT) { + output[current.length] = output[current.length-1]; + return; + } else { + newOutput = fst.outputs.add(output[current.length-1], addedOutput); + } + output[current.length] = newOutput; + } + + private void appendFinalOutput(Object addedOutput) { + if (current.length == 0) { + result.output = addedOutput; + } else { + result.output = fst.outputs.add(output[current.length-1], addedOutput); + } + } + + private void push() { + + FST.Arc arc = arcs[current.length]; + assert arc != null; + + while(true) { + grow(); + + current.bytes[current.length] = (byte) arc.label; + appendOutput(arc.output); + //System.out.println(" push: append label=" + ((char) arc.label) + " output=" + fst.outputs.outputToString(arc.output)); + current.length++; + grow(); + + if (!fst.hasArcs(arc.target)) { + break; + } + + if (arc.isFinal()) { + appendFinalOutput(arc.nextFinalOutput); + lastFinal = true; + return; + } + + arc = fst.readArc(arc.target, getArc(current.length)); + } + result.output = output[current.length-1]; + } + + private void pop() { + while (current.length > 0 && arcs[current.length-1].isLast()) { + current.length--; + } + } + + private FST.Arc getArc(int idx) { + if (arcs[idx] == null) { + arcs[idx] = new FST.Arc(); + } + return arcs[idx]; + } +} Index: lucene/src/java/org/apache/lucene/util/automaton/fst/NoOutputs.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/NoOutputs.java Fri Dec 03 05:33:14 2010 -0500 @@ -0,0 +1,89 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Use this if you just want to build an FSA. + */ + +public final class NoOutputs extends Outputs { + + Object NO_OUTPUT = new Object() { + // NodeHash calls hashCode for this output; we fix this + // so we get deterministic hashing. + @Override + public int hashCode() { + return 42; + } + + @Override + public boolean equals(Object other) { + return other == this; + } + }; + + private NoOutputs() { + } + + public static NoOutputs getSingleton() { + return new NoOutputs(); + } + + @Override + public Object common(Object output1, Object output2) { + assert output1 == NO_OUTPUT; + assert output2 == NO_OUTPUT; + return NO_OUTPUT; + } + + @Override + public Object subtract(Object output, Object inc) { + assert output == NO_OUTPUT; + assert inc == NO_OUTPUT; + return NO_OUTPUT; + } + + @Override + public Object add(Object prefix, Object output) { + assert prefix == NO_OUTPUT: "got " + prefix; + assert output == NO_OUTPUT; + return NO_OUTPUT; + } + + @Override + public void write(Object prefix, FST fst) { + //assert false; + } + + @Override + public Object read(FST fst, PosRef pos) { + //assert false; + //return null; + return NO_OUTPUT; + } + + @Override + public Object getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(Object output) { + return ""; + } +} Index: lucene/src/java/org/apache/lucene/util/automaton/fst/NodeHash.java --- /dev/null Thu 
Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/NodeHash.java Fri Dec 03 05:33:14 2010 -0500 @@ -0,0 +1,165 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +// Used to dedup states (lookup already-frozen states) +final class NodeHash { + + private int[] table; + private int count; + private int mask; + private final FST fst; + private final FST.Arc scratchArc = new FST.Arc(); + + public NodeHash(FST fst) { + table = new int[16]; + mask = 15; + this.fst = fst; + } + + private boolean nodesEqual(Builder.Node node, int address) { + fst.readArc(address, scratchArc); + for(int arcUpto=0;arcUpto { + + /** Eg common("foo", "foobar") -> "foo" */ + public abstract T common(T output1, T output2); + + /** Eg subtract("foobar", "foo") -> "bar" */ + public abstract T subtract(T output, T inc); + + /** Eg add("foo", "bar") -> "foobar" */ + public abstract T add(T prefix, T output); + + public abstract void write(T output, FST fst); + + public abstract T read(FST fst, PosRef pos); + + /** NOTE: this output is compared with == so you must + * ensure that all methods return the single object if + * it's really no output */ + public abstract T getNoOutput(); + + public abstract String outputToString(T output); +} Index: lucene/src/java/org/apache/lucene/util/automaton/fst/PairOutputs.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/PairOutputs.java Fri Dec 03 05:33:14 2010 -0500 @@ -0,0 +1,112 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Pairs up two outputs into one. + */ + + +// nocommit fix generics +@SuppressWarnings({"unchecked"}) +public class PairOutputs extends Outputs { + + private final Pair NO_OUTPUT; + private final Outputs outputs1; + private final Outputs outputs2; + + public static class Pair { + public final Object output1; + public final Object output2; + + public Pair(Object output1, Object output2) { + this.output1 = output1; + this.output2 = output2; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof Pair) { + Pair pair = (Pair) other; + return output1.equals(pair.output1) && output2.equals(pair.output2); + } else { + return false; + } + } + + public int hashCode() { + return output1.hashCode() + output2.hashCode(); + } + }; + + public PairOutputs(Outputs outputs1, Outputs outputs2) { + this.outputs1 = outputs1; + this.outputs2 = outputs2; + NO_OUTPUT = new Pair(outputs1.getNoOutput(), outputs2.getNoOutput()); + } + + private Pair makePair(Object output1, Object output2) { + if (output1 == outputs1.getNoOutput() && output2 == outputs2.getNoOutput()) { + return NO_OUTPUT; + } else { + return new Pair(output1, output2); + } + } + + @Override + public Pair common(Pair pair1, Pair pair2) { + return makePair(outputs1.common(pair1.output1, pair2.output1), + outputs2.common(pair1.output2, pair2.output2)); + } + + @Override + public Pair subtract(Pair output, Pair inc) { + return makePair(outputs1.subtract(output.output1, inc.output1), + outputs2.subtract(output.output2, 
inc.output2)); + } + + @Override + public Pair add(Pair prefix, Pair output) { + return makePair(outputs1.add(prefix.output1, output.output1), + outputs2.add(prefix.output2, output.output2)); + } + + @Override + public void write(Pair output, FST fst) { + outputs1.write(output.output1, fst); + outputs2.write(output.output2, fst); + } + + @Override + public Pair read(FST fst, PosRef pos) { + Object output1 = outputs1.read(fst, pos); + Object output2 = outputs2.read(fst, pos); + return makePair(output1, output2); + } + + @Override + public Pair getNoOutput() { + return NO_OUTPUT; + } + + public String outputToString(Pair output) { + return ""; + } +} Index: lucene/src/java/org/apache/lucene/util/automaton/fst/PosRef.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/PosRef.java Fri Dec 03 05:33:14 2010 -0500 @@ -0,0 +1,38 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public final class PosRef { + public int pos; + + private static final ThreadLocal posRefs = new ThreadLocal(); + + public static PosRef get() { + PosRef posRef = posRefs.get(); + if (posRef == null) { + posRef = new PosRef(); + posRefs.set(posRef); + } + + return posRef; + } + + public void add(int inc) { + pos += inc; + } +} Index: lucene/src/java/org/apache/lucene/util/automaton/fst/PositiveIntOutputs.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/util/automaton/fst/PositiveIntOutputs.java Fri Dec 03 05:33:14 2010 -0500 @@ -0,0 +1,123 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +// TODO: make a sharing and non-sharing variant; eg if you +// output docFreq per term the FST will be smaller if you +// don't share since they are not "well shared" + +/** + * Output is a long, for each input term. NOTE: the + * resulting FST is not guaranteed to be minimal! See + * {@link Builder}. 
+ */ + +public final class PositiveIntOutputs extends Outputs { + + private final static Long NO_OUTPUT = new Long(0); + + private PositiveIntOutputs() { + } + + public static PositiveIntOutputs getSingleton() { + return new PositiveIntOutputs(); + } + + public Object get(long v) { + if (v == 0) { + return NO_OUTPUT; + } else { + return Long.valueOf(v); + } + } + + @Override + public Long common(Long output1, Long output2) { + assert valid(output1); + assert valid(output2); + if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) { + return NO_OUTPUT; + } else { + assert output1 > 0; + assert output2 > 0; + return Math.min(output1, output2); + } + } + + @Override + public Long subtract(Long output, Long inc) { + assert valid(output); + assert valid(inc); + assert output >= inc; + + if (inc == NO_OUTPUT) { + return output; + } else if (output.equals(inc)) { + return NO_OUTPUT; + } else { + return output - inc; + } + } + + @Override + public Long add(Long prefix, Long output) { + assert valid(prefix); + assert valid(output); + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + return prefix + output; + } + } + + @Override + public void write(Long output, FST fst) { + assert valid(output); + fst.writeVLong(output); + } + + @Override + public Long read(FST fst, PosRef pos) { + long v = fst.readVLong(pos); + if (v == 0) { + return NO_OUTPUT; + } else { + return v; + } + } + + private boolean valid(Long o) { + assert o != null; + assert o instanceof Long; + assert o == NO_OUTPUT || o > 0; + return true; + } + + @Override + public Long getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(Long output) { + return output.toString(); + } +} Index: lucene/src/test/org/apache/lucene/util/automaton/fst/TestSimpleFST.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/test/org/apache/lucene/util/automaton/fst/TestSimpleFST.java Fri Dec 03 05:33:14 2010 -0500 @@ -0,0 +1,812 @@ 
+package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.PrintStream; +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.MockDirectoryWrapper; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +// nocommit fix generics +@SuppressWarnings({"unchecked"}) +public class TestSimpleFST extends LuceneTestCase { + + private static final int MODE_NONE = 0; + private static final int MODE_ORD = 1; + private static final int MODE_NUM = 2; + private static final int MODE_PAIR = 3; + private static final int MODE_SEQ = 4; + + private MockDirectoryWrapper dir; + + public void setUp() throws 
IOException { + dir = newDirectory(); + dir.setPreventDoubleWrite(false); + } + + public void tearDown() throws IOException { + dir.close(); + } + + // ncoommit -- merge this w/ FSTEnum's + private static class InputOutput implements Comparable { + public final BytesRef input; + public final Object output; + + public InputOutput(BytesRef input, Object output) { + this.input = input; + this.output = output; + } + + public int compareTo(Object other) { + if (other instanceof InputOutput) { + return input.compareTo(((InputOutput) other).input); + } else if (other instanceof BytesRef) { + return input.compareTo((BytesRef) other); + } else { + throw new IllegalArgumentException(); + } + } + } + + public void testBasicFSA() throws IOException { + String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation"}; + BytesRef[] terms = new BytesRef[strings.length]; + for(int idx=0;idx fsts = new HashMap(); + doTest(terms, fsts); + + // FSA + FST fst = fsts.get(MODE_NONE); + assertNotNull(fst); + assertEquals(22, fst.getNodeCount()); + assertEquals(27, fst.getArcCount()); + + // FST ord pos int + fst = fsts.get(MODE_ORD); + assertNotNull(fst); + assertEquals(22, fst.getNodeCount()); + assertEquals(27, fst.getArcCount()); + + // FST byte sequence ord + fst = fsts.get(MODE_SEQ); + assertNotNull(fst); + assertEquals(24, fst.getNodeCount()); + assertEquals(30, fst.getArcCount()); + } + + private static String simpleRandomString(Random r) { + final int end = r.nextInt(10); + if (end == 0) { + // allow 0 length + return ""; + } + final char[] buffer = new char[end]; + for (int i = 0; i < end; i++) { + buffer[i] = (char) _TestUtil.nextInt(random, 97, 102); + } + return new String(buffer, 0, end); + } + + // given set of terms, test the different outputs for them + private void doTest(BytesRef[] terms, Map fsts) throws IOException { + Arrays.sort(terms); + for(int mode=0;mode<5;mode++) { + //System.out.println("\nTEST: mode=" + 
mode); + Outputs posIntOutputs = PositiveIntOutputs.getSingleton(); + + final Outputs outputs; + if (mode == MODE_NONE) { + outputs = NoOutputs.getSingleton(); + } else if (mode == MODE_ORD || mode == MODE_NUM) { + outputs = posIntOutputs; + } else if (mode == MODE_PAIR) { + outputs = new PairOutputs(posIntOutputs, posIntOutputs); + } else { + outputs = ByteSequenceOutputs.getSingleton(); + } + + final Object NO_OUTPUT = outputs.getNoOutput(); + + final Random r2 = new Random(random.nextLong()); + + InputOutput[] pairs = new InputOutput[terms.length]; + long lastOutput = 0; + for(int idx=0;idx termsSet = new HashSet(); + BytesRef[] terms = new BytesRef[numWords]; + while(termsSet.size() < numWords) { + final String term = getRandomString(); + termsSet.add(new BytesRef(term)); + } + doTest(termsSet.toArray(new BytesRef[termsSet.size()]), null); + } + } + + private String getRandomString() { + final String term; + if (random.nextBoolean()) { + term = _TestUtil.randomRealisticUnicodeString(random); + } else { + // we want to mix in limited-alphabet symbols so + // we get more sharing of the nodes given how few + // terms we are testing... 
+ term = simpleRandomString(random); + } + return term; + } + + @Nightly + public void testBigSet() throws IOException { + testRandomWords(50000, RANDOM_MULTIPLIER); + } + + private Object randomAcceptedWord(FST fst, BytesRef in) { + int node = fst.getStartNode(); + + if (fst.noNodes()) { + // degenerate FST: only accepts the empty string + assertTrue(fst.getEmptyOutput() != null); + in.length = 0; + return fst.getEmptyOutput(); + } + final List arcs = new ArrayList(); + in.length = 0; + in.offset = 0; + Object output = fst.outputs.getNoOutput(); + //System.out.println("get random"); + while(true) { + // read all arcs: + //System.out.println(" n=" + node); + int arcAddress = node; + while(true) { + final FST.Arc arc = fst.readArc(arcAddress, new FST.Arc()); + arcs.add(arc); + if (arc.isLast()) { + break; + } + arcAddress = arc.nextArc; + } + + // pick one + FST.Arc arc = arcs.get(random.nextInt(arcs.size())); + + arcs.clear(); + + // append label + if (in.bytes.length == in.length) { + in.grow(1+in.length); + } + in.bytes[in.length++] = (byte) arc.label; + + output = fst.outputs.add(output, arc.output); + + // maybe stop + if (arc.isFinal()) { + if (fst.hasArcs(arc.target)) { + // final state but it also has outgoing edges + if (random.nextBoolean()) { + output = fst.outputs.add(output, arc.nextFinalOutput); + break; + } + } else { + break; + } + } + + node = arc.target; + } + + return output; + } + + // runs the term, returning the output, or null if term + // isn't accepted. 
if stopNode is non-null it must be + // length 2 int array; stopNode[0] will be the last + // matching node (-1 if the term is accepted) + // and stopNode[1] will be the length of the + // term prefix that matches + private static Object run(FST fst, BytesRef term, int[] stopNode) { + if (term.length == 0) { + final Object output = fst.getEmptyOutput(); + if (stopNode != null) { + stopNode[1] = 0; + if (output != null) { + // accepted + stopNode[0] = -1; + } else { + stopNode[0] = fst.getStartNode(); + } + } + return output; + } + + final FST.Arc arc = new FST.Arc(); + int node = fst.getStartNode(); + int lastNode = -1; + Object output = fst.outputs.getNoOutput(); + for(int i=0;i termsMap = new HashMap(); + for(InputOutput pair : pairs) { + termsMap.put(pair.input, pair.output); + } + + // find random matching word and make sure it's valid + final BytesRef scratch = new BytesRef(); + for(int iter=0;iter<500*RANDOM_MULTIPLIER;iter++) { + Object output = randomAcceptedWord(fst, scratch); + assertTrue("accepted word " + scratch.utf8ToString() + " is not valid", termsMap.containsKey(scratch)); + assertEquals(termsMap.get(scratch), output); + } + + // test single FSTEnum.advance: + //System.out.println("TEST: verify advance"); + for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { + final FSTEnum fstEnum = new FSTEnum(fst); + if (random.nextBoolean()) { + // seek to term that doesn't exist: + while(true) { + final BytesRef term = new BytesRef(getRandomString()); + // HMM: little risky: I rely on who's .compareTo + // Arrays uses: + int pos = Arrays.binarySearch(pairs, term); + if (pos < 0) { + pos = -(pos+1); + // ok doesn't exist + //System.out.println(" seek " + term.utf8ToString()); + final FSTEnum.InputOutput seekResult = fstEnum.advance(term); + if (pos < pairs.length) { + //System.out.println(" got " + seekResult.input.utf8ToString() + " output=" + fst.outputs.outputToString(seekResult.output)); + assertEquals(pairs[pos].input, seekResult.input); + 
assertEquals(pairs[pos].output, seekResult.output); + } else { + // seeked beyond end + //System.out.println("seek=" + seekTerm); + assertNull("expected null but got " + (seekResult==null ? "null" : seekResult.input.utf8ToString()), seekResult); + } + + break; + } + } + } else { + // seek to term that does exist: + InputOutput pair = pairs[random.nextInt(pairs.length)]; + //System.out.println(" seek " + + //pair.input.utf8ToString()); + final FSTEnum.InputOutput seekResult = fstEnum.advance(pair.input); + assertEquals(pair.input, seekResult.input); + assertEquals(pair.output, seekResult.output); + } + } + + if (VERBOSE) { + System.out.println("TEST: mixed next/advance"); + } + + // test mixed next/advance + for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { + if (VERBOSE) { + System.out.println("TEST: iter " + iter); + } + final FSTEnum fstEnum = new FSTEnum(fst); + int upto = -1; + while(true) { + boolean isDone = false; + if (upto == pairs.length-1 || random.nextBoolean()) { + // next + upto++; + if (VERBOSE) { + System.out.println(" do next"); + } + isDone = fstEnum.next() == null; + } else if (upto != -1 && upto < 0.75 * pairs.length && random.nextBoolean()) { + int attempt = 0; + for(;attempt<10;attempt++) { + BytesRef term = new BytesRef(getRandomString()); + if (!termsMap.containsKey(term) && term.compareTo(pairs[upto].input) > 0) { + if (VERBOSE) { + System.out.println(" do non-exist advance(" + term.utf8ToString() + "]"); + } + int pos = Arrays.binarySearch(pairs, term); + assert pos < 0; + upto = -(pos+1); + isDone = fstEnum.advance(term) == null; + break; + } + } + if (attempt == 10) { + continue; + } + + } else { + final int inc = random.nextInt(pairs.length - upto - 1); + upto += inc; + if (upto == -1) { + upto = 0; + } + + if (VERBOSE) { + System.out.println(" do advance(" + pairs[upto].input.utf8ToString() + "]"); + } + isDone = fstEnum.advance(pairs[upto].input) == null; + } + if (VERBOSE) { + if (!isDone) { + System.out.println(" got " + 
fstEnum.current().input.utf8ToString()); // (tail of a method whose opening lines precede this hunk)
+        } else {
+          System.out.println(" got null");
+        }
+      }
+
+      if (upto == pairs.length) {
+        assertTrue(isDone);
+        break;
+      } else {
+        assertFalse(isDone);
+        assertEquals(pairs[upto].input, fstEnum.current().input);
+        assertEquals(pairs[upto].output, fstEnum.current().output);
+      }
+    }
+  }
+
+  // Per-prefix statistics computed by brute force, used to model the builder's pruning.
+  private static class CountMinOutput {
+    int count;             // how many input terms contain this prefix
+    Object output;         // common output of all terms containing this prefix (folded via outputs.common)
+    Object finalOutput;    // snapshot of output taken when the prefix equals a whole input term
+    boolean isLeaf = true; // cleared when some kept longer prefix extends this one
+    boolean isFinal;       // true when the prefix is itself a complete input term
+  }
+
+  // FST is pruned: rebuild the expected surviving prefixes by brute force, then check the FST against them
+  private void verifyPruned(FST fst, Outputs outputs, InputOutput[] pairs, int prune1, int prune2) { // NOTE(review): generic type parameters appear stripped by mail/paste mangling throughout this hunk -- restore from the original patch
+
+    if (VERBOSE) {
+      System.out.println("TEST: now verify pruned " + pairs.length + " terms; outputs=" + outputs);
+      for(int idx=0;idx prefixes = new HashMap(); // NOTE(review): a span of code was eaten here (everything between "idx<" and a later ">" was tag-stripped); line is a fused remnant
+    final BytesRef scratch = new BytesRef();
+    for(InputOutput pair: pairs) {
+      scratch.copy(pair.input);
+      // enumerate every prefix of this term, including the empty prefix and the whole term
+      for(int idx=0;idx<=pair.input.length;idx++) {
+        scratch.length = idx;
+        CountMinOutput cmo = prefixes.get(scratch);
+        if (cmo == null) {
+          cmo = new CountMinOutput();
+          cmo.count = 1;
+          cmo.output = pair.output;
+          prefixes.put(new BytesRef(scratch), cmo);
+        } else {
+          cmo.count++;
+          cmo.output = outputs.common(cmo.output, pair.output);
+        }
+        if (idx == pair.input.length) {
+          cmo.isFinal = true;
+          cmo.finalOutput = cmo.output;
+        }
+      }
+    }
+
+    //System.out.println("TEST: now prune");
+
+    // prune 'em
+    final Iterator> it = prefixes.entrySet().iterator(); // NOTE(review): generic params stripped in transit
+    while(it.hasNext()) {
+      Map.Entry ent = it.next();
+      final BytesRef prefix = ent.getKey();
+      final CountMinOutput cmo = ent.getValue();
+      //System.out.println(" term=" + prefix.utf8ToString() + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf);
+      final boolean keep;
+      if (prune1 > 0) {
+        keep = cmo.count >= prune1;
+      } else {
+        assert prune2 > 0;
+        if (prune2 > 1 && cmo.count >= prune2) {
+          keep = true;
+        } else if (prefix.length > 0) {
+          // consult our parent
+          scratch.length = prefix.length-1;
+          System.arraycopy(prefix.bytes, prefix.offset, scratch.bytes, 0, scratch.length);
+          final CountMinOutput cmo2 = prefixes.get(scratch);
+          //System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
+          keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
+        } else if (cmo.count >= prune2) {
+          keep = true;
+        } else {
+          keep = false;
+        }
+      }
+
+      if (!keep) {
+        it.remove();
+        //System.out.println(" remove");
+      } else {
+        // clear isLeaf for all ancestors
+        //System.out.println(" keep");
+        scratch.copy(prefix);
+        scratch.length--;
+        while(scratch.length >= 0) {
+          final CountMinOutput cmo2 = prefixes.get(scratch);
+          if (cmo2 != null) {
+            //System.out.println(" clear isLeaf " + scratch.utf8ToString());
+            cmo2.isLeaf = false;
+          }
+          scratch.length--;
+        }
+      }
+    }
+
+    //System.out.println("TEST: after prune");
+    /*
+    for(Map.Entry ent : prefixes.entrySet()) {
+      System.out.println(" " + ent.getKey().utf8ToString() + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
+      if (ent.getValue().isFinal) {
+        System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
+      }
+    }
+    */
+
+    if (prefixes.size() <= 1) { // only the empty prefix survived: the whole FST must have been pruned away
+      assertNull(fst);
+      return;
+    }
+
+    assertNotNull(fst);
+
+    // make sure FST only enums valid prefixes
+    FSTEnum fstEnum = new FSTEnum(fst); // NOTE(review): generic params stripped in transit
+    FSTEnum.InputOutput current;
+    while((current = fstEnum.next()) != null) {
+      //System.out.println(" fst enum term=" + current.input.utf8ToString() + " output=" + outputs.outputToString(current.output));
+      final CountMinOutput cmo = prefixes.get(current.input);
+      assertNotNull(cmo);
+      assertTrue(cmo.isLeaf || cmo.isFinal);
+      if (cmo.isFinal && !cmo.isLeaf) {
+        assertEquals(cmo.finalOutput, current.output);
+      } else {
+        assertEquals(cmo.output, current.output);
+      }
+    }
+
+    // make sure all non-pruned prefixes are present in the FST
+    final int[] stopNode = new int[2];
+    for(Map.Entry ent : prefixes.entrySet()) {
+      if (ent.getKey().length > 0) {
+        final CountMinOutput cmo = ent.getValue();
+        final Object output = run(fst, ent.getKey(), stopNode);
+        //System.out.println(" term=" + ent.getKey().utf8ToString() + " output=" + outputs.outputToString(cmo.output));
+        // if (cmo.isFinal && !cmo.isLeaf) {
+        if (cmo.isFinal) {
+          assertEquals(cmo.finalOutput, output);
+        } else {
+          assertEquals(cmo.output, output);
+        }
+        assertEquals(ent.getKey().length, stopNode[1]); // run() must consume the entire prefix
+      }
+    }
+  }
+
+  // java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestSimpleFST /x/tmp/allTerms3.txt out
+  public static void main(String[] args) throws IOException {
+    final String wordsFileIn = args[0];
+    final String dirOut = args[1];
+    int idx = 2;
+    int prune = 0;
+    int limit = Integer.MAX_VALUE;
+    while(idx < args.length) {
+      if (args[idx].equals("-prune")) {
+        prune = Integer.valueOf(args[1+idx]);
+        idx++;
+      }
+      if (args[idx].equals("-limit")) { // NOTE(review): not "else if" -- after consuming -prune this re-tests the flag's VALUE; misparses if the prune value is literally "-limit"
+        limit = Integer.valueOf(args[1+idx]);
+        idx++;
+      }
+      idx++;
+    }
+
+    BufferedReader is = new BufferedReader(new FileReader(wordsFileIn), 65536); // NOTE(review): never closed and uses platform charset -- acceptable for a throwaway benchmark main
+
+    //Outputs outputs = NoOutputs.getSingleton();
+    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
+    Builder builder = new Builder(0, prune, prune == 0, outputs); // NOTE(review): generic params stripped in transit
+    //final Object NO_OUTPUT = outputs.getNoOutput();
+
+    final BytesRef term = new BytesRef();
+    final long tStart = System.currentTimeMillis();
+    int count = 0;
+    while(true) {
+      String w = is.readLine();
+      if (w == null) {
+        break;
+      }
+      term.copy(w);
+      Object output = outputs.get(count++); // output for each term is its ordinal in the input file
+      builder.add(term, output);
+      if (builder.getTermCount() % 1000000 == 0) {
+        System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + builder.getTermCount() + "...");
+      }
+      if (builder.getTermCount() >= limit) {
+        break;
+      }
+    }
+    final FST fst = builder.finish();
+    if (fst == null) {
+      System.out.println("FST was fully pruned!");
+      System.exit(0);
+    }
+
+    System.out.println(fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
+
+    if (fst.getNodeCount() < 100) { // small enough to visualize
+      PrintStream ps = new PrintStream("out.dot");
+      fst.toDot(ps);
+      ps.close();
+      System.out.println("Wrote FST to out.dot");
+    }
+
+    Directory dir = FSDirectory.open(new File(dirOut));
+    IndexOutput out = dir.createOutput("fst.bin");
+    fst.save(out);
+    out.close();
+
+    System.out.println("Saved FST to fst.bin.");
+  }
+}