Index: modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java =================================================================== --- modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java (revision 1237044) +++ modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java (working copy) @@ -329,8 +329,11 @@ private boolean descendWithPrefix(Arc arc, BytesRef utf8) throws IOException { final int max = utf8.offset + utf8.length; + // Cannot save as instance var since multiple threads + // can use FSTCompletion at once... + final FST.BytesReader fstReader = automaton.getBytesReader(0); for (int i = utf8.offset; i < max; i++) { - if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc) == null) { + if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc, fstReader) == null) { // No matching prefixes, return an empty result. return false; } Index: modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java =================================================================== --- modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java (revision 1237044) +++ modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java (working copy) @@ -234,7 +234,7 @@ final Object empty = outputs.getNoOutput(); final Builder builder = new Builder( FST.INPUT_TYPE.BYTE1, 0, 0, true, true, - shareMaxTailLength, outputs, null); + shareMaxTailLength, outputs, null, false); BytesRef scratch = new BytesRef(); final IntsRef scratchIntsRef = new IntsRef(); Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java (revision 1237044) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java (working copy) @@ -226,6 
+226,9 @@ private final FST fst; + private final FST.BytesReader fstReader; + + private final BytesRef scratchBytes = new BytesRef(); private final CharsRef scratchChars = new CharsRef(); @@ -241,7 +244,9 @@ this.synonyms = synonyms; this.ignoreCase = ignoreCase; this.fst = synonyms.fst; if (fst == null) { throw new IllegalArgumentException("fst must be non-null"); } + // Fetch the reader only after the null check above, so a null FST fails with IllegalArgumentException rather than a NullPointerException. + this.fstReader = fst.getBytesReader(0); @@ -366,7 +371,7 @@ int bufUpto = 0; while(bufUpto < bufferLen) { final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen); - if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) { + if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) { //System.out.println(" stop"); break byToken; } @@ -388,7 +393,7 @@ // See if the FST wants to continue matching (ie, needs to // see the next input token): - if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) { + if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) { // No further rules can match here; we're done // searching for matching rules starting at the // current input position. 
Index: modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java =================================================================== --- modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (revision 1237044) +++ modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (working copy) @@ -131,7 +131,7 @@ System.out.println(" encode..."); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true); - Builder fstBuilder = new Builder(FST.INPUT_TYPE.BYTE2, fstOutput); + Builder fstBuilder = new Builder(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true); IntsRef scratch = new IntsRef(); long ord = -1; // first ord will be 0 String lastValue = null; @@ -155,13 +155,14 @@ for (int i = 0; i < token.length(); i++) { scratch.ints[i] = (int) token.charAt(i); } - fstBuilder.add(scratch, fstOutput.get(ord)); + fstBuilder.add(scratch, ord); } dictionary.addMapping((int)ord, offset); offset = next; } - FST fst = fstBuilder.finish(); + final FST fst = fstBuilder.finish().pack(2, 100000); + System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... 
"); dictionary.setFST(fst); System.out.println(" done"); Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java =================================================================== --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (revision 1237044) +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (working copy) @@ -113,7 +113,7 @@ for (int i = 0; i < token.length(); i++) { scratch.ints[i] = (int) token.charAt(i); } - fstBuilder.add(scratch, fstOutput.get(ord)); + fstBuilder.add(scratch, ord); segmentations.add(wordIdAndLength); ord++; } Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoFST.java =================================================================== --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoFST.java (revision 1237044) +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoFST.java (working copy) @@ -47,9 +47,10 @@ FST.Arc firstArc = new FST.Arc(); fst.getFirstArc(firstArc); FST.Arc arc = new FST.Arc(); + final FST.BytesReader fstReader = fst.getBytesReader(0); // TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs) for (int i = 0; i < rootCache.length; i++) { - if (fst.findTargetArc(0x3040 + i, firstArc, arc) != null) { + if (fst.findTargetArc(0x3040 + i, firstArc, arc, fstReader) != null) { rootCache[i] = new FST.Arc().copyFrom(arc); } } @@ -67,7 +68,10 @@ return arc; } } else { - return fst.findTargetArc(ch, follow, arc); + // TODO: could require caller to pass in the + // FSTReader... since a tokenStream is thread private + // anyway... 
+ return fst.findTargetArc(ch, follow, arc, fst.getBytesReader(0)); } } Index: modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$fst.dat =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java =================================================================== --- lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (revision 1237044) +++ lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (working copy) @@ -89,11 +89,11 @@ return br; } - private static IntsRef toIntsRef(String s, int inputMode) { + static IntsRef toIntsRef(String s, int inputMode) { return toIntsRef(s, inputMode, new IntsRef(10)); } - private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) { + static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) { if (inputMode == 0) { // utf8 return toIntsRef(new BytesRef(s), ir); @@ -103,7 +103,7 @@ } } - private static IntsRef toIntsRefUTF32(String s, IntsRef ir) { + static IntsRef toIntsRefUTF32(String s, IntsRef ir) { final int charLength = s.length(); int charIdx = 0; int intIdx = 0; @@ -120,7 +120,7 @@ return ir; } - private static IntsRef toIntsRef(BytesRef br, IntsRef ir) { + static IntsRef toIntsRef(BytesRef br, IntsRef ir) { if (br.length > ir.ints.length) { ir.grow(br.length); } @@ -172,7 +172,7 @@ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final List> pairs = new ArrayList>(terms2.length); for(int idx=0;idx(terms2[idx], outputs.get(idx))); + pairs.add(new FSTTester.InputOutput(terms2[idx], (long) idx)); } final FST fst = new FSTTester(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false); assertNotNull(fst); @@ -230,7 +230,7 @@ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final List> pairs = new ArrayList>(terms.length); for(int 
idx=0;idx(terms[idx], outputs.get(idx))); + pairs.add(new FSTTester.InputOutput(terms[idx], (long) idx)); } new FSTTester(random, dir, inputMode, pairs, outputs, true).doTest(); } @@ -244,7 +244,7 @@ for(int idx=0;idx(terms[idx], outputs.get(value))); + pairs.add(new FSTTester.InputOutput(terms[idx], value)); } new FSTTester(random, dir, inputMode, pairs, outputs, doShare).doTest(); } @@ -254,7 +254,7 @@ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); final List> pairs = new ArrayList>(terms.length); for(int idx=0;idx(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE)); + pairs.add(new FSTTester.InputOutput(terms[idx], random.nextLong() & Long.MAX_VALUE)); } new FSTTester(random, dir, inputMode, pairs, outputs, false).doTest(); } @@ -270,8 +270,7 @@ final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000); lastOutput = value; pairs.add(new FSTTester.InputOutput>(terms[idx], - outputs.get(o1.get(idx), - o2.get(value)))); + outputs.newPair((long) idx, value))); } new FSTTester>(random, dir, inputMode, pairs, outputs, false).doTest(); } @@ -393,6 +392,7 @@ final FST.Arc arc = fst.getFirstArc(new FST.Arc()); final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; + final FST.BytesReader fstReader = fst.getBytesReader(0); for(int i=0;i<=term.length;i++) { final int label; @@ -401,8 +401,9 @@ } else { label = term.ints[term.offset+i]; } - //System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal()); - if (fst.findTargetArc(label, arc, arc) == null) { + // System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal()); + if (fst.findTargetArc(label, arc, arc, fstReader) == null) { + // System.out.println(" not found"); if (prefixLength != null) { prefixLength[0] = i; return output; @@ 
-462,16 +463,19 @@ FST doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException { if (VERBOSE) { - System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2); + System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2); } + final boolean willRewrite = random.nextBoolean(); + final Builder builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, prune1, prune2, prune1==0 && prune2==0, allowRandomSuffixSharing ? random.nextBoolean() : true, allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE, outputs, - null); + null, + willRewrite); for(InputOutput pair : pairs) { if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) { @@ -486,7 +490,7 @@ } FST fst = builder.finish(); - if (random.nextBoolean() && fst != null) { + if (random.nextBoolean() && fst != null && !willRewrite) { TestFSTs t = new TestFSTs(); IOContext context = t.newIOContext(random); IndexOutput out = dir.createOutput("fst.bin", context); @@ -522,6 +526,21 @@ verifyPruned(inputMode, fst, prune1, prune2); } + if (willRewrite && fst != null) { + if (VERBOSE) { + System.out.println("TEST: now rewrite"); + } + final FST packed =fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000)); + if (VERBOSE) { + System.out.println("TEST: now verify packed FST"); + } + if (prune1 == 0 && prune2 == 0) { + verifyUnPruned(inputMode, packed); + } else { + verifyPruned(inputMode, packed, prune1, prune2); + } + } + return fst; } @@ -638,7 +657,7 @@ num = atLeast(100); for(int iter=0;iter fst = builder.finish(); - //System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount()); - // NOTE: we produce 7 nodes today - assertEquals(6, fst.getNodeCount()); - // NOTE: we produce 8 arcs today - assertEquals(7, fst.getNodeCount()); - //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); - //Util.toDot(fst, w, false, false); - //w.close(); - } - */ - - // 
NOTE: this test shows a case where our current builder - // fails to produce minimal FST: - /* - public void test4() throws Exception { - final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); - Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - IntsRef scratchIntsRef = new IntsRef(); - builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.getNoOutput()); - builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), new BytesRef("1")); - builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), new BytesRef("11")); - final FST fst = builder.finish(); - //System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount()); - // NOTE: we produce 7 nodes today - assertEquals(6, fst.getNodeCount()); - // NOTE: we produce 8 arcs today - assertEquals(7, fst.getNodeCount()); - //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); - //Util.toDot(fst, w, false, false); - //w.close(); - } - */ - // Build FST for all unique terms in the test line docs // file, up until a time limit public void testRealTerms() throws Exception { @@ -1126,8 +1109,11 @@ IndexReader r = IndexReader.open(writer, true); writer.close(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); - Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); + final boolean doRewrite = random.nextBoolean(); + + Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doRewrite); + boolean storeOrd = random.nextBoolean(); if (VERBOSE) { if (storeOrd) { @@ -1162,59 +1148,69 @@ } else { output = termsEnum.docFreq(); } - builder.add(Util.toIntsRef(term, scratchIntsRef), outputs.get(output)); + builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output); ord++; if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) { System.out.println(ord + " terms..."); } } - final FST fst = builder.finish(); + FST fst = 
builder.finish(); if (VERBOSE) { System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes"); } if (ord > 0) { - // Now confirm BytesRefFSTEnum and TermsEnum act the - // same: - final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); - int num = atLeast(1000); - for(int iter=0;iter fstEnum = new BytesRefFSTEnum(fst); + int num = atLeast(1000); + for(int iter=0;iter nextResult = fstEnum.next(); + if (nextResult != null) { + System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output)); + fail(); + } + break; } - BytesRefFSTEnum.InputOutput nextResult = fstEnum.next(); - if (nextResult != null) { - System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output)); - fail(); - } - break; } } } @@ -1248,14 +1244,17 @@ private int inputMode; private final Outputs outputs; private final Builder builder; + private final boolean doPack; - public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs outputs) { + public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs outputs, boolean doPack, boolean noArcArrays) { this.dirOut = dirOut; this.wordsFileIn = wordsFileIn; this.inputMode = inputMode; this.outputs = outputs; - - builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null); + this.doPack = doPack; + + builder = new Builder(inputMode == 0 ? 
FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null, doPack); + builder.setAllowArrayArcs(!noArcArrays); } protected abstract T getOutput(IntsRef input, int ord) throws IOException; @@ -1287,14 +1286,15 @@ } assert builder.getTermCount() == ord; - final FST fst = builder.finish(); + FST fst = builder.finish(); if (fst == null) { System.out.println("FST was fully pruned!"); System.exit(0); } - if (dirOut == null) + if (dirOut == null) { return; + } System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes()); if (fst.getNodeCount() < 100) { @@ -1304,58 +1304,68 @@ System.out.println("Wrote FST to out.dot"); } - Directory dir = FSDirectory.open(new File(dirOut)); - IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT); - fst.save(out); - out.close(); + if (doPack) { + System.out.println("Pack..."); + fst = fst.pack(5, 10000000); + System.out.println("New size " + fst.sizeInBytes() + " bytes"); + } else { + Directory dir = FSDirectory.open(new File(dirOut)); + IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT); + fst.save(out); + out.close(); + System.out.println("Saved FST to fst.bin."); + } - System.out.println("Saved FST to fst.bin."); - if (!verify) { return; } System.out.println("\nNow verify..."); - is.close(); - is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); + while(true) { + is.close(); + is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); - ord = 0; - tStart = System.currentTimeMillis(); - while(true) { - String w = is.readLine(); - if (w == null) { - break; + ord = 0; + tStart = System.currentTimeMillis(); + while(true) { + String w = is.readLine(); + if (w == null) { + break; + } + toIntsRef(w, inputMode, intsRef); + T expected = getOutput(intsRef, ord); + T 
actual = Util.get(fst, intsRef); + if (actual == null) { + throw new RuntimeException("unexpected null output on input=" + w); + } + if (!actual.equals(expected)) { + throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w); + } + + ord++; + if (ord % 500000 == 0) { + System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "..."); + } + if (ord >= limit) { + break; + } } - toIntsRef(w, inputMode, intsRef); - T expected = getOutput(intsRef, ord); - T actual = Util.get(fst, intsRef); - if (actual == null) { - throw new RuntimeException("unexpected null output on input=" + w); - } - if (!actual.equals(expected)) { - throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w); - } - ord++; - if (ord % 500000 == 0) { - System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "..."); - } - if (ord >= limit) { - break; - } + double totSec = ((System.currentTimeMillis() - tStart)/1000.0); + System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)"); + + // NOTE: comment out to profile lookup... 
+ break; } - double totSec = ((System.currentTimeMillis() - tStart)/1000.0); - System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)"); - } finally { is.close(); } } } - // java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out + // java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out public static void main(String[] args) throws IOException { int prune = 0; int limit = Integer.MAX_VALUE; @@ -1363,7 +1373,8 @@ boolean storeOrds = false; boolean storeDocFreqs = false; boolean verify = true; - + boolean doPack = false; + boolean noArcArrays = false; String wordsFileIn = null; String dirOut = null; @@ -1381,10 +1392,14 @@ inputMode = 1; } else if (args[idx].equals("-docFreq")) { storeDocFreqs = true; + } else if (args[idx].equals("-noArcArrays")) { + noArcArrays = true; } else if (args[idx].equals("-ords")) { storeOrds = true; } else if (args[idx].equals("-noverify")) { verify = false; + } else if (args[idx].equals("-pack")) { + doPack = true; } else if (args[idx].startsWith("-")) { System.err.println("Unrecognized option: " + args[idx]); System.exit(-1); @@ -1413,44 +1428,44 @@ final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true); final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false); final PairOutputs outputs = new PairOutputs(o1, o2); - new VisitTerms>(dirOut, wordsFileIn, inputMode, prune, outputs) { + new VisitTerms>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { Random rand; @Override public PairOutputs.Pair getOutput(IntsRef input, int ord) { if (ord == 0) { rand = new Random(17); } - return new PairOutputs.Pair(o1.get(ord), - o2.get(_TestUtil.nextInt(rand, 1, 5000))); + return outputs.newPair((long) ord, + (long) _TestUtil.nextInt(rand, 1, 
5000)); } }.run(limit, verify); } else if (storeOrds) { // Store only ords final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { @Override public Long getOutput(IntsRef input, int ord) { - return outputs.get(ord); + return (long) ord; } }.run(limit, verify); } else if (storeDocFreqs) { // Store only docFreq final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false); - new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { Random rand; @Override public Long getOutput(IntsRef input, int ord) { if (ord == 0) { rand = new Random(17); } - return outputs.get(_TestUtil.nextInt(rand, 1, 5000)); + return (long) _TestUtil.nextInt(rand, 1, 5000); } }.run(limit, verify); } else { // Store nothing final NoOutputs outputs = NoOutputs.getSingleton(); final Object NO_OUTPUT = outputs.getNoOutput(); - new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { @Override public Object getOutput(IntsRef input, int ord) { return NO_OUTPUT; @@ -1468,6 +1483,46 @@ assertNull(fstEnum.seekCeil(new BytesRef("foobaz"))); } + /* + public void testTrivial() throws Exception { + + // Get outputs -- passing true means FST will share + // (delta code) the outputs. This should result in + // smaller FST if the outputs grow monotonically. 
But + // if numbers are "random", false should give smaller + // final size: + final NoOutputs outputs = NoOutputs.getSingleton(); + + String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"}; + + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, + 0, 0, + true, + true, + Integer.MAX_VALUE, + outputs, + null, + true); + Arrays.sort(strings); + final IntsRef scratch = new IntsRef(); + for(String s : strings) { + builder.add(Util.toIntsRef(new BytesRef(s), scratch), outputs.getNoOutput()); + } + final FST fst = builder.finish(); + System.out.println("DOT before rewrite"); + Writer w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/before.dot")); + Util.toDot(fst, w, false, false); + w.close(); + + final FST rewrite = new FST(fst, 1, 100); + + System.out.println("DOT after rewrite"); + w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/after.dot")); + Util.toDot(rewrite, w, false, false); + w.close(); + } + */ + public void testSimple() throws Exception { // Get outputs -- passing true means FST will share @@ -1484,9 +1539,9 @@ final BytesRef b = new BytesRef("b"); final BytesRef c = new BytesRef("c"); - builder.add(Util.toIntsRef(a, new IntsRef()), outputs.get(17)); - builder.add(Util.toIntsRef(b, new IntsRef()), outputs.get(42)); - builder.add(Util.toIntsRef(c, new IntsRef()), outputs.get(13824324872317238L)); + builder.add(Util.toIntsRef(a, new IntsRef()), 17L); + builder.add(Util.toIntsRef(b, new IntsRef()), 42L); + builder.add(Util.toIntsRef(c, new IntsRef()), 13824324872317238L); final FST fst = builder.finish(); @@ -1795,11 +1850,11 @@ public void testFinalOutputOnEndState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null); - builder.add(Util.toUTF32("stat", new IntsRef()), outputs.get(17)); - 
builder.add(Util.toUTF32("station", new IntsRef()), outputs.get(10)); + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null, random.nextBoolean()); + builder.add(Util.toUTF32("stat", new IntsRef()), 17L); + builder.add(Util.toUTF32("station", new IntsRef()), 10L); final FST fst = builder.finish(); - //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); + //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot")); StringWriter w = new StringWriter(); Util.toDot(fst, w, false, false); w.close(); @@ -1809,8 +1864,8 @@ public void testInternalFinalState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null); + final boolean willRewrite = random.nextBoolean(); + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite); builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput()); builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput()); final FST fst = builder.finish(); @@ -1819,17 +1874,23 @@ Util.toDot(fst, w, false, false); w.close(); //System.out.println(w.toString()); - assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1); + final String expected; + if (willRewrite) { + expected = "4 -> 3 [label=\"t\" style=\"bold\""; + } else { + expected = "8 -> 6 [label=\"t\" style=\"bold\""; + } + assertTrue(w.toString().indexOf(expected) != -1); } // Make sure raw FST can differentiate between final vs // non-final end nodes - public void testNonFinalStopNodes() throws Exception { + public void testNonFinalStopNode() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final Long nothing = outputs.getNoOutput(); final Builder b = new 
Builder(FST.INPUT_TYPE.BYTE1, outputs); - final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs); + final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs, false); final Builder.UnCompiledNode rootNode = new Builder.UnCompiledNode(b, 0); @@ -1839,8 +1900,8 @@ node.isFinal = true; rootNode.addArc('a', node); final Builder.CompiledNode frozen = new Builder.CompiledNode(); - frozen.address = fst.addNode(node); - rootNode.arcs[0].nextFinalOutput = outputs.get(17); + frozen.node = fst.addNode(node); + rootNode.arcs[0].nextFinalOutput = 17L; rootNode.arcs[0].isFinal = true; rootNode.arcs[0].output = nothing; rootNode.arcs[0].target = frozen; @@ -1851,13 +1912,18 @@ final Builder.UnCompiledNode node = new Builder.UnCompiledNode(b, 0); rootNode.addArc('b', node); final Builder.CompiledNode frozen = new Builder.CompiledNode(); - frozen.address = fst.addNode(node); + frozen.node = fst.addNode(node); rootNode.arcs[1].nextFinalOutput = nothing; - rootNode.arcs[1].output = outputs.get(42); + rootNode.arcs[1].output = 42L; rootNode.arcs[1].target = frozen; } fst.finish(fst.addNode(rootNode)); + + StringWriter w = new StringWriter(); + //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot")); + Util.toDot(fst, w, false, false); + w.close(); checkStopNodes(fst, outputs); Index: lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexWriter.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexWriter.java (working copy) @@ -229,7 +229,7 @@ ////System.out.println("VGW: field=" + fieldInfo.name); // Always put empty string in - fstBuilder.add(new IntsRef(), fstOutputs.get(termsFilePointer)); + fstBuilder.add(new IntsRef(), termsFilePointer); startTermsFilePointer = termsFilePointer; } @@ -260,7 +260,7 @@ final int lengthSave = text.length; text.length = 
indexedTermPrefixLength(lastTerm, text); try { - fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), fstOutputs.get(termsFilePointer)); + fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer); } finally { text.length = lengthSave; } Index: lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java (working copy) @@ -521,9 +521,10 @@ private void loadTerms() throws IOException { PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); final Builder>> b; - b = new Builder>>(FST.INPUT_TYPE.BYTE1, - new PairOutputs>(posIntOutputs, - new PairOutputs(posIntOutputs, posIntOutputs))); + final PairOutputs outputsInner = new PairOutputs(posIntOutputs, posIntOutputs); + final PairOutputs> outputs = new PairOutputs>(posIntOutputs, + outputsInner); + b = new Builder>>(FST.INPUT_TYPE.BYTE1, outputs); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRef lastTerm = new BytesRef(10); @@ -536,9 +537,9 @@ SimpleTextUtil.readLine(in, scratch); if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) { if (lastDocsStart != -1) { - b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair>(lastDocsStart, - new PairOutputs.Pair((long) docFreq, - posIntOutputs.get(totalTermFreq)))); + b.add(Util.toIntsRef(lastTerm, scratchIntsRef), + outputs.newPair(lastDocsStart, + outputsInner.newPair((long) docFreq, totalTermFreq))); sumTotalTermFreq += totalTermFreq; } break; @@ -553,9 +554,8 @@ totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); } else if (StringHelper.startsWith(scratch, TERM)) { if (lastDocsStart != -1) { - b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new 
PairOutputs.Pair>(lastDocsStart, - new PairOutputs.Pair((long) docFreq, - posIntOutputs.get(totalTermFreq)))); + b.add(Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair(lastDocsStart, + outputsInner.newPair((long) docFreq, totalTermFreq))); } lastDocsStart = in.getFilePointer(); final int len = scratch.length - TERM.length; Index: lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java (working copy) @@ -398,7 +398,7 @@ final long indexStartFP; final long rootBlockFP; final BytesRef rootCode; - private FST index; + private final FST index; //private boolean DEBUG; @@ -433,6 +433,8 @@ w.close(); } */ + } else { + index = null; } } @@ -495,6 +497,8 @@ private final BytesRef term = new BytesRef(); + private final FST.BytesReader fstReader; + // TODO: can we share this with the frame in STE? 
private final class Frame { final int ord; @@ -755,6 +759,12 @@ arcs[arcIdx] = new FST.Arc(); } + if (index == null) { + fstReader = null; + } else { + fstReader = index.getBytesReader(0); + } + // TODO: if the automaton is "smallish" we really // should use the terms index to seek at least to // the initial term and likely to subsequent terms @@ -842,7 +852,7 @@ // TODO: we could be more efficient for the next() // case by using current arc as starting point, // passed to findTargetArc - arc = index.findTargetArc(target, arc, getArc(1+idx)); + arc = index.findTargetArc(target, arc, getArc(1+idx), fstReader); assert arc != null; output = fstOutputs.add(output, arc.output); idx++; @@ -1186,6 +1196,7 @@ private boolean eof; final BytesRef term = new BytesRef(); + private final FST.BytesReader fstReader; @SuppressWarnings("unchecked") private FST.Arc[] arcs = new FST.Arc[1]; @@ -1196,6 +1207,12 @@ // Used to hold seek by TermState, or cached seek staticFrame = new Frame(-1); + if (index == null) { + fstReader = null; + } else { + fstReader = index.getBytesReader(0); + } + // Init w/ root block; don't use index since it may // not (and need not) have been loaded for(int arcIdx=0;arcIdx nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto)); + final FST.Arc nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader); if (nextArc == null) { @@ -1838,7 +1855,7 @@ final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; - final FST.Arc nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto)); + final FST.Arc nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader); if (nextArc == null) { Index: lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java 
(working copy) @@ -288,7 +288,7 @@ final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); final Builder indexBuilder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, false, Integer.MAX_VALUE, - outputs, null); + outputs, null, false); //if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); //} @@ -831,7 +831,7 @@ 0, 0, true, true, Integer.MAX_VALUE, noOutputs, - new FindBlocks()); + new FindBlocks(), false); postingsWriter.setField(fieldInfo); } Index: lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java (working copy) @@ -38,7 +38,8 @@ public final A output1; public final B output2; - public Pair(A output1, B output2) { + // use newPair + private Pair(A output1, B output2) { this.output1 = output1; this.output2 = output2; } @@ -66,35 +67,79 @@ this.outputs2 = outputs2; NO_OUTPUT = new Pair(outputs1.getNoOutput(), outputs2.getNoOutput()); } - - public Pair get(A output1, B output2) { - if (output1 == outputs1.getNoOutput() && output2 == outputs2.getNoOutput()) { + + /** Create a new Pair */ + public Pair newPair(A a, B b) { + if (a.equals(outputs1.getNoOutput())) { + a = outputs1.getNoOutput(); + } + if (b.equals(outputs2.getNoOutput())) { + b = outputs2.getNoOutput(); + } + + if (a == outputs1.getNoOutput() && b == outputs2.getNoOutput()) { return NO_OUTPUT; } else { - return new Pair(output1, output2); + final Pair p = new Pair(a, b); + assert valid(p); + return p; } } - + + // for assert + private boolean valid(Pair pair) { + final boolean noOutput1 = pair.output1.equals(outputs1.getNoOutput()); + final boolean noOutput2 = pair.output2.equals(outputs2.getNoOutput()); + + if (noOutput1 && pair.output1 != outputs1.getNoOutput()) { + System.out.println("invalid0"); + return false; + } + + if (noOutput2 && 
pair.output2 != outputs2.getNoOutput()) { + System.out.println("invalid1"); + return false; + } + + if (noOutput1 && noOutput2) { + if (pair != NO_OUTPUT) { + System.out.println("invalid2"); + return false; + } else { + return true; + } + } else { + return true; + } + } + @Override public Pair common(Pair pair1, Pair pair2) { - return get(outputs1.common(pair1.output1, pair2.output1), - outputs2.common(pair1.output2, pair2.output2)); + assert valid(pair1); + assert valid(pair2); + return newPair(outputs1.common(pair1.output1, pair2.output1), + outputs2.common(pair1.output2, pair2.output2)); } @Override public Pair subtract(Pair output, Pair inc) { - return get(outputs1.subtract(output.output1, inc.output1), - outputs2.subtract(output.output2, inc.output2)); + assert valid(output); + assert valid(inc); + return newPair(outputs1.subtract(output.output1, inc.output1), + outputs2.subtract(output.output2, inc.output2)); } @Override public Pair add(Pair prefix, Pair output) { - return get(outputs1.add(prefix.output1, output.output1), - outputs2.add(prefix.output2, output.output2)); + assert valid(prefix); + assert valid(output); + return newPair(outputs1.add(prefix.output1, output.output1), + outputs2.add(prefix.output2, output.output2)); } @Override public void write(Pair output, DataOutput writer) throws IOException { + assert valid(output); outputs1.write(output.output1, writer); outputs2.write(output.output2, writer); } @@ -103,7 +148,7 @@ public Pair read(DataInput in) throws IOException { A output1 = outputs1.read(in); B output2 = outputs2.read(in); - return get(output1, output2); + return newPair(output1, output2); } @Override @@ -113,6 +158,12 @@ @Override public String outputToString(Pair output) { + assert valid(output); return ""; } + + @Override + public String toString() { + return "PairOutputs<" + outputs1 + "," + outputs2 + ">"; + } } Index: lucene/src/java/org/apache/lucene/util/fst/Util.java 
=================================================================== --- lucene/src/java/org/apache/lucene/util/fst/Util.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/util/fst/Util.java (working copy) @@ -37,23 +37,21 @@ // TODO: would be nice not to alloc this on every lookup final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + final FST.BytesReader fstReader = fst.getBytesReader(0); + // Accumulate output as we go - final T NO_OUTPUT = fst.outputs.getNoOutput(); - T output = NO_OUTPUT; + T output = fst.outputs.getNoOutput(); for(int i=0;i T get(FST fst, BytesRef input) throws IOException { assert fst.inputType == FST.INPUT_TYPE.BYTE1; + final FST.BytesReader fstReader = fst.getBytesReader(0); + // TODO: would be nice not to alloc this on every lookup final FST.Arc arc = fst.getFirstArc(new FST.Arc()); // Accumulate output as we go - final T NO_OUTPUT = fst.outputs.getNoOutput(); - T output = NO_OUTPUT; + T output = fst.outputs.getNoOutput(); for(int i=0;i prevArc = null; @@ -238,6 +234,7 @@ // A queue of transitions to consider when processing the next level. final List> nextLevelQueue = new ArrayList>(); nextLevelQueue.add(startArc); + //System.out.println("toDot: startArc: " + startArc); // A list of states on the same level (for ranking). final List sameLevelStates = new ArrayList(); @@ -289,8 +286,11 @@ int level = 0; + final FST.BytesReader r = fst.getBytesReader(0); + while (!nextLevelQueue.isEmpty()) { // we could double buffer here, but it doesn't matter probably. 
+ //System.out.println("next level=" + level); thisLevelQueue.addAll(nextLevelQueue); nextLevelQueue.clear(); @@ -298,19 +298,19 @@ out.write("\n // Transitions and states at level: " + level + "\n"); while (!thisLevelQueue.isEmpty()) { final FST.Arc arc = thisLevelQueue.remove(thisLevelQueue.size() - 1); + //System.out.println(" pop: " + arc); if (fst.targetHasArcs(arc)) { - // scan all arcs + // scan all target arcs + //System.out.println(" readFirstTarget..."); final int node = arc.target; - fst.readFirstTargetArc(arc, arc); - if (arc.label == FST.END_LABEL) { - // Skip it -- prior recursion took this into account already - assert !arc.isLast(); - fst.readNextArc(arc); - } + fst.readFirstRealTargetArc(arc.target, arc, r); + //System.out.println(" firstTarget: " + arc); + while (true) { + //System.out.println(" cycle arc=" + arc); // Emit the unseen state and add it to the queue for the next level. if (arc.target >= 0 && !seen.get(arc.target)) { @@ -329,7 +329,7 @@ if (fst.isExpandedTarget(arc)) { stateColor = expandedNodeColor; } else { - stateColor = null; + stateColor = null; } final String finalOutput; @@ -339,7 +339,9 @@ finalOutput = ""; } - emitDotState(out, Integer.toString(arc.target), arc.isFinal() ? 
finalStateShape : stateShape, stateColor, finalOutput); + emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, finalOutput); + // To see the node address, use this instead: + //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target)); seen.set(arc.target); nextLevelQueue.add(new FST.Arc().copyFrom(arc)); sameLevelStates.add(arc.target); @@ -362,14 +364,22 @@ outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]"; } + final String arcColor; + if (arc.flag(FST.BIT_TARGET_NEXT)) { + arcColor = "red"; + } else { + arcColor = "black"; + } + assert arc.label != FST.END_LABEL; - out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"]\n"); + out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "" ) + " color=\"" + arcColor + "\"]\n"); // Break the loop if we're on the last arc of this state. if (arc.isLast()) { + //System.out.println(" break"); break; } - fst.readNextArc(arc); + fst.readNextRealArc(arc, r); } } } Index: lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java (working copy) @@ -25,10 +25,7 @@ /** * Output is a long, for each input term. NOTE: the * resulting FST is not guaranteed to be minimal! See - * {@link Builder}. You must use {@link #get} to obtain the - * output for a given long value -- do not use autoboxing - * nor create your own Long instance (the value 0 - * must map to the {@link #getNoOutput} singleton). + * {@link Builder}. * * @lucene.experimental */ @@ -50,14 +47,6 @@ return doShare ? 
singletonShare : singletonNoShare; } - public Long get(long v) { - if (v == 0) { - return NO_OUTPUT; - } else { - return Long.valueOf(v); - } - } - @Override public Long common(Long output1, Long output2) { assert valid(output1); Index: lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java (working copy) @@ -151,7 +151,8 @@ boolean found = false; while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; + in.pos = arc.posArcsStart; + in.skip(arc.bytesPerArc*mid+1); final int midLabel = fst.readLabel(in); final int cmp = midLabel - targetLabel; //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); @@ -275,7 +276,7 @@ // Now scan forward, matching the new suffix of the target while(true) { - //System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast()); + //System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast() + " bba=" + arc.bytesPerArc); if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) { // Arcs are fixed array -- use binary search to find @@ -289,15 +290,16 @@ boolean found = false; while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; + in.pos = arc.posArcsStart; + in.skip(arc.bytesPerArc*mid+1); final int midLabel = fst.readLabel(in); final int cmp = midLabel - targetLabel; //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); - if (cmp < 0) + if (cmp < 0) { low = mid + 1; - else if (cmp > 0) + } else if (cmp > 0) { high = mid - 1; - 
else { + } else { found = true; break; } @@ -430,9 +432,11 @@ FST.Arc arc = getArc(upto-1); int targetLabel = getTargetLabel(); + final FST.BytesReader fstReader = fst.getBytesReader(0); + while(true) { //System.out.println(" cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel)); - final FST.Arc nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto)); + final FST.Arc nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto), fstReader); if (nextArc == null) { // short circuit //upto--; Index: lucene/src/java/org/apache/lucene/util/fst/Outputs.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/Outputs.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/util/fst/Outputs.java (working copy) @@ -26,6 +26,10 @@ * Represents the outputs for an FST, providing the basic * algebra needed for the FST. * + *

Note that any operation that returns NO_OUTPUT must + * return the same singleton object from {@link + * #getNoOutput}.

+ * * @lucene.experimental */ @@ -56,6 +60,8 @@ public abstract String outputToString(T output); + // TODO: maybe make valid(T output) public...? for asserts + public T merge(T first, T second) { throw new UnsupportedOperationException(); } Index: lucene/src/java/org/apache/lucene/util/fst/Builder.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/Builder.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/util/fst/Builder.java (working copy) @@ -17,15 +17,15 @@ * limitations under the License. */ +import java.io.IOException; + import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc -import java.io.IOException; - /** - * Builds a compact FST (maps an IntsRef term to an arbitrary + * Builds a minimal FST (maps an IntsRef term to an arbitrary * output) from pre-sorted terms with outputs (the FST * becomes an FSA if you use NoOutputs). The FST is written * on-the-fly into a compact serialized format byte array, which can @@ -35,12 +35,6 @@ *

NOTE: The algorithm is described at * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698

* - * If your outputs are ByteSequenceOutput then the final FST - * will be minimal, but if you use PositiveIntOutput then - * it's only "near minimal". For example, aa/0, aab/1, bbb/2 - * will produce 6 states when a 5 state fst is also - * possible. - * * The parameterized type T is the output type. See the * subclasses of {@link Outputs}. * @@ -52,7 +46,7 @@ private final FST fst; private final T NO_OUTPUT; - // private static final boolean DEBUG = false; + // private static final boolean DEBUG = true; // simplistic pruning: we prune node (and all following // nodes) if less than this number of terms go through it: @@ -88,7 +82,7 @@ * pruning options turned off. */ public Builder(FST.INPUT_TYPE inputType, Outputs outputs) { - this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null); + this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false); } /** @@ -127,16 +121,20 @@ * @param outputs The output type for each input sequence. Applies only if building an FST. For * FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the * singleton output object. + * + * @param willPackFST Pass true if you will rewrite (compact) the FST before saving. 
This + * causes the FST to create additional data structures intenrally to facilitate rewriting, but + * it means the resulting FST cannot be saved: it must first be rewritten using {@link FST#FST(FST,int[])}} */ public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs, - FreezeTail freezeTail) { + FreezeTail freezeTail, boolean willPackFST) { this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount2 = minSuffixCount2; this.freezeTail = freezeTail; this.doShareNonSingletonNodes = doShareNonSingletonNodes; this.shareMaxTailLength = shareMaxTailLength; - fst = new FST(inputType, outputs); + fst = new FST(inputType, outputs, willPackFST); if (doShareSuffix) { dedupHash = new NodeHash(fst); } else { @@ -170,23 +168,23 @@ fst.setAllowArrayArcs(b); } - private CompiledNode compileNode(UnCompiledNode n, int tailLength) throws IOException { - final int address; - if (dedupHash != null && (doShareNonSingletonNodes || n.numArcs <= 1) && tailLength <= shareMaxTailLength) { - if (n.numArcs == 0) { - address = fst.addNode(n); + private CompiledNode compileNode(UnCompiledNode nodeIn, int tailLength) throws IOException { + final int node; + if (dedupHash != null && (doShareNonSingletonNodes || nodeIn.numArcs <= 1) && tailLength <= shareMaxTailLength) { + if (nodeIn.numArcs == 0) { + node = fst.addNode(nodeIn); } else { - address = dedupHash.add(n); + node = dedupHash.add(nodeIn); } } else { - address = fst.addNode(n); + node = fst.addNode(nodeIn); } - assert address != -2; + assert node != -2; - n.clear(); + nodeIn.clear(); final CompiledNode fn = new CompiledNode(); - fn.address = address; + fn.node = node; return fn; } @@ -319,6 +317,11 @@ } */ + // De-dup NO_OUTPUT since it must be a singleton: + if (output.equals(NO_OUTPUT)) { + output = NO_OUTPUT; + } + assert lastInput.length == 0 || input.compareTo(lastInput) >= 0: "inputs are added out of order 
lastInput=" + lastInput + " vs input=" + input; assert validOutput(output); @@ -443,7 +446,7 @@ } } //if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output); - fst.finish(compileNode(root, lastInput.length).address); + fst.finish(compileNode(root, lastInput.length).node); return fst; } @@ -480,7 +483,7 @@ } static final class CompiledNode implements Node { - int address; + int node; public boolean isCompiled() { return true; } @@ -560,7 +563,7 @@ final Arc arc = arcs[numArcs-1]; assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch; arc.target = target; - //assert target.address != -2; + //assert target.node != -2; arc.nextFinalOutput = nextFinalOutput; arc.isFinal = isFinal; } Index: lucene/src/java/org/apache/lucene/util/fst/NodeHash.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/NodeHash.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/util/fst/NodeHash.java (working copy) @@ -35,7 +35,7 @@ } private boolean nodesEqual(Builder.UnCompiledNode node, int address, FST.BytesReader in) throws IOException { - fst.readFirstRealArc(address, scratchArc, in); + fst.readFirstRealTargetArc(address, scratchArc, in); if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) { return false; } @@ -43,7 +43,7 @@ final Builder.Arc arc = node.arcs[arcUpto]; if (arc.label != scratchArc.label || !arc.output.equals(scratchArc.output) || - ((Builder.CompiledNode) arc.target).address != scratchArc.target || + ((Builder.CompiledNode) arc.target).node != scratchArc.target || !arc.nextFinalOutput.equals(scratchArc.nextFinalOutput) || arc.isFinal != scratchArc.isFinal()) { return false; @@ -71,9 +71,9 @@ // TODO: maybe if number of arcs is high we can safely subsample? 
for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; - //System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).address + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal); + //System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal); h = PRIME * h + arc.label; - h = PRIME * h + ((Builder.CompiledNode) arc.target).address; + h = PRIME * h + ((Builder.CompiledNode) arc.target).node; h = PRIME * h + arc.output.hashCode(); h = PRIME * h + arc.nextFinalOutput.hashCode(); if (arc.isFinal) { @@ -88,9 +88,9 @@ private int hash(int node) throws IOException { final int PRIME = 31; final FST.BytesReader in = fst.getBytesReader(0); - //System.out.println("hash frozen"); + //System.out.println("hash frozen node=" + node); int h = 0; - fst.readFirstRealArc(node, scratchArc, in); + fst.readFirstRealTargetArc(node, scratchArc, in); while(true) { //System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal()); h = PRIME * h + scratchArc.label; @@ -109,26 +109,26 @@ return h & Integer.MAX_VALUE; } - public int add(Builder.UnCompiledNode node) throws IOException { + public int add(Builder.UnCompiledNode nodeIn) throws IOException { // System.out.println("hash: add count=" + count + " vs " + table.length); final FST.BytesReader in = fst.getBytesReader(0); - final int h = hash(node); + final int h = hash(nodeIn); int pos = h & mask; int c = 0; while(true) { final int v = table[pos]; if (v == 0) { // freeze & add - final int address = fst.addNode(node); - //System.out.println(" now freeze addr=" + address); - assert hash(address) == h : "frozenHash=" + hash(address) + " vs h=" + h; + final int node = 
fst.addNode(nodeIn); + //System.out.println(" now freeze node=" + node); + assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h; count++; - table[pos] = address; + table[pos] = node; if (table.length < 2*count) { rehash(); } - return address; - } else if (nodesEqual(node, v, in)) { + return node; + } else if (nodesEqual(nodeIn, v, in)) { // same node is already here return v; } Index: lucene/src/java/org/apache/lucene/util/fst/FST.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/FST.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/util/fst/FST.java (working copy) @@ -25,6 +25,8 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.util.HashMap; +import java.util.Map; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; @@ -33,12 +35,23 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.fst.Builder.UnCompiledNode; +// TODO: break this into WritableFST and ReadOnlyFST.. then +// we can have subclasses of ReadOnlyFST to handle the +// different byte[] level encodings (packed or +// not)... and things like nodeCount, arcCount are read only + // TODO: if FST is pure prefix trie we can do a more compact // job, ie, once we are at a 'suffix only', just store the // completion labels as a string not as a series of arcs. +// TODO: maybe make an explicit thread state that holds +// reusable stuff eg BytesReader, a scratch arc + // NOTE: while the FST is able to represent a non-final // dead-end state (NON_FINAL_END_NODE=0), the layers above // (FSTEnum, Util) have problems with this!! 
@@ -52,13 +65,15 @@ * * @lucene.experimental */ -public class FST { +public final class FST { public static enum INPUT_TYPE {BYTE1, BYTE2, BYTE4}; public final INPUT_TYPE inputType; private final static int BIT_FINAL_ARC = 1 << 0; private final static int BIT_LAST_ARC = 1 << 1; - private final static int BIT_TARGET_NEXT = 1 << 2; + final static int BIT_TARGET_NEXT = 1 << 2; + + // TODO: we can free up a bit if we can nuke this: private final static int BIT_STOP_NODE = 1 << 3; private final static int BIT_ARC_HAS_OUTPUT = 1 << 4; private final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; @@ -66,8 +81,13 @@ // Arcs are stored as fixed-size (per entry) array, so // that we can find an arc using binary search. We do // this when number of arcs is > NUM_ARCS_ARRAY: - private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6; + // If set, the target node is delta coded vs current + // position: + private final static int BIT_TARGET_DELTA = 1 << 6; + + private final static byte ARCS_AS_FIXED_ARRAY = BIT_ARC_HAS_FINAL_OUTPUT; + /** * @see #shouldExpand(UnCompiledNode) */ @@ -95,8 +115,11 @@ /** Write BYTE2 labels as 2-byte short, not vInt. */ private final static int VERSION_SHORT_BYTE2_LABELS = 2; - private final static int VERSION_CURRENT = VERSION_SHORT_BYTE2_LABELS; + /** Added optional packed format. 
*/ + private final static int VERSION_PACKED = 3; + private final static int VERSION_CURRENT = VERSION_PACKED; + // Never serialized; just used to represent the virtual // final node w/ no arcs: private final static int FINAL_END_NODE = -1; @@ -126,6 +149,9 @@ public int arcCount; public int arcWithOutputCount; + private final boolean packed; + private final int[] nodeRefToAddress; + // If arc has this label then that arc is final/accepted public static final int END_LABEL = -1; @@ -137,10 +163,17 @@ public int label; public T output; + // From node (ord or address); currently only used when + // building an FST w/ willPackFST=true: + int node; + + // To node (ord or address): public int target; byte flags; public T nextFinalOutput; + + // address (into the byte[]), or ord/address if label == END_LABEL int nextArc; // This is non-zero if current arcs are fixed array: @@ -151,19 +184,18 @@ /** Returns this */ public Arc copyFrom(Arc other) { + node = other.node; label = other.label; target = other.target; flags = other.flags; output = other.output; nextFinalOutput = other.nextFinalOutput; nextArc = other.nextArc; - if (other.bytesPerArc != 0) { - bytesPerArc = other.bytesPerArc; + bytesPerArc = other.bytesPerArc; + if (bytesPerArc != 0) { posArcsStart = other.posArcsStart; arcIdx = other.arcIdx; numArcs = other.numArcs; - } else { - bytesPerArc = 0; } return this; } @@ -179,40 +211,91 @@ public boolean isFinal() { return flag(BIT_FINAL_ARC); } + + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + b.append("node=" + node); + b.append(" target=" + target); + b.append(" label=" + label); + if (flag(BIT_LAST_ARC)) { + b.append(" last"); + } + if (flag(BIT_FINAL_ARC)) { + b.append(" final"); + } + if (flag(BIT_TARGET_NEXT)) { + b.append(" targetNext"); + } + if (flag(BIT_ARC_HAS_OUTPUT)) { + b.append(" hasOutput"); + } + if (flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + b.append(" hasOutput"); + } + if (bytesPerArc != 0) { + b.append(" 
arcArray(idx=" + arcIdx + " of " + numArcs + ")"); + } + return b.toString(); + } }; - static boolean flag(int flags, int bit) { + private final static boolean flag(int flags, int bit) { return (flags & bit) != 0; } private final BytesWriter writer; - // make a new empty FST, for building - public FST(INPUT_TYPE inputType, Outputs outputs) { + // TODO: we can save RAM here by using growable packed + // ints...: + private int[] nodeAddress; + + // TODO: we could be smarter here, and prune periodically + // as we go; high in-count nodes will "usually" become + // clear early on: + private int[] inCounts; + + // make a new empty FST, for building; Builder invokes + // this ctor + FST(INPUT_TYPE inputType, Outputs outputs, boolean willPackFST) { this.inputType = inputType; this.outputs = outputs; bytes = new byte[128]; NO_OUTPUT = outputs.getNoOutput(); + if (willPackFST) { + nodeAddress = new int[8]; + inCounts = new int[8]; + } else { + nodeAddress = null; + inCounts = null; + } writer = new BytesWriter(); emptyOutput = null; + packed = false; + nodeRefToAddress = null; } - // create an existing FST + /** Load a previously saved FST. 
*/ public FST(DataInput in, Outputs outputs) throws IOException { this.outputs = outputs; writer = null; // NOTE: only reads most recent format; we don't have // back-compat promise for FSTs (they are experimental): - CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_SHORT_BYTE2_LABELS, VERSION_SHORT_BYTE2_LABELS); + CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_PACKED); + packed = in.readByte() == 1; if (in.readByte() == 1) { // accepts empty string int numBytes = in.readVInt(); // messy bytes = new byte[numBytes]; in.readBytes(bytes, 0, numBytes); - emptyOutput = outputs.read(getBytesReader(numBytes-1)); + if (packed) { + emptyOutput = outputs.read(getBytesReader(0)); + } else { + emptyOutput = outputs.read(getBytesReader(numBytes-1)); + } } else { emptyOutput = null; } @@ -230,6 +313,15 @@ default: throw new IllegalStateException("invalid input type " + t); } + if (packed) { + final int nodeRefCount = in.readVInt(); + nodeRefToAddress = new int[nodeRefCount]; + for(int idx=0;idx[]) new FST.Arc[0x80]; - final FST.Arc arc = new FST.Arc(); + cachedRootArcs = (Arc[]) new Arc[0x80]; + final Arc arc = new Arc(); getFirstArc(arc); final BytesReader in = getBytesReader(0); if (targetHasArcs(arc)) { - readFirstRealArc(arc.target, arc, in); + readFirstRealTargetArc(arc.target, arc, in); while(true) { assert arc.label != END_LABEL; if (arc.label < cachedRootArcs.length) { @@ -307,14 +416,16 @@ outputs.write(emptyOutput, writer); emptyOutputBytes = new byte[writer.posWrite-posSave]; - // reverse - final int stopAt = (writer.posWrite - posSave)/2; - int upto = 0; - while(upto < stopAt) { - final byte b = bytes[posSave + upto]; - bytes[posSave+upto] = bytes[writer.posWrite-upto-1]; - bytes[writer.posWrite-upto-1] = b; - upto++; + if (!packed) { + // reverse + final int stopAt = (writer.posWrite - posSave)/2; + int upto = 0; + while(upto < stopAt) { + final byte b = bytes[posSave + upto]; + bytes[posSave+upto] = bytes[writer.posWrite-upto-1]; + 
bytes[writer.posWrite-upto-1] = b; + upto++; + } } System.arraycopy(bytes, posSave, emptyOutputBytes, 0, writer.posWrite-posSave); writer.posWrite = posSave; @@ -324,7 +435,15 @@ if (startNode == -1) { throw new IllegalStateException("call finish first"); } + if (nodeAddress != null) { + throw new IllegalStateException("cannot save an FST pre-packed FST; it must first be packed"); + } CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT); + if (packed) { + out.writeByte((byte) 1); + } else { + out.writeByte((byte) 0); + } // TODO: really we should encode this as an arc, arriving // to the root node, instead of special casing here: if (emptyOutput != null) { @@ -343,6 +462,13 @@ t = 2; } out.writeByte(t); + if (packed) { + assert nodeRefToAddress != null; + out.writeVInt(nodeRefToAddress.length); + for(int idx=0;idx arc) { + public static boolean targetHasArcs(Arc arc) { return arc.target > 0; } // serializes new node by appending its bytes to the end // of the current byte[] - int addNode(Builder.UnCompiledNode node) throws IOException { - //System.out.println("FST.addNode pos=" + posWrite + " numArcs=" + node.numArcs); - if (node.numArcs == 0) { - if (node.isFinal) { + int addNode(Builder.UnCompiledNode nodeIn) throws IOException { + //System.out.println("FST.addNode pos=" + writer.posWrite + " numArcs=" + nodeIn.numArcs); + if (nodeIn.numArcs == 0) { + if (nodeIn.isFinal) { return FINAL_END_NODE; } else { return NON_FINAL_END_NODE; @@ -437,15 +563,15 @@ int startAddress = writer.posWrite; //System.out.println(" startAddr=" + startAddress); - final boolean doFixedArray = shouldExpand(node); + final boolean doFixedArray = shouldExpand(nodeIn); final int fixedArrayStart; if (doFixedArray) { - if (bytesPerArc.length < node.numArcs) { - bytesPerArc = new int[ArrayUtil.oversize(node.numArcs, 1)]; + if (bytesPerArc.length < nodeIn.numArcs) { + bytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, 1)]; } // write a "false" first arc: - writer.writeByte((byte) 
BIT_ARCS_AS_FIXED_ARRAY); - writer.writeVInt(node.numArcs); + writer.writeByte(ARCS_AS_FIXED_ARRAY); + writer.writeVInt(nodeIn.numArcs); // placeholder -- we'll come back and write the number // of bytes per arc (int) here: // TODO: we could make this a vInt instead @@ -456,15 +582,14 @@ fixedArrayStart = 0; } - nodeCount++; - arcCount += node.numArcs; + arcCount += nodeIn.numArcs; - final int lastArc = node.numArcs-1; + final int lastArc = nodeIn.numArcs-1; int lastArcStart = writer.posWrite; int maxBytesPerArc = 0; - for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; + for(int arcIdx=0;arcIdx arc = nodeIn.arcs[arcIdx]; final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; int flags = 0; @@ -472,7 +597,10 @@ flags += BIT_LAST_ARC; } - if (lastFrozenNode == target.address && !doFixedArray) { + if (lastFrozenNode == target.node && !doFixedArray) { + // TODO: for better perf (but more RAM used) we + // could avoid this except when arc is "near" the + // last arc: flags += BIT_TARGET_NEXT; } @@ -485,10 +613,12 @@ assert arc.nextFinalOutput == NO_OUTPUT; } - boolean targetHasArcs = target.address > 0; + boolean targetHasArcs = target.node > 0; if (!targetHasArcs) { flags += BIT_STOP_NODE; + } else if (inCounts != null) { + inCounts[target.node]++; } if (arc.output != NO_OUTPUT) { @@ -498,19 +628,23 @@ writer.writeByte((byte) flags); writeLabel(arc.label); - //System.out.println(" write arc: label=" + arc.label + " flags=" + flags); + // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + writer.posWrite + " output=" + outputs.outputToString(arc.output)); if (arc.output != NO_OUTPUT) { outputs.write(arc.output, writer); + //System.out.println(" write output"); arcWithOutputCount++; } + if (arc.nextFinalOutput != NO_OUTPUT) { + //System.out.println(" write final output"); outputs.write(arc.nextFinalOutput, writer); } - if (targetHasArcs && (doFixedArray || lastFrozenNode != target.address)) 
{ - assert target.address > 0; - writer.writeInt(target.address); + if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { + assert target.node > 0; + //System.out.println(" write target"); + writer.writeInt(target.node); } // just write the arcs "like normal" on first pass, @@ -530,10 +664,11 @@ // such cases if (doFixedArray) { + //System.out.println(" doFixedArray"); assert maxBytesPerArc > 0; // 2nd pass just "expands" all arcs to take up a fixed // byte size - final int sizeNeeded = fixedArrayStart + node.numArcs * maxBytesPerArc; + final int sizeNeeded = fixedArrayStart + nodeIn.numArcs * maxBytesPerArc; bytes = ArrayUtil.grow(bytes, sizeNeeded); // TODO: we could make this a vInt instead bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24); @@ -543,9 +678,9 @@ // expand the arcs in place, backwards int srcPos = writer.posWrite; - int destPos = fixedArrayStart + node.numArcs*maxBytesPerArc; + int destPos = fixedArrayStart + nodeIn.numArcs*maxBytesPerArc; writer.posWrite = destPos; - for(int arcIdx=node.numArcs-1;arcIdx>=0;arcIdx--) { + for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) { //System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos); destPos -= maxBytesPerArc; srcPos -= bytesPerArc[arcIdx]; @@ -559,7 +694,7 @@ // reverse bytes in-place; we do this so that the // "BIT_TARGET_NEXT" opto can work, ie, it reads the // node just before the current one - final int endAddress = lastFrozenNode = writer.posWrite - 1; + final int endAddress = writer.posWrite - 1; int left = startAddress; int right = endAddress; @@ -568,13 +703,31 @@ bytes[left++] = bytes[right]; bytes[right--] = b; } + //System.out.println(" endAddress=" + endAddress); - return endAddress; + nodeCount++; + final int node; + if (nodeAddress != null) { + // Nodes are addressed by 1+ord: + if (nodeCount == nodeAddress.length) { + nodeAddress = ArrayUtil.grow(nodeAddress); + inCounts = ArrayUtil.grow(inCounts); + } + nodeAddress[nodeCount] = endAddress; 
+ // System.out.println(" write nodeAddress[" + nodeCount + "] = " + endAddress); + node = nodeCount; + } else { + node = endAddress; + } + lastFrozenNode = node; + + return node; } /** Fills virtual 'start' arc, ie, an empty incoming arc to * the FST's start node */ public Arc getFirstArc(Arc arc) { + if (emptyOutput != null) { arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; arc.nextFinalOutput = emptyOutput; @@ -585,7 +738,7 @@ arc.output = NO_OUTPUT; // If there are no nodes, ie, the FST only accepts the - // empty string, then startNode is 0, and then readFirstTargetArc + // empty string, then startNode is 0 arc.target = startNode; return arc; } @@ -602,20 +755,27 @@ //System.out.println(" end node"); assert follow.isFinal(); arc.label = END_LABEL; + arc.target = FINAL_END_NODE; arc.output = follow.nextFinalOutput; arc.flags = BIT_LAST_ARC; return arc; } else { - final BytesReader in = getBytesReader(follow.target); - arc.flags = in.readByte(); - if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) { + final BytesReader in = getBytesReader(getNodeAddress(follow.target)); + arc.node = follow.target; + final byte b = in.readByte(); + if (b == ARCS_AS_FIXED_ARRAY) { // array: jump straight to end arc.numArcs = in.readVInt(); - arc.bytesPerArc = in.readInt(); + if (packed) { + arc.bytesPerArc = in.readVInt(); + } else { + arc.bytesPerArc = in.readInt(); + } //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); arc.posArcsStart = in.pos; arc.arcIdx = arc.numArcs - 2; } else { + arc.flags = b; // non-array: linear scan arc.bytesPerArc = 0; //System.out.println(" scan"); @@ -631,11 +791,17 @@ if (arc.flag(BIT_STOP_NODE)) { } else if (arc.flag(BIT_TARGET_NEXT)) { } else { - in.pos -= 4; + if (packed) { + in.readVInt(); + } else { + in.skip(4); + } } arc.flags = in.readByte(); } - arc.nextArc = in.pos+1; + // Undo the byte flags we read: + in.skip(-1); + arc.nextArc = in.pos; } readNextRealArc(arc, in); assert arc.isLast(); @@ -657,35 +823,48 @@ // Insert 
"fake" final first arc: arc.label = END_LABEL; arc.output = follow.nextFinalOutput; + arc.flags = BIT_FINAL_ARC; if (follow.target <= 0) { - arc.flags = BIT_LAST_ARC | BIT_FINAL_ARC; + arc.flags |= BIT_LAST_ARC; } else { - arc.flags = BIT_FINAL_ARC; + arc.node = follow.target; + // NOTE: nextArc is a node (not an address!) in this case: arc.nextArc = follow.target; } + arc.target = FINAL_END_NODE; //System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + arc.isLast() + " output=" + outputs.outputToString(arc.output)); return arc; } else { - return readFirstRealArc(follow.target, arc, getBytesReader(0)); + return readFirstRealTargetArc(follow.target, arc, getBytesReader(0)); } } - public Arc readFirstRealArc(int address, Arc arc, final BytesReader in) throws IOException { + public Arc readFirstRealTargetArc(int node, Arc arc, final BytesReader in) throws IOException { + final int address = getNodeAddress(node); in.pos = address; - arc.flags = in.readByte(); + //System.out.println(" readFirstRealTargtArc address=" + //+ address); + //System.out.println(" flags=" + arc.flags); + arc.node = node; - if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) { + if (in.readByte() == ARCS_AS_FIXED_ARRAY) { //System.out.println(" fixedArray"); // this is first arc in a fixed-array arc.numArcs = in.readVInt(); - arc.bytesPerArc = in.readInt(); + if (packed) { + arc.bytesPerArc = in.readVInt(); + } else { + arc.bytesPerArc = in.readInt(); + } arc.arcIdx = -1; arc.nextArc = arc.posArcsStart = in.pos; //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos); } else { + //arc.flags = b; arc.nextArc = address; arc.bytesPerArc = 0; } + return readNextRealArc(arc, in); } @@ -699,9 +878,8 @@ if (!targetHasArcs(follow)) { return false; } else { - final BytesReader in = getBytesReader(follow.target); - final byte b = in.readByte(); - return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0; + final BytesReader in = 
getBytesReader(getNodeAddress(follow.target)); + return in.readByte() == ARCS_AS_FIXED_ARRAY; } } @@ -710,10 +888,9 @@ if (arc.label == END_LABEL) { // This was a fake inserted "final" arc if (arc.nextArc <= 0) { - // This arc went to virtual final node, ie has no outgoing arcs - return null; + throw new IllegalArgumentException("cannot readNextArc when arc.isLast()=true"); } - return readFirstRealArc(arc.nextArc, arc, getBytesReader(0)); + return readFirstRealTargetArc(arc.nextArc, arc, getBytesReader(0)); } else { return readNextRealArc(arc, getBytesReader(0)); } @@ -727,19 +904,24 @@ final BytesReader in; if (arc.label == END_LABEL) { //System.out.println(" nextArc fake " + arc.nextArc); - in = getBytesReader(arc.nextArc); - byte flags = bytes[in.pos]; - if (flag(flags, BIT_ARCS_AS_FIXED_ARRAY)) { + in = getBytesReader(getNodeAddress(arc.nextArc)); + final byte b = bytes[in.pos]; + if (b == ARCS_AS_FIXED_ARRAY) { //System.out.println(" nextArc fake array"); - in.pos--; + in.skip(1); in.readVInt(); - in.readInt(); + if (packed) { + in.readVInt(); + } else { + in.readInt(); + } } } else { if (arc.bytesPerArc != 0) { //System.out.println(" nextArc real array"); // arcs are at fixed entries - in = getBytesReader(arc.posArcsStart - (1+arc.arcIdx)*arc.bytesPerArc); + in = getBytesReader(arc.posArcsStart); + in.skip((1+arc.arcIdx)*arc.bytesPerArc); } else { // arcs are packed //System.out.println(" nextArc real packed"); @@ -754,12 +936,16 @@ /** Never returns null, but you should never call this if * arc.isLast() is true. 
*/ public Arc readNextRealArc(Arc arc, final BytesReader in) throws IOException { + + // TODO: can't assert this because we call from readFirstArc + // assert !flag(arc.flags, BIT_LAST_ARC); + // this is a continuing arc in a fixed array if (arc.bytesPerArc != 0) { // arcs are at fixed entries arc.arcIdx++; assert arc.arcIdx < arc.numArcs; - in.pos = arc.posArcsStart - arc.arcIdx*arc.bytesPerArc; + in.skip(arc.posArcsStart, arc.arcIdx*arc.bytesPerArc); } else { // arcs are packed in.pos = arc.nextArc; @@ -788,45 +974,61 @@ arc.nextArc = in.pos; } else if (arc.flag(BIT_TARGET_NEXT)) { arc.nextArc = in.pos; - if (!arc.flag(BIT_LAST_ARC)) { - if (arc.bytesPerArc == 0) { - // must scan - seekToNextNode(in); + // TODO: would be nice to make this lazy -- maybe + // caller doesn't need the target and is scanning arcs... + if (nodeAddress == null) { + if (!arc.flag(BIT_LAST_ARC)) { + if (arc.bytesPerArc == 0) { + // must scan + seekToNextNode(in); + } else { + in.skip(arc.posArcsStart, arc.bytesPerArc * arc.numArcs); + } + } + arc.target = in.pos; + } else { + arc.target = arc.node - 1; + assert arc.target > 0; + } + } else { + if (packed) { + final int pos = in.pos; + final int code = in.readVInt(); + if (arc.flag(BIT_TARGET_DELTA)) { + // Address is delta-coded from current address: + arc.target = pos + code; + //System.out.println(" delta pos=" + pos + " delta=" + code + " target=" + arc.target); + } else if (code < nodeRefToAddress.length) { + // Deref + arc.target = nodeRefToAddress[code]; + //System.out.println(" deref code=" + code + " target=" + arc.target); } else { - in.pos = arc.posArcsStart - arc.bytesPerArc * arc.numArcs; + // Absolute + arc.target = code; + //System.out.println(" abs code=" + code + " derefLen=" + nodeRefToAddress.length); } + } else { + arc.target = in.readInt(); } - arc.target = in.pos; - } else { - arc.target = in.readInt(); arc.nextArc = in.pos; } - return arc; } /** Finds an arc leaving the incoming arc, replacing the arc in place. 
* This returns null if the arc was not found, else the incoming arc. */ - public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc) throws IOException { + public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc, BytesReader in) throws IOException { assert cachedRootArcs != null; - // Short-circuit if this arc is in the root arc cache: - if (follow.target == startNode && labelToMatch != END_LABEL && labelToMatch < cachedRootArcs.length) { - final Arc result = cachedRootArcs[labelToMatch]; - if (result == null) { - return result; - } else { - arc.copyFrom(result); - return arc; - } - } - + if (labelToMatch == END_LABEL) { if (follow.isFinal()) { if (follow.target <= 0) { arc.flags = BIT_LAST_ARC; } else { arc.flags = 0; + // NOTE: nextArc is a node (not an address!) in this case: arc.nextArc = follow.target; + arc.node = follow.target; } arc.output = follow.nextFinalOutput; arc.label = END_LABEL; @@ -836,35 +1038,49 @@ } } + // Short-circuit if this arc is in the root arc cache: + if (follow.target == startNode && labelToMatch < cachedRootArcs.length) { + final Arc result = cachedRootArcs[labelToMatch]; + if (result == null) { + return result; + } else { + arc.copyFrom(result); + return arc; + } + } + if (!targetHasArcs(follow)) { return null; } - // TODO: maybe make an explicit thread state that holds - // reusable stuff eg BytesReader: - final BytesReader in = getBytesReader(follow.target); + in.pos = getNodeAddress(follow.target); + arc.node = follow.target; + // System.out.println("fta label=" + (char) labelToMatch); - if ((in.readByte() & BIT_ARCS_AS_FIXED_ARRAY) != 0) { + if (in.readByte() == ARCS_AS_FIXED_ARRAY) { // Arcs are full array; do binary search: arc.numArcs = in.readVInt(); - //System.out.println(" bs " + arc.numArcs); - arc.bytesPerArc = in.readInt(); + if (packed) { + arc.bytesPerArc = in.readVInt(); + } else { + arc.bytesPerArc = in.readInt(); + } arc.posArcsStart = in.pos; int low = 0; int high = arc.numArcs-1; while (low <= high) { 
//System.out.println(" cycle"); int mid = (low + high) >>> 1; - in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; + in.skip(arc.posArcsStart, arc.bytesPerArc*mid + 1); int midLabel = readLabel(in); final int cmp = midLabel - labelToMatch; - if (cmp < 0) + if (cmp < 0) { low = mid + 1; - else if (cmp > 0) + } else if (cmp > 0) { high = mid - 1; - else { + } else { arc.arcIdx = mid-1; //System.out.println(" found!"); return readNextRealArc(arc, in); @@ -875,7 +1091,8 @@ } // Linear scan - readFirstTargetArc(follow, arc); + readFirstRealTargetArc(follow.target, arc, in); + while(true) { //System.out.println(" non-bs cycle"); // TODO: we should fix this code to not have to create @@ -889,7 +1106,7 @@ } else if (arc.isLast()) { return null; } else { - readNextArc(arc); + readNextRealArc(arc, in); } } } @@ -910,7 +1127,11 @@ } if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) { - in.readInt(); + if (packed) { + in.readVInt(); + } else { + in.readInt(); + } } if (flag(flags, BIT_LAST_ARC)) { @@ -969,6 +1190,7 @@ @Override public void writeByte(byte b) { + assert posWrite <= bytes.length; if (bytes.length == posWrite) { bytes = ArrayUtil.grow(bytes); } @@ -976,6 +1198,13 @@ bytes[posWrite++] = b; } + public void setPosWrite(int posWrite) { + this.posWrite = posWrite; + if (bytes.length < posWrite) { + bytes = ArrayUtil.grow(bytes, posWrite); + } + } + @Override public void writeBytes(byte[] b, int offset, int length) { final int size = posWrite + length; @@ -987,15 +1216,24 @@ public final BytesReader getBytesReader(int pos) { // TODO: maybe re-use via ThreadLocal? 
- return new BytesReader(bytes, pos); + if (packed) { + return new ForwardBytesReader(bytes, pos); + } else { + return new ReverseBytesReader(bytes, pos); + } } /** Expert */ - public final static class BytesReader extends DataInput { - final byte[] bytes; + public static abstract class BytesReader extends DataInput { int pos; + abstract void skip(int byteCount); + abstract void skip(int base, int byteCount); + } - public BytesReader(byte[] bytes, int pos) { + final static class ReverseBytesReader extends BytesReader { + final byte[] bytes; + + public ReverseBytesReader(byte[] bytes, int pos) { this.bytes = bytes; this.pos = pos; } @@ -1011,5 +1249,541 @@ b[offset+i] = bytes[pos--]; } } + + public void skip(int count) { + pos -= count; + } + + public void skip(int base, int count) { + pos = base - count; + } } + + // TODO: can we use just ByteArrayDataInput...? need to + // add a .skipBytes to DataInput.. hmm and .setPosition + final static class ForwardBytesReader extends BytesReader { + final byte[] bytes; + + public ForwardBytesReader(byte[] bytes, int pos) { + this.bytes = bytes; + this.pos = pos; + } + + @Override + public byte readByte() { + return bytes[pos++]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + System.arraycopy(bytes, pos, b, offset, len); + pos += len; + } + + public void skip(int count) { + pos += count; + } + + public void skip(int base, int count) { + pos = base + count; + } + } + + private static class ArcAndState { + final Arc arc; + final IntsRef chain; + + public ArcAndState(Arc arc, IntsRef chain) { + this.arc = arc; + this.chain = chain; + } + } + + /* + public void countSingleChains() throws IOException { + // TODO: must assert this FST was built with + // "willRewrite" + + final List> queue = new ArrayList>(); + + // TODO: use bitset to not revisit nodes already + // visited + + FixedBitSet seen = new FixedBitSet(1+nodeCount); + int saved = 0; + + queue.add(new ArcAndState(getFirstArc(new Arc()), new 
IntsRef())); + Arc scratchArc = new Arc(); + while(queue.size() > 0) { + //System.out.println("cycle size=" + queue.size()); + //for(ArcAndState ent : queue) { + // System.out.println(" " + Util.toBytesRef(ent.chain, new BytesRef())); + // } + final ArcAndState arcAndState = queue.get(queue.size()-1); + seen.set(arcAndState.arc.node); + final BytesRef br = Util.toBytesRef(arcAndState.chain, new BytesRef()); + if (br.length > 0 && br.bytes[br.length-1] == -1) { + br.length--; + } + //System.out.println(" top node=" + arcAndState.arc.target + " chain=" + br.utf8ToString()); + if (targetHasArcs(arcAndState.arc) && !seen.get(arcAndState.arc.target)) { + // push + readFirstTargetArc(arcAndState.arc, scratchArc); + //System.out.println(" push label=" + (char) scratchArc.label); + //System.out.println(" tonode=" + scratchArc.target + " last?=" + scratchArc.isLast()); + + final IntsRef chain = IntsRef.deepCopyOf(arcAndState.chain); + chain.grow(1+chain.length); + // TODO + //assert scratchArc.label != END_LABEL; + chain.ints[chain.length] = scratchArc.label; + chain.length++; + + if (scratchArc.isLast()) { + if (scratchArc.target != -1 && inCounts[scratchArc.target] == 1) { + //System.out.println(" append"); + } else { + if (arcAndState.chain.length > 1) { + saved += chain.length-2; + try { + System.out.println("chain: " + Util.toBytesRef(chain, new BytesRef()).utf8ToString()); + } catch (AssertionError ae) { + System.out.println("chain: " + Util.toBytesRef(chain, new BytesRef())); + } + } + chain.length = 0; + } + } else { + //System.out.println(" reset"); + if (arcAndState.chain.length > 1) { + saved += arcAndState.chain.length-2; + try { + System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef()).utf8ToString()); + } catch (AssertionError ae) { + System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef())); + } + } + if (scratchArc.target != -1 && inCounts[scratchArc.target] != 1) { + chain.length = 0; + } else { + 
chain.ints[0] = scratchArc.label; + chain.length = 1; + } + } + // TODO: instead of new Arc() we can re-use from + // a by-depth array + queue.add(new ArcAndState(new Arc().copyFrom(scratchArc), chain)); + } else if (!arcAndState.arc.isLast()) { + // next + readNextArc(arcAndState.arc); + //System.out.println(" next label=" + (char) arcAndState.arc.label + " len=" + arcAndState.chain.length); + if (arcAndState.chain.length != 0) { + arcAndState.chain.ints[arcAndState.chain.length-1] = arcAndState.arc.label; + } + } else { + if (arcAndState.chain.length > 1) { + saved += arcAndState.chain.length-2; + System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef()).utf8ToString()); + } + // pop + //System.out.println(" pop"); + queue.remove(queue.size()-1); + while(queue.size() > 0 && queue.get(queue.size()-1).arc.isLast()) { + queue.remove(queue.size()-1); + } + if (queue.size() > 0) { + final ArcAndState arcAndState2 = queue.get(queue.size()-1); + readNextArc(arcAndState2.arc); + //System.out.println(" read next=" + (char) arcAndState2.arc.label + " queue=" + queue.size()); + assert arcAndState2.arc.label != END_LABEL; + if (arcAndState2.chain.length != 0) { + arcAndState2.chain.ints[arcAndState2.chain.length-1] = arcAndState2.arc.label; + } + } + } + } + + System.out.println("TOT saved " + saved); + } + */ + + // Creates a packed FST + private FST(INPUT_TYPE inputType, int[] nodeRefToAddress, Outputs outputs) { + packed = true; + this.inputType = inputType; + bytes = new byte[128]; + this.nodeRefToAddress = nodeRefToAddress; + this.outputs = outputs; + NO_OUTPUT = outputs.getNoOutput(); + writer = new BytesWriter(); + } + + /** Expert: creates an FST by packing this one. This + * process requires substantial additional RAM (currently + * ~8 bytes per node), but then should produce a smaller FST. 
*/ + public FST pack(int minInCountDeref, int maxDerefNodes) throws IOException { + + // TODO: other things to try + // - renumber the nodes to get more next / better locality? + // - allow multiple input labels on an arc, so + // singular chain of inputs can take one arc (on + // wikipedia terms this could save another ~6%) + // - in the ord case, the output '1' is presumably + // very common (after NO_OUTPUT)... maybe use a bit + // for it..? + // - use spare bits in flags.... for top few labels / + // outputs / targets + + if (nodeAddress == null) { + throw new IllegalArgumentException("this FST was not built with willPackFST=true"); + } + + Arc arc = new Arc(); + + final BytesReader r = getBytesReader(0); + + final int topN = Math.min(maxDerefNodes, inCounts.length); + + // Find top nodes with highest number of incoming arcs: + NodeQueue q = new NodeQueue(topN); + + NodeAndInCount bottom = null; + for(int node=0;node= minInCountDeref) { + if (bottom == null) { + q.add(new NodeAndInCount(node, inCounts[node])); + if (q.size() == topN) { + bottom = q.top(); + } + } else if (inCounts[node] > bottom.count) { + q.insertWithOverflow(new NodeAndInCount(node, inCounts[node])); + } + } + } + + // Free up RAM: + inCounts = null; + + final Map topNodeMap = new HashMap(); + for(int downTo=q.size()-1;downTo>=0;downTo--) { + NodeAndInCount n = q.pop(); + topNodeMap.put(n.node, downTo); + //System.out.println("map node=" + n.node + " inCount=" + n.count + " to newID=" + downTo); + } + + // TODO: we can use packed ints: + // +1 because node ords start at 1 (0 is reserved as + // stop node): + final int[] nodeRefToAddressIn = new int[topNodeMap.size()]; + + final FST fst = new FST(inputType, nodeRefToAddressIn, outputs); + + final BytesWriter writer = fst.writer; + + final int[] newNodeAddress = new int[1+nodeCount]; + + // Fill initial coarse guess: + for(int node=1;node<=nodeCount;node++) { + newNodeAddress[node] = 1 + bytes.length - nodeAddress[node]; + } + + int absCount; + 
int deltaCount; + int topCount; + int nextCount; + + // Iterate until we converge: + while(true) { + + //System.out.println("\nITER"); + boolean changed = false; + + // for assert: + boolean negDelta = false; + + writer.posWrite = 0; + // Skip 0 byte since 0 is reserved target: + writer.writeByte((byte) 0); + + fst.arcWithOutputCount = 0; + fst.nodeCount = 0; + fst.arcCount = 0; + + absCount = deltaCount = topCount = nextCount = 0; + + int changedCount = 0; + + int addressError = 0; + + // Since we re-reverse the bytes, we now write the + // nodes backwards, so that BIT_TARGET_NEXT is + // unchanged: + for(int node=nodeCount;node>=1;node--) { + fst.nodeCount++; + final int address = writer.posWrite; + //System.out.println(" node: " + node + " address=" + address); + if (address != newNodeAddress[node]) { + addressError = address - newNodeAddress[node]; + //System.out.println(" change: " + (address - newNodeAddress[node])); + changed = true; + newNodeAddress[node] = address; + changedCount++; + } + + int nodeArcCount = 0; + int bytesPerArc = 0; + + boolean retry = false; + + // for assert: + boolean anyNegDelta = false; + + // Retry loop: possibly iterate more than once, if + // this is an array'd node and bytesPerArc changes: + writeNode: + while(true) { // retry writing this node + + readFirstRealTargetArc(node, arc, r); + + final boolean useArcArray = arc.bytesPerArc != 0; + if (useArcArray) { + // Write false first arc: + if (bytesPerArc == 0) { + bytesPerArc = arc.bytesPerArc; + } + writer.writeByte(ARCS_AS_FIXED_ARRAY); + writer.writeVInt(arc.numArcs); + writer.writeVInt(bytesPerArc); + } + + int maxBytesPerArc = 0; + + while(true) { // iterate over all arcs for this node + + //System.out.println(" arc label=" + arc.label + " target=" + arc.target + " pos=" + writer.posWrite); + final int arcStartPos = writer.posWrite; + nodeArcCount++; + + byte flags = 0; + + if (arc.isLast()) { + flags += BIT_LAST_ARC; + } + /* + if (!useArcArray && nodeUpto < nodes.length-1 
&& arc.target == nodes[nodeUpto+1]) { + flags += BIT_TARGET_NEXT; + } + */ + if (!useArcArray && node != 1 && arc.target == node-1) { + flags += BIT_TARGET_NEXT; + if (!retry) { + nextCount++; + } + } + if (arc.isFinal()) { + flags += BIT_FINAL_ARC; + if (arc.nextFinalOutput != NO_OUTPUT) { + flags += BIT_ARC_HAS_FINAL_OUTPUT; + } + } else { + assert arc.nextFinalOutput == NO_OUTPUT; + } + if (!targetHasArcs(arc)) { + flags += BIT_STOP_NODE; + } + + if (arc.output != NO_OUTPUT) { + flags += BIT_ARC_HAS_OUTPUT; + } + + final Integer ptr; + final int absPtr; + final boolean doWriteTarget = targetHasArcs(arc) && (flags & BIT_TARGET_NEXT) == 0; + if (doWriteTarget) { + + ptr = topNodeMap.get(arc.target); + if (ptr != null) { + absPtr = ptr; + } else { + absPtr = topNodeMap.size() + newNodeAddress[arc.target] + addressError; + } + + int delta = newNodeAddress[arc.target] + addressError - writer.posWrite - 2; + if (delta < 0) { + //System.out.println("neg: " + delta); + anyNegDelta = true; + delta = 0; + } + + if (delta < absPtr) { + flags |= BIT_TARGET_DELTA; + } + } else { + ptr = null; + absPtr = 0; + } + + writer.writeByte(flags); + fst.writeLabel(arc.label); + + if (arc.output != NO_OUTPUT) { + outputs.write(arc.output, writer); + if (!retry) { + fst.arcWithOutputCount++; + } + } + if (arc.nextFinalOutput != NO_OUTPUT) { + outputs.write(arc.nextFinalOutput, writer); + } + + if (doWriteTarget) { + + int delta = newNodeAddress[arc.target] + addressError - writer.posWrite; + if (delta < 0) { + anyNegDelta = true; + //System.out.println("neg: " + delta); + delta = 0; + } + + if (flag(flags, BIT_TARGET_DELTA)) { + //System.out.println(" delta"); + writer.writeVInt(delta); + if (!retry) { + deltaCount++; + } + } else { + /* + if (ptr != null) { + System.out.println(" deref"); + } else { + System.out.println(" abs"); + } + */ + writer.writeVInt(absPtr); + if (!retry) { + if (absPtr >= topNodeMap.size()) { + absCount++; + } else { + topCount++; + } + } + } + } + + if 
(useArcArray) { + final int arcBytes = writer.posWrite - arcStartPos; + //System.out.println(" " + arcBytes + " bytes"); + maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes); + // NOTE: this may in fact go "backwards", if + // somehow (rarely, possibly never) we use + // more bytesPerArc in this rewrite than the + // incoming FST did... but in this case we + // will retry (below) so it's OK to overwrite + // bytes: + writer.setPosWrite(arcStartPos + bytesPerArc); + } + + if (arc.isLast()) { + break; + } + + readNextRealArc(arc, r); + } + + if (useArcArray) { + if (maxBytesPerArc == bytesPerArc || (retry && maxBytesPerArc <= bytesPerArc)) { + // converged + break; + } + } else { + break; + } + + //System.out.println(" retry this node maxBytesPerArc=" + maxBytesPerArc + " vs " + bytesPerArc); + + // Retry: + bytesPerArc = maxBytesPerArc; + writer.posWrite = address; + nodeArcCount = 0; + retry = true; + anyNegDelta = false; + } + negDelta |= anyNegDelta; + + fst.arcCount += nodeArcCount; + } + + if (!changed) { + // We don't renumber the nodes (just reverse their + // order) so nodes should only point forward to + // other nodes because we only produce acyclic FSTs + // w/ nodes only pointing "forwards": + assert !negDelta; + // Converged! 
+ break; + } + //System.out.println(" " + changedCount + " of " + fst.nodeCount + " changed; retry"); + } + + for(Map.Entry ent : topNodeMap.entrySet()) { + nodeRefToAddressIn[ent.getValue()] = newNodeAddress[ent.getKey()]; + } + + fst.startNode = newNodeAddress[startNode]; + //System.out.println("new startNode=" + startNode); + + if (emptyOutput != null) { + fst.setEmptyOutput(emptyOutput); + } + + assert fst.nodeCount == nodeCount: "fst.nodeCount=" + fst.nodeCount + " nodeCount=" + nodeCount; + assert fst.arcCount == arcCount; + assert fst.arcWithOutputCount == arcWithOutputCount: "fst.arcWithOutputCount=" + fst.arcWithOutputCount + " arcWithOutputCount=" + arcWithOutputCount; + + final byte[] finalBytes = new byte[writer.posWrite]; + //System.out.println("resize " + fst.bytes.length + " down to " + writer.posWrite); + System.arraycopy(fst.bytes, 0, finalBytes, 0, writer.posWrite); + fst.bytes = finalBytes; + fst.cacheRootArcs(); + + //final int size = fst.sizeInBytes(); + //System.out.println("nextCount=" + nextCount + " topCount=" + topCount + " deltaCount=" + deltaCount + " absCount=" + absCount); + + return fst; + } + + private static class NodeAndInCount implements Comparable { + final int node; + final int count; + + public NodeAndInCount(int node, int count) { + this.node = node; + this.count = count; + } + + @Override + public int compareTo(NodeAndInCount other) { + if (count > other.count) { + return 1; + } else if (count < other.count) { + return -1; + } else { + // Tie-break: smaller node compares as greater than + return other.node - node; + } + } + } + + private static class NodeQueue extends PriorityQueue { + public NodeQueue(int topN) { + super(topN, false); + } + + @Override + public boolean lessThan(NodeAndInCount a, NodeAndInCount b) { + final int cmp = a.compareTo(b); + assert cmp != 0; + return cmp < 0; + } + } } Index: lucene/src/java/org/apache/lucene/util/FixedBitSet.java =================================================================== 
--- lucene/src/java/org/apache/lucene/util/FixedBitSet.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/util/FixedBitSet.java (working copy) @@ -95,7 +95,7 @@ } public boolean get(int index) { - assert index >= 0 && index < numBits; + assert index >= 0 && index < numBits: "index=" + index; int i = index >> 6; // div 64 // signed shift will keep a negative index and force an // array-index-out-of-bounds-exception, removing the need for an explicit check. Index: lucene/src/java/org/apache/lucene/util/UnicodeUtil.java =================================================================== --- lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (revision 1237044) +++ lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (working copy) @@ -588,7 +588,7 @@ out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f)); offset += 2; } else { - assert b < 0xf8; + assert b < 0xf8: "b=" + b; int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f); offset += 3; if (ch < UNI_MAX_BMP) { Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1237044) +++ lucene/CHANGES.txt (working copy) @@ -802,6 +802,9 @@ * LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML markup. (Steve Rowe) + +* LUCENE-3725: Added optional packing to FST building; this uses extra + RAM during building but results in a smaller FST. (Mike McCandless) Bug fixes