Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 1439839) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (working copy) @@ -386,7 +386,9 @@ String prefix = getClass().getSimpleName(); File directory = Sort.defaultTempDir(); File tempInput = File.createTempFile(prefix, ".input", directory); - File tempSorted = File.createTempFile(prefix, ".sorted", directory); + // nocommit + //File tempSorted = File.createTempFile(prefix, ".sorted", directory); + File tempSorted = new File("/tmp/AnalyzingSuggester4987089453108809836.sorted"); Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); Sort.ByteSequencesReader reader = null; @@ -398,6 +400,7 @@ boolean success = false; byte buffer[] = new byte[8]; try { + /* ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); BytesRef surfaceForm; @@ -444,6 +447,8 @@ // Free disk space: tempInput.delete(); + */ + System.out.println("tempSorted: " + tempSorted); reader = new Sort.ByteSequencesReader(tempSorted); @@ -464,6 +469,7 @@ Set seenSurfaceForms = new HashSet(); int dedup = 0; + int count = 0; while (reader.read(scratch)) { input.reset(scratch.bytes, scratch.offset, scratch.length); short analyzedLength = input.readShort(); @@ -512,7 +518,10 @@ analyzed.length += 2; Util.toIntsRef(analyzed, scratchInts); - //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString()); + count++; + if (count % 100000 == 0) { + System.out.println(count + ": ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString()); + } builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface))); } fst = builder.finish(); @@ -526,9 +535,10 @@ } else { IOUtils.closeWhileHandlingException(reader, writer); } - - tempInput.delete(); - tempSorted.delete(); + + // nocommit + //tempInput.delete(); + //tempSorted.delete(); } } Index: lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java =================================================================== --- lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (revision 1439839) +++ lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (working copy) @@ -518,11 +518,11 @@ return; } - /* + // nocommit IndexInput in = dir.openInput("fst.bin", IOContext.DEFAULT); fst = new FST(in, outputs); in.close(); - */ + System.out.println("DO OPEN"); System.out.println("\nNow verify..."); Index: lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java =================================================================== --- lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java (revision 1439839) +++ lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java (working copy) @@ -20,15 +20,22 @@ import java.util.Arrays; import java.util.Random; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TimeUnits; +import org.apache.lucene.util._TestUtil; import org.apache.lucene.util.packed.PackedInts; import org.junit.Ignore; import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; -@Ignore("Requires tons of heap to run (10G works)") +// nocommit +//@Ignore("Requires tons of heap to run (10G works)") @TimeoutSuite(millis = 100 * TimeUnits.HOUR) public class Test2BFST extends LuceneTestCase { @@ -39,6 +46,8 @@ IntsRef input = new IntsRef(ints, 0, ints.length); long seed = random().nextLong(); + Directory dir = new MMapDirectory(_TestUtil.getTempDir("2BFST")); + for(int doPackIter=0;doPackIter<2;doPackIter++) { boolean doPack = doPackIter == 1; @@ -72,42 +81,56 @@ FST fst = b.finish(); - System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + for(int verify=0;verify<2;verify++) { + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); - Arrays.fill(ints2, 0); - r = new Random(seed); + Arrays.fill(ints2, 0); + r = new Random(seed); - for(int i=0;i fstEnum = new IntsRefFSTEnum(fst); + System.out.println("\nTEST: enum all input/outputs"); + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); - Arrays.fill(ints2, 0); - r = new Random(seed); - int upto = 0; - while(true) { - IntsRefFSTEnum.InputOutput pair = fstEnum.next(); - if (pair == null) { - break; + Arrays.fill(ints2, 0); + r = new Random(seed); + int upto = 0; + while(true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + for(int j=10;j(in, outputs); + in.close(); + } else { + dir.deleteFile("fst"); } - assertEquals(input2, pair.input); - assertEquals(NO_OUTPUT, pair.output); - upto++; - nextInput(r, ints2); } - assertEquals(count, upto); } // Build FST w/ ByteSequenceOutputs and stop when FST @@ -138,39 +161,53 @@ } FST fst = b.finish(); + for(int verify=0;verify<2;verify++) { - System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); - r = new Random(seed); - Arrays.fill(ints, 0); + r = new Random(seed); + Arrays.fill(ints, 0); - for(int i=0;i fstEnum = new IntsRefFSTEnum(fst); + System.out.println("\nTEST: enum all input/outputs"); + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); - Arrays.fill(ints, 0); - r = new Random(seed); - int upto = 0; - while(true) { - IntsRefFSTEnum.InputOutput pair = fstEnum.next(); - if (pair == null) { - break; + Arrays.fill(ints, 0); + r = new Random(seed); + int upto = 0; + while(true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + assertEquals(input, pair.input); + r.nextBytes(outputBytes); + assertEquals(output, pair.output); + upto++; + nextInput(r, ints); } - assertEquals(input, pair.input); - r.nextBytes(outputBytes); - assertEquals(output, pair.output); - upto++; - nextInput(r, ints); + assertEquals(count, upto); + + if (verify == 0) { + System.out.println("\nTEST: save/load FST and re-verify"); + IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT); + fst.save(out); + out.close(); + IndexInput in = dir.openInput("fst", IOContext.DEFAULT); + fst = new FST(in, outputs); + in.close(); + } else { + dir.deleteFile("fst"); + } } - assertEquals(count, upto); } // Build FST w/ PositiveIntOutputs and stop when FST @@ -202,46 +239,62 @@ FST fst = b.finish(); - System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + for(int verify=0;verify<2;verify++) { - Arrays.fill(ints, 0); + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); - output = 1; - r = new Random(seed); - for(int i=0;i fstEnum = new IntsRefFSTEnum(fst); - System.out.println("\nTEST: enum all input/outputs"); - IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); + Arrays.fill(ints, 0); + r = new Random(seed); + int upto = 0; + output = 1; + while(true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + assertEquals(input, pair.input); + assertEquals(output, pair.output.longValue()); + output += 1 + r.nextInt(10); + upto++; + nextInput(r, ints); + } + assertEquals(count, upto); - Arrays.fill(ints, 0); - r = new Random(seed); - int upto = 0; - output = 1; - while(true) { - IntsRefFSTEnum.InputOutput pair = fstEnum.next(); - if (pair == null) { - break; + if (verify == 0) { + System.out.println("\nTEST: save/load FST and re-verify"); + IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT); + fst.save(out); + out.close(); + IndexInput in = dir.openInput("fst", IOContext.DEFAULT); + fst = new FST(in, outputs); + in.close(); + } else { + dir.deleteFile("fst"); } - assertEquals(input, pair.input); - assertEquals(output, pair.output.longValue()); - output += 1 + r.nextInt(10); - upto++; - nextInput(r, ints); } - assertEquals(count, upto); } } + dir.close(); } private void nextInput(Random r, int[] ints) { Index: lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (revision 1439839) +++ lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (working copy) @@ -25,7 +25,10 @@ // Used to dedup states (lookup already-frozen states) final class NodeHash { - private GrowableWriter table; + // nocommit back to GrowableWriter, but there's a packed + // ints bug lurking... + //private GrowableWriter table; + private long[] table; private int count; private int mask; private final FST fst; @@ -33,7 +36,8 @@ private final FST.BytesReader in; public NodeHash(FST fst, FST.BytesReader in) { - table = new GrowableWriter(8, 16, PackedInts.COMPACT); + //table = new GrowableWriter(8, 16, PackedInts.COMPACT); + table = new long[16]; mask = 15; this.fst = fst; this.in = in; @@ -120,15 +124,20 @@ int pos = h & mask; int c = 0; while(true) { - final long v = table.get(pos); + //final long v = table.get(pos); + final long v = table[pos]; + //System.out.println("get pos=" + pos + " v=" + v); if (v == 0) { // freeze & add final long node = fst.addNode(nodeIn); //System.out.println(" now freeze node=" + node); assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h; count++; - table.set(pos, node); - if (table.size() < 2*count) { + //System.out.println("set pos=" + pos + " node=" + node); + //table.set(pos, node); + table[pos] = node; + //if (table.size() < 2*count) { + if (table.length < 2*count) { rehash(); } return node; @@ -147,8 +156,10 @@ int pos = hash(address) & mask; int c = 0; while(true) { - if (table.get(pos) == 0) { - table.set(pos, address); + //if (table.get(pos) == 0) { + if (table[pos] == 0) { + //table.set(pos, address); + table[pos] = address; break; } @@ -158,16 +169,22 @@ } private void rehash() throws IOException { - final GrowableWriter oldTable = table; + //final GrowableWriter oldTable = table; + final long[] oldTable = table; - if (oldTable.size() >= Integer.MAX_VALUE/2) { + //if (oldTable.size() >= Integer.MAX_VALUE/2) { + if (oldTable.length >= Integer.MAX_VALUE/2) { throw new IllegalStateException("FST too large (> 2.1 GB)"); } - table = new GrowableWriter(oldTable.getBitsPerValue(), 2*oldTable.size(), PackedInts.COMPACT); - mask = table.size()-1; - for(int idx=0;idx 0) { - final int chunk = Math.min(blockSize, left); + final int chunk = (int) Math.min(blockSize, left); byte[] block = new byte[chunk]; in.readBytes(block, 0, block.length); blocks.add(block); Index: lucene/core/src/java/org/apache/lucene/util/fst/FST.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/FST.java (revision 1439839) +++ lucene/core/src/java/org/apache/lucene/util/fst/FST.java (working copy) @@ -276,7 +276,6 @@ this.outputs = outputs; this.allowArrayArcs = allowArrayArcs; version = VERSION_CURRENT; - // 32 KB blocks: bytes = new BytesStore(bytesPageBits); // pad: ensure no node gets address 0 which is reserved to mean // the stop state w/ no arcs @@ -350,7 +349,7 @@ arcCount = in.readVLong(); arcWithOutputCount = in.readVLong(); - int numBytes = in.readVInt(); + long numBytes = in.readVLong(); bytes = new BytesStore(in, numBytes, Integer.MAX_VALUE); NO_OUTPUT = outputs.getNoOutput(); @@ -671,6 +670,11 @@ assert target.node > 0; //System.out.println(" write target"); bytes.writeVLong(target.node); + + // nocommit + if (target.node > startAddress) { + throw new RuntimeException("target.node=" + target.node + " startAddress=" + startAddress); + } } // just write the arcs "like normal" on first pass,