Index: lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java =================================================================== --- lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java (revision 1441164) +++ lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java (working copy) @@ -20,10 +20,16 @@ import java.util.Arrays; import java.util.Random; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TimeUnits; +import org.apache.lucene.util._TestUtil; import org.apache.lucene.util.packed.PackedInts; import org.junit.Ignore; import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; @@ -39,6 +45,8 @@ IntsRef input = new IntsRef(ints, 0, ints.length); long seed = random().nextLong(); + Directory dir = new MMapDirectory(_TestUtil.getTempDir("2BFST")); + for(int doPackIter=0;doPackIter<2;doPackIter++) { boolean doPack = doPackIter == 1; @@ -72,42 +80,56 @@ FST fst = b.finish(); - System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + for(int verify=0;verify<2;verify++) { + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); - Arrays.fill(ints2, 0); - r = new Random(seed); + Arrays.fill(ints2, 0); + r = new Random(seed); - for(int i=0;i fstEnum = new IntsRefFSTEnum(fst); + System.out.println("\nTEST: enum all input/outputs"); + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); - Arrays.fill(ints2, 0); - r = new Random(seed); - int upto = 0; - while(true) { - IntsRefFSTEnum.InputOutput pair = fstEnum.next(); - if (pair == null) { - break; + Arrays.fill(ints2, 0); + r = new Random(seed); + int upto = 0; + while(true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + for(int j=10;j(in, outputs); + in.close(); + } else { + dir.deleteFile("fst"); } - assertEquals(input2, pair.input); - assertEquals(NO_OUTPUT, pair.output); - upto++; - nextInput(r, ints2); } - assertEquals(count, upto); } // Build FST w/ ByteSequenceOutputs and stop when FST @@ -138,39 +160,53 @@ } FST fst = b.finish(); + for(int verify=0;verify<2;verify++) { - System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); - r = new Random(seed); - Arrays.fill(ints, 0); + r = new Random(seed); + Arrays.fill(ints, 0); - for(int i=0;i fstEnum = new IntsRefFSTEnum(fst); + System.out.println("\nTEST: enum all input/outputs"); + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); - Arrays.fill(ints, 0); - r = new Random(seed); - int upto = 0; - while(true) { - IntsRefFSTEnum.InputOutput pair = fstEnum.next(); - if (pair == null) { - break; + Arrays.fill(ints, 0); + r = new Random(seed); + int upto = 0; + while(true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + assertEquals(input, pair.input); + r.nextBytes(outputBytes); + assertEquals(output, pair.output); + upto++; + nextInput(r, ints); } - assertEquals(input, pair.input); - r.nextBytes(outputBytes); - assertEquals(output, pair.output); - upto++; - nextInput(r, ints); + assertEquals(count, upto); + + if (verify == 0) { + System.out.println("\nTEST: save/load FST and re-verify"); + IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT); + fst.save(out); + out.close(); + IndexInput in = dir.openInput("fst", IOContext.DEFAULT); + fst = new FST(in, outputs); + in.close(); + } else { + dir.deleteFile("fst"); + } } - assertEquals(count, upto); } // Build FST w/ PositiveIntOutputs and stop when FST @@ -202,46 +238,62 @@ FST fst = b.finish(); - System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + for(int verify=0;verify<2;verify++) { - Arrays.fill(ints, 0); + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); - output = 1; - r = new Random(seed); - for(int i=0;i fstEnum = new IntsRefFSTEnum(fst); - System.out.println("\nTEST: enum all input/outputs"); - IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); + Arrays.fill(ints, 0); + r = new Random(seed); + int upto = 0; + output = 1; + while(true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + assertEquals(input, pair.input); + assertEquals(output, pair.output.longValue()); + output += 1 + r.nextInt(10); + upto++; + nextInput(r, ints); + } + assertEquals(count, upto); - Arrays.fill(ints, 0); - r = new Random(seed); - int upto = 0; - output = 1; - while(true) { - IntsRefFSTEnum.InputOutput pair = fstEnum.next(); - if (pair == null) { - break; + if (verify == 0) { + System.out.println("\nTEST: save/load FST and re-verify"); + IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT); + fst.save(out); + out.close(); + IndexInput in = dir.openInput("fst", IOContext.DEFAULT); + fst = new FST(in, outputs); + in.close(); + } else { + dir.deleteFile("fst"); } - assertEquals(input, pair.input); - assertEquals(output, pair.output.longValue()); - output += 1 + r.nextInt(10); - upto++; - nextInput(r, ints); } - assertEquals(count, upto); } } + dir.close(); } private void nextInput(Random r, int[] ints) { Index: lucene/core/src/java/org/apache/lucene/util/fst/FST.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/FST.java (revision 1441164) +++ lucene/core/src/java/org/apache/lucene/util/fst/FST.java (working copy) @@ -27,11 +27,6 @@ import java.io.OutputStream; import java.util.HashMap; import java.util.Map; -/* -import java.io.Writer; -import java.io.OutputStreamWriter; -import java.io.FileOutputStream; -*/ import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.store.ByteArrayDataOutput; @@ -41,12 +36,15 @@ import org.apache.lucene.store.OutputStreamDataOutput; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Constants; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.fst.Builder.UnCompiledNode; import org.apache.lucene.util.packed.GrowableWriter; import org.apache.lucene.util.packed.PackedInts; +//import java.io.Writer; +//import java.io.OutputStreamWriter; // TODO: break this into WritableFST and ReadOnlyFST.. then // we can have subclasses of ReadOnlyFST to handle the @@ -276,7 +274,6 @@ this.outputs = outputs; this.allowArrayArcs = allowArrayArcs; version = VERSION_CURRENT; - // 32 KB blocks: bytes = new BytesStore(bytesPageBits); // pad: ensure no node gets address 0 which is reserved to mean // the stop state w/ no arcs @@ -295,9 +292,20 @@ nodeRefToAddress = null; } + public static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28; + /** Load a previously saved FST. */ public FST(DataInput in, Outputs outputs) throws IOException { + this(in, outputs, DEFAULT_MAX_BLOCK_BITS); + } + + public FST(DataInput in, Outputs outputs, int maxBlockBits) throws IOException { this.outputs = outputs; + + if (maxBlockBits < 1 || maxBlockBits > 30) { + throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits); + } + // NOTE: only reads most recent format; we don't have // back-compat promise for FSTs (they are experimental): version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET); @@ -345,13 +353,13 @@ } else { nodeRefToAddress = null; } - startNode = in.readVInt(); + startNode = in.readVLong(); nodeCount = in.readVLong(); arcCount = in.readVLong(); arcWithOutputCount = in.readVLong(); - int numBytes = in.readVInt(); - bytes = new BytesStore(in, numBytes, Integer.MAX_VALUE); + long numBytes = in.readVLong(); + bytes = new BytesStore(in, numBytes, 1< 0) { - final int chunk = Math.min(blockSize, left); + final int chunk = (int) Math.min(blockSize, left); byte[] block = new byte[chunk]; in.readBytes(block, 0, block.length); blocks.add(block); Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1441164) +++ lucene/CHANGES.txt (working copy) @@ -123,6 +123,9 @@ * LUCENE-4732: Fixed TermsEnum.seekCeil/seekExact on term vectors. (Adrien Grand, Robert Muir) +* LUCENE-4739: Fixed bugs that prevented FSTs more than ~1.1GB from + being saved and loaded (Adrien Grand, Mike McCandless) + ======================= Lucene 4.1.0 ======================= Changes in backwards compatibility policy