Index: lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java =================================================================== --- lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java (revision 1432529) +++ lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java (working copy) @@ -289,10 +289,10 @@ if (reversed) { expectedPos = pos-numBytes; - left = r.getPosition(); + left = (int) r.getPosition(); } else { expectedPos = pos+numBytes; - left = totalLength - r.getPosition(); + left = (int) (totalLength - r.getPosition()); } assertEquals(expectedPos, r.getPosition()); Index: lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java =================================================================== --- lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java (revision 0) +++ lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java (working copy) @@ -0,0 +1,153 @@ +package org.apache.lucene.util.fst; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Arrays; +import java.util.Random; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.packed.PackedInts; +import org.junit.Ignore; + +// nocommit need Test2BNodes? + +// nocommit +//@Ignore("Requires tons of heap to run") +public class Test2BFST extends LuceneTestCase { + + public void test() throws Exception { + int[] ints = new int[5]; + IntsRef input = new IntsRef(ints, 0, 5); + long seed = random().nextLong(); + + for(int doPackIter=0;doPackIter<2;doPackIter++) { + boolean doPack = doPackIter == 1; + + // nocommit + if (false) { + System.out.println("TEST: doPack=" + doPack + " outputs=bytes"); + Outputs outputs = ByteSequenceOutputs.getSingleton(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, + null, doPack, PackedInts.COMPACT, true); + + byte[] outputBytes = new byte[20]; + BytesRef output = new BytesRef(outputBytes); + int count = 0; + Random r = new Random(seed); + while(true) { + r.nextBytes(outputBytes); + //System.out.println("add: " + input + " -> " + output); + // nocommit: why, if i fail to make deep copy of the + // output, does FST not grow!? + b.add(input, BytesRef.deepCopyOf(output)); + count++; + if (count % 1000000 == 0) { + System.out.println(count + "...: " + b.fstSizeInBytes() + " bytes"); + } + if (b.fstSizeInBytes() > 3L*1024*1024*1024) { + break; + } + nextInput(ints); + } + + FST fst = b.finish(); + + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + + r = new Random(seed); + Arrays.fill(ints, 0); + + FST.BytesReader fstReader = fst.getBytesReader(); + + for(int i=0;i outputs = PositiveIntOutputs.getSingleton(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, + null, doPack, PackedInts.COMPACT, true); + + long output = 1; + + Arrays.fill(ints, 0); + int count = 0; + Random r = new Random(seed); + while(true) { + //System.out.println("add: " + input + " -> " + output); + // nocommit: why, if i fail to make deep copy of the + // output, does FST not grow!? + b.add(input, output); + output += 1+r.nextInt(10); + count++; + if (count % 1000000 == 0) { + System.out.println(count + "...: " + b.fstSizeInBytes() + " bytes"); + } + if (b.fstSizeInBytes() > 3L*1024*1024*1024) { + break; + } + nextInput(ints); + } + + FST fst = b.finish(); + + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + + Arrays.fill(ints, 0); + + FST.BytesReader fstReader = fst.getBytesReader(); + output = 1; + r = new Random(seed); + for(int i=0;i= 0) { + if (ints[downTo] < 255) { + ints[downTo]++; + break; + } else { + ints[downTo] = 0; + downTo--; + } + } + } +} Property changes on: lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java (revision 1432529) +++ lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java (working copy) @@ -282,7 +282,7 @@ public int indexArcCount; /** Byte size of the index. */ - public int indexNumBytes; + public long indexNumBytes; /** Total number of terms in the field. */ public long totalTermCount; Index: lucene/core/src/java/org/apache/lucene/util/fst/ReverseBytesReader.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/ReverseBytesReader.java (revision 1432529) +++ lucene/core/src/java/org/apache/lucene/util/fst/ReverseBytesReader.java (working copy) @@ -44,13 +44,13 @@ } @Override - public int getPosition() { + public long getPosition() { return pos; } @Override - public void setPosition(int pos) { - this.pos = pos; + public void setPosition(long pos) { + this.pos = (int) pos; } @Override Index: lucene/core/src/java/org/apache/lucene/util/fst/Util.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/Util.java (revision 1432529) +++ lucene/core/src/java/org/apache/lucene/util/fst/Util.java (working copy) @@ -544,7 +544,9 @@ * * *

- * Note: larger FSTs (a few thousand nodes) won't even render, don't bother. + * Note: larger FSTs (a few thousand nodes) won't even + * render, don't bother. If the FST is > 2.1 GB in size + * then this method will throw strange exceptions. * * @param sameRank * If true, the resulting dot file will try @@ -578,7 +580,7 @@ // A bitset of already seen states (target offset). final BitSet seen = new BitSet(); - seen.set(startArc.target); + seen.set((int) startArc.target); // Shape for states. final String stateShape = "circle"; @@ -617,7 +619,7 @@ finalOutput = null; } - emitDotState(out, Integer.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput)); + emitDotState(out, Long.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput)); } out.write(" initial -> " + startArc.target + "\n"); @@ -638,8 +640,9 @@ if (FST.targetHasArcs(arc)) { // scan all target arcs //System.out.println(" readFirstTarget..."); - final int node = arc.target; + final long node = arc.target; + fst.readFirstRealTargetArc(arc.target, arc, r); //System.out.println(" firstTarget: " + arc); @@ -648,7 +651,7 @@ //System.out.println(" cycle arc=" + arc); // Emit the unseen state and add it to the queue for the next level. - if (arc.target >= 0 && !seen.get(arc.target)) { + if (arc.target >= 0 && !seen.get((int) arc.target)) { /* boolean isFinal = false; @@ -675,12 +678,12 @@ finalOutput = ""; } - emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, finalOutput); + emitDotState(out, Long.toString(arc.target), stateShape, stateColor, finalOutput); // To see the node address, use this instead: //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target)); - seen.set(arc.target); + seen.set((int) arc.target); nextLevelQueue.add(new FST.Arc().copyFrom(arc)); - sameLevelStates.add(arc.target); + sameLevelStates.add((int) arc.target); } String outs; Index: lucene/core/src/java/org/apache/lucene/util/fst/Builder.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/Builder.java (revision 1432529) +++ lucene/core/src/java/org/apache/lucene/util/fst/Builder.java (working copy) @@ -42,6 +42,9 @@ * @lucene.experimental */ +// nocommit explain the limits, packed (2.1B nodes) and +// unpacked + public class Builder { private final NodeHash dedupHash; private final FST fst; @@ -187,7 +190,7 @@ } private CompiledNode compileNode(UnCompiledNode nodeIn, int tailLength) throws IOException { - final int node; + final long node; if (dedupHash != null && (doShareNonSingletonNodes || nodeIn.numArcs <= 1) && tailLength <= shareMaxTailLength) { if (nodeIn.numArcs == 0) { node = fst.addNode(nodeIn); @@ -513,8 +516,12 @@ boolean isCompiled(); } + public long fstSizeInBytes() { + return fst.sizeInBytes(); + } + static final class CompiledNode implements Node { - int node; + long node; @Override public boolean isCompiled() { return true; Index: lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (revision 1432529) +++ lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (working copy) @@ -22,7 +22,8 @@ // Used to dedup states (lookup already-frozen states) final class NodeHash { - private int[] table; + // nocommit can we somehow "switch" to long...? + private long[] table; private int count; private int mask; private final FST fst; @@ -30,13 +31,13 @@ private final FST.BytesReader in; public NodeHash(FST fst, FST.BytesReader in) { - table = new int[16]; + table = new long[16]; mask = 15; this.fst = fst; this.in = in; } - private boolean nodesEqual(Builder.UnCompiledNode node, int address) throws IOException { + private boolean nodesEqual(Builder.UnCompiledNode node, long address) throws IOException { fst.readFirstRealTargetArc(address, scratchArc, in); if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) { return false; @@ -75,7 +76,8 @@ final Builder.Arc arc = node.arcs[arcIdx]; //System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal); h = PRIME * h + arc.label; - h = PRIME * h + ((Builder.CompiledNode) arc.target).node; + long n = ((Builder.CompiledNode) arc.target).node; + h = PRIME * h + (int) (n^(n>>32)); h = PRIME * h + arc.output.hashCode(); h = PRIME * h + arc.nextFinalOutput.hashCode(); if (arc.isFinal) { @@ -87,7 +89,7 @@ } // hash code for a frozen node - private int hash(int node) throws IOException { + private int hash(long node) throws IOException { final int PRIME = 31; //System.out.println("hash frozen node=" + node); int h = 0; @@ -95,7 +97,7 @@ while(true) { //System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal() + " pos=" + in.getPosition()); h = PRIME * h + scratchArc.label; - h = PRIME * h + scratchArc.target; + h = PRIME * h + (int) (scratchArc.target^(scratchArc.target>>32)); h = PRIME * h + scratchArc.output.hashCode(); h = PRIME * h + scratchArc.nextFinalOutput.hashCode(); if (scratchArc.isFinal()) { @@ -110,16 +112,16 @@ return h & Integer.MAX_VALUE; } - public int add(Builder.UnCompiledNode nodeIn) throws IOException { + public long add(Builder.UnCompiledNode nodeIn) throws IOException { // System.out.println("hash: add count=" + count + " vs " + table.length); final int h = hash(nodeIn); int pos = h & mask; int c = 0; while(true) { - final int v = table[pos]; + final long v = table[pos]; if (v == 0) { // freeze & add - final int node = fst.addNode(nodeIn); + final long node = fst.addNode(nodeIn); //System.out.println(" now freeze node=" + node); assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h; count++; @@ -139,7 +141,7 @@ } // called only by rehash - private void addNew(int address) throws IOException { + private void addNew(long address) throws IOException { int pos = hash(address) & mask; int c = 0; while(true) { @@ -154,16 +156,16 @@ } private void rehash() throws IOException { - final int[] oldTable = table; + final long[] oldTable = table; if (oldTable.length >= Integer.MAX_VALUE/2) { throw new IllegalStateException("FST too large (> 2.1 GB)"); } - table = new int[2*table.length]; + table = new long[2*table.length]; mask = table.length-1; for(int idx=0;idx See the {@link org.apache.lucene.util.fst package * documentation} for some simple examples. - *

NOTE: the FST cannot be larger than ~2.1 GB - * because it uses int to address the byte[]. * * @lucene.experimental */ @@ -138,11 +136,11 @@ // Never serialized; just used to represent the virtual // final node w/ no arcs: - private final static int FINAL_END_NODE = -1; + private final static long FINAL_END_NODE = -1; // Never serialized; just used to represent the virtual // non-final node w/ no arcs: - private final static int NON_FINAL_END_NODE = 0; + private final static long NON_FINAL_END_NODE = 0; // if non-null, this FST accepts the empty string and // produces this output @@ -150,7 +148,7 @@ final BytesStore bytes; - private int startNode = -1; + private long startNode = -1; public final Outputs outputs; @@ -158,10 +156,11 @@ // instead of storing the address of the target node for // a given arc, we mark a single bit noting that the next // node in the byte[] is the target node): - private int lastFrozenNode; + private long lastFrozenNode; private final T NO_OUTPUT; + // nocommit long? public int nodeCount; public int arcCount; public int arcWithOutputCount; @@ -183,19 +182,19 @@ // From node (ord or address); currently only used when // building an FST w/ willPackFST=true: - int node; + long node; /** To node (ord or address) */ - public int target; + public long target; byte flags; public T nextFinalOutput; // address (into the byte[]), or ord/address if label == END_LABEL - int nextArc; + long nextArc; // This is non-zero if current arcs are fixed array: - int posArcsStart; + long posArcsStart; int bytesPerArc; int arcIdx; int numArcs; @@ -279,6 +278,9 @@ this.allowArrayArcs = allowArrayArcs; version = VERSION_CURRENT; // 32 KB blocks: + // nocommit make this page size controllable from + // builder? else ... we can overflow page index while + // building for really really immense FSTs ... bytes = new BytesStore(15); // pad: ensure no node gets address 0 which is reserved to mean // the stop state w/ no arcs @@ -379,8 +381,8 @@ } /** Returns bytes used to represent the FST */ - public int sizeInBytes() { - int size = bytes.getPosition(); + public long sizeInBytes() { + long size = bytes.getPosition(); if (packed) { size += nodeRefToAddress.ramBytesUsed(); } else if (nodeAddress != null) { @@ -390,23 +392,26 @@ return size; } - void finish(int startNode) throws IOException { + void finish(long startNode) throws IOException { + if (this.startNode != -1) { + throw new IllegalStateException("already finished"); + } if (startNode == FINAL_END_NODE && emptyOutput != null) { startNode = 0; } - if (this.startNode != -1) { - throw new IllegalStateException("already finished"); - } this.startNode = startNode; bytes.finish(); cacheRootArcs(); } - private int getNodeAddress(int node) { + private long getNodeAddress(long node) { if (nodeAddress != null) { // Deref - return (int) nodeAddress.get(node); + // nocommit need check somewhere that node *count* + // does not overflow 2G when you have packing turned + // on... + return nodeAddress.get((int) node); } else { // Straight return node; @@ -506,12 +511,14 @@ if (packed) { ((PackedInts.Mutable) nodeRefToAddress).save(out); } - out.writeVInt(startNode); + // nocommit bump format? but if it's only vInt -> vLong + // then it's automatically back compat ... + out.writeVLong(startNode); out.writeVInt(nodeCount); out.writeVInt(arcCount); out.writeVInt(arcWithOutputCount); - int numBytes = bytes.getPosition(); - out.writeVInt(numBytes); + long numBytes = bytes.getPosition(); + out.writeVLong(numBytes); bytes.writeTo(out); } @@ -587,7 +594,8 @@ // serializes new node by appending its bytes to the end // of the current byte[] - int addNode(Builder.UnCompiledNode nodeIn) throws IOException { + long addNode(Builder.UnCompiledNode nodeIn) throws IOException { + //System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs); if (nodeIn.numArcs == 0) { if (nodeIn.isFinal) { @@ -597,10 +605,10 @@ } } - final int startAddress = bytes.getPosition(); + final long startAddress = bytes.getPosition(); //System.out.println(" startAddr=" + startAddress); - boolean doFixedArray = shouldExpand(nodeIn); + final boolean doFixedArray = shouldExpand(nodeIn); if (doFixedArray) { //System.out.println(" fixedArray"); if (bytesPerArc.length < nodeIn.numArcs) { @@ -612,7 +620,7 @@ final int lastArc = nodeIn.numArcs-1; - int lastArcStart = bytes.getPosition(); + long lastArcStart = bytes.getPosition(); int maxBytesPerArc = 0; for(int arcIdx=0;arcIdx arc = nodeIn.arcs[arcIdx]; @@ -645,7 +653,7 @@ if (!targetHasArcs) { flags += BIT_STOP_NODE; } else if (inCounts != null) { - inCounts.set(target.node, inCounts.get(target.node) + 1); + inCounts.set((int) target.node, inCounts.get((int) target.node) + 1); } if (arc.output != NO_OUTPUT) { @@ -671,14 +679,14 @@ if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { assert target.node > 0; //System.out.println(" write target"); - bytes.writeVInt(target.node); + bytes.writeVLong(target.node); } // just write the arcs "like normal" on first pass, // but record how many bytes each one took, and max // byte size: if (doFixedArray) { - bytesPerArc[arcIdx] = bytes.getPosition() - lastArcStart; + bytesPerArc[arcIdx] = (int) (bytes.getPosition() - lastArcStart); lastArcStart = bytes.getPosition(); maxBytesPerArc = Math.max(maxBytesPerArc, bytesPerArc[arcIdx]); //System.out.println(" bytes=" + bytesPerArc[arcIdx]); @@ -710,7 +718,6 @@ assert maxBytesPerArc > 0; // 2nd pass just "expands" all arcs to take up a fixed // byte size - assert ((long) startAddress+MAX_HEADER_SIZE) + ((long) nodeIn.numArcs) * maxBytesPerArc < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; //System.out.println("write int @pos=" + (fixedArrayStart-4) + " numArcs=" + nodeIn.numArcs); // create the header @@ -723,14 +730,14 @@ bad.writeVInt(maxBytesPerArc); int headerLen = bad.getPosition(); - final int fixedArrayStart = startAddress + headerLen; + final long fixedArrayStart = startAddress + headerLen; // expand the arcs in place, backwards - int srcPos = bytes.getPosition(); - int destPos = fixedArrayStart + nodeIn.numArcs*maxBytesPerArc; + long srcPos = bytes.getPosition(); + long destPos = fixedArrayStart + nodeIn.numArcs*maxBytesPerArc; assert destPos >= srcPos; if (destPos > srcPos) { - bytes.skip(destPos - srcPos); + bytes.skip((int) (destPos - srcPos)); for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) { destPos -= maxBytesPerArc; srcPos -= bytesPerArc[arcIdx]; @@ -747,13 +754,20 @@ bytes.writeBytes(startAddress, header, 0, headerLen); } - final int thisNodeAddress = bytes.getPosition()-1; + final long thisNodeAddress = bytes.getPosition()-1; bytes.reverse(startAddress, thisNodeAddress); + // PackedInts uses int as the index, so we cannot handle + // > 2.1B nodes when packing: + if (nodeAddress != null && nodeCount == Integer.MAX_VALUE) { + throw new IllegalStateException("cannot create a packed FST with more than 2.1 billion nodes"); + } + nodeCount++; - final int node; + final long node; if (nodeAddress != null) { + // Nodes are addressed by 1+ord: if (nodeCount == nodeAddress.size()) { nodeAddress = nodeAddress.resize(ArrayUtil.oversize(nodeAddress.size() + 1, nodeAddress.getBitsPerValue())); @@ -838,7 +852,7 @@ if (arc.flag(BIT_STOP_NODE)) { } else if (arc.flag(BIT_TARGET_NEXT)) { } else if (packed) { - in.readVInt(); + in.readVLong(); } else { readUnpackedNodeTarget(in); } @@ -854,12 +868,12 @@ } } - private int readUnpackedNodeTarget(BytesReader in) throws IOException { - int target; + private long readUnpackedNodeTarget(BytesReader in) throws IOException { + long target; if (version < VERSION_VINT_TARGET) { target = in.readInt(); } else { - target = in.readVInt(); + target = in.readVLong(); } return target; } @@ -894,8 +908,8 @@ } } - public Arc readFirstRealTargetArc(int node, Arc arc, final BytesReader in) throws IOException { - final int address = getNodeAddress(node); + public Arc readFirstRealTargetArc(long node, Arc arc, final BytesReader in) throws IOException { + final long address = getNodeAddress(node); in.setPosition(address); //System.out.println(" readFirstRealTargtArc address=" //+ address); @@ -960,7 +974,7 @@ //System.out.println(" nextArc fake " + //arc.nextArc); - int pos = getNodeAddress(arc.nextArc); + long pos = getNodeAddress(arc.nextArc); in.setPosition(pos); final byte b = in.readByte(); @@ -1055,15 +1069,15 @@ } } else { if (packed) { - final int pos = in.getPosition(); - final int code = in.readVInt(); + final long pos = in.getPosition(); + final long code = in.readVLong(); if (arc.flag(BIT_TARGET_DELTA)) { // Address is delta-coded from current address: arc.target = pos + code; //System.out.println(" delta pos=" + pos + " delta=" + code + " target=" + arc.target); } else if (code < nodeRefToAddress.size()) { // Deref - arc.target = (int) nodeRefToAddress.get(code); + arc.target = nodeRefToAddress.get((int) code); //System.out.println(" deref code=" + code + " target=" + arc.target); } else { // Absolute @@ -1192,7 +1206,7 @@ if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) { if (packed) { - in.readVInt(); + in.readVLong(); } else { readUnpackedNodeTarget(in); } @@ -1238,11 +1252,6 @@ node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP); } - static abstract class BytesWriter extends DataOutput { - public abstract void setPosition(int posWrite); - public abstract int getPosition(); - } - /** Returns a {@link BytesReader} for this FST, positioned at * position 0. */ public BytesReader getBytesReader() { @@ -1251,7 +1260,7 @@ /** Returns a {@link BytesReader} for this FST, positioned at * the provided position. */ - public BytesReader getBytesReader(int pos) { + public BytesReader getBytesReader(long pos) { // TODO: maybe re-use via ThreadLocal? BytesReader in; if (packed) { @@ -1268,10 +1277,10 @@ /** Reads bytes stored in an FST. */ public static abstract class BytesReader extends DataInput { /** Get current read position. */ - public abstract int getPosition(); + public abstract long getPosition(); /** Set current read position. */ - public abstract void setPosition(int pos); + public abstract void setPosition(long pos); /** Returns true if this reader uses reversed bytes * under-the-hood. */ @@ -1429,6 +1438,9 @@ */ FST pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException { + // NOTE: maxDerefNodes is intentionally int: we cannot + // support > 2.1B deref nodes + // TODO: other things to try // - renumber the nodes to get more next / better locality? // - allow multiple input labels on an arc, so @@ -1517,11 +1529,13 @@ fst.nodeCount = 0; fst.arcCount = 0; + // nocommit test packing > 2.1G FST + absCount = deltaCount = topCount = nextCount = 0; int changedCount = 0; - int addressError = 0; + long addressError = 0; //int totWasted = 0; @@ -1530,11 +1544,11 @@ // unchanged: for(int node=nodeCount;node>=1;node--) { fst.nodeCount++; - final int address = writer.getPosition(); + final long address = writer.getPosition(); //System.out.println(" node: " + node + " address=" + address); if (address != newNodeAddress.get(node)) { - addressError = address - (int) newNodeAddress.get(node); + addressError = address - newNodeAddress.get(node); //System.out.println(" change: " + (address - newNodeAddress[node])); changed = true; newNodeAddress.set(node, address); @@ -1609,19 +1623,18 @@ flags += BIT_ARC_HAS_OUTPUT; } - final Integer ptr; - final int absPtr; + final long absPtr; final boolean doWriteTarget = targetHasArcs(arc) && (flags & BIT_TARGET_NEXT) == 0; if (doWriteTarget) { - ptr = topNodeMap.get(arc.target); + final Integer ptr = topNodeMap.get(arc.target); if (ptr != null) { absPtr = ptr; } else { - absPtr = topNodeMap.size() + (int) newNodeAddress.get(arc.target) + addressError; + absPtr = topNodeMap.size() + newNodeAddress.get((int) arc.target) + addressError; } - int delta = (int) (newNodeAddress.get(arc.target) + addressError - buffer.getFilePointer() - address - 2); + long delta = newNodeAddress.get((int) arc.target) + addressError - buffer.getFilePointer() - address - 2; if (delta < 0) { //System.out.println("neg: " + delta); anyNegDelta = true; @@ -1632,7 +1645,6 @@ flags |= BIT_TARGET_DELTA; } } else { - ptr = null; absPtr = 0; } @@ -1653,7 +1665,7 @@ if (doWriteTarget) { - int delta = (int) (newNodeAddress.get(arc.target) + addressError - buffer.getFilePointer() - address); + long delta = newNodeAddress.get((int) arc.target) + addressError - buffer.getFilePointer() - address; if (delta < 0) { anyNegDelta = true; //System.out.println("neg: " + delta); @@ -1662,7 +1674,7 @@ if (flag(flags, BIT_TARGET_DELTA)) { //System.out.println(" delta"); - buffer.writeVInt(delta); + buffer.writeVLong(delta); if (!retry) { deltaCount++; } @@ -1674,7 +1686,7 @@ System.out.println(" abs"); } */ - buffer.writeVInt(absPtr); + buffer.writeVLong(absPtr); if (!retry) { if (absPtr >= topNodeMap.size()) { absCount++; @@ -1757,8 +1769,8 @@ } long maxAddress = 0; - for (int key : topNodeMap.keySet()) { - maxAddress = Math.max(maxAddress, newNodeAddress.get(key)); + for (long key : topNodeMap.keySet()) { + maxAddress = Math.max(maxAddress, newNodeAddress.get((int) key)); } PackedInts.Mutable nodeRefToAddressIn = PackedInts.getMutable(topNodeMap.size(), @@ -1768,7 +1780,7 @@ } fst.nodeRefToAddress = nodeRefToAddressIn; - fst.startNode = (int) newNodeAddress.get(startNode); + fst.startNode = newNodeAddress.get((int) startNode); //System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode); if (emptyOutput != null) { Index: lucene/core/src/java/org/apache/lucene/util/fst/ForwardBytesReader.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/ForwardBytesReader.java (revision 1432529) +++ lucene/core/src/java/org/apache/lucene/util/fst/ForwardBytesReader.java (working copy) @@ -46,13 +46,13 @@ } @Override - public int getPosition() { + public long getPosition() { return pos; } @Override - public void setPosition(int pos) { - this.pos = pos; + public void setPosition(long pos) { + this.pos = (int) pos; } @Override Index: lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java (revision 1432529) +++ lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java (working copy) @@ -103,7 +103,7 @@ /** Absolute writeBytes without changing the current * position. Note: this cannot "grow" the bytes, so you * must only call it on already written parts. */ - void writeBytes(int dest, byte[] b, int offset, int len) { + void writeBytes(long dest, byte[] b, int offset, int len) { //System.out.println(" BS.writeBytes dest=" + dest + " offset=" + offset + " len=" + len); assert dest + len <= getPosition(): "dest=" + dest + " pos=" + getPosition() + " len=" + len; @@ -133,9 +133,9 @@ } */ - final int end = dest + len; - int blockIndex = end >> blockBits; - int downTo = end & blockMask; + final long end = dest + len; + int blockIndex = (int) (end >> blockBits); + int downTo = (int) (end & blockMask); if (downTo == 0) { blockIndex--; downTo = blockSize; @@ -162,7 +162,7 @@ /** Absolute copy bytes self to self, without changing the * position. Note: this cannot "grow" the bytes, so must * only call it on already written parts. */ - public void copyBytes(int src, int dest, int len) { + public void copyBytes(long src, long dest, int len) { //System.out.println("BS.copyBytes src=" + src + " dest=" + dest + " len=" + len); assert src < dest; @@ -192,10 +192,10 @@ } */ - int end = src + len; + long end = src + len; - int blockIndex = end >> blockBits; - int downTo = end & blockMask; + int blockIndex = (int) (end >> blockBits); + int downTo = (int) (end & blockMask); if (downTo == 0) { blockIndex--; downTo = blockSize; @@ -221,9 +221,9 @@ /** Writes an int at the absolute position without * changing the current pointer. */ - public void writeInt(int pos, int value) { - int blockIndex = pos >> blockBits; - int upto = pos & blockMask; + public void writeInt(long pos, int value) { + int blockIndex = (int) (pos >> blockBits); + int upto = (int) (pos & blockMask); byte[] block = blocks.get(blockIndex); int shift = 24; for(int i=0;i<4;i++) { @@ -238,20 +238,20 @@ } /** Reverse the last numBytes. */ - public void reverse(int srcPos, int destPos) { + public void reverse(long srcPos, long destPos) { assert srcPos < destPos; //System.out.println("reverse src=" + srcPos + " dest=" + destPos); - int srcBlockIndex = srcPos >> blockBits; - int src = srcPos & blockMask; + int srcBlockIndex = (int) (srcPos >> blockBits); + int src = (int) (srcPos & blockMask); byte[] srcBlock = blocks.get(srcBlockIndex); - int destBlockIndex = destPos >> blockBits; - int dest = destPos & blockMask; + int destBlockIndex = (int) (destPos >> blockBits); + int dest = (int) (destPos & blockMask); byte[] destBlock = blocks.get(destBlockIndex); //System.out.println(" srcBlock=" + srcBlockIndex + " destBlock=" + destBlockIndex); - int limit = (destPos - srcPos + 1)/2; + int limit = (int) (destPos - srcPos + 1)/2; for(int i=0;i