Index: lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java (revision 1432491) +++ lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java (working copy) @@ -69,6 +69,14 @@ nextWrite = blocks.get(blocks.size()-1).length; } + /** Absolute write byte; you must ensure dest is < max + * position written so far. */ + public void writeByte(int dest, byte b) { + int blockIndex = dest >> blockBits; + byte[] block = blocks.get(blockIndex); + block[dest & blockMask] = b; + } + @Override public void writeByte(byte b) { if (nextWrite == blockSize) { @@ -237,9 +245,10 @@ } } - /** Reverse the last numBytes. */ + /** Reverse from srcPos, inclusive, to destPos, inclusive. */ public void reverse(int srcPos, int destPos) { assert srcPos < destPos; + assert destPos < getPosition(); //System.out.println("reverse src=" + srcPos + " dest=" + destPos); int srcBlockIndex = srcPos >> blockBits; @@ -275,7 +284,7 @@ } } - public void skip(int len) { + public void skipBytes(int len) { while (len > 0) { int chunk = blockSize - nextWrite; if (len <= chunk) { @@ -294,6 +303,26 @@ return (blocks.size()-1) * blockSize + nextWrite; } + /** Pos must be less than the max position written so far! + * Ie, you cannot "grow" the file with this! */ + public void truncate(int newLen) { + assert newLen <= getPosition(); + assert newLen >= 0; + int blockIndex = newLen >> blockBits; + nextWrite = newLen & blockMask; + if (nextWrite == 0) { + blockIndex--; + nextWrite = blockSize; + } + blocks.subList(blockIndex+1, blocks.size()).clear(); + if (newLen == 0) { + current = null; + } else { + current = blocks.get(blockIndex); + } + assert newLen == getPosition(); + } + public void finish() { if (current != null) { byte[] lastBuffer = new byte[nextWrite]; Index: lucene/core/src/java/org/apache/lucene/util/fst/FST.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/FST.java (revision 1432491) +++ lucene/core/src/java/org/apache/lucene/util/fst/FST.java (working copy) @@ -716,7 +716,7 @@ int destPos = fixedArrayStart + nodeIn.numArcs*maxBytesPerArc; assert destPos >= srcPos; if (destPos > srcPos) { - bytes.skip(destPos - srcPos); + bytes.skipBytes(destPos - srcPos); for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) { destPos -= maxBytesPerArc; srcPos -= bytesPerArc[arcIdx]; @@ -1427,9 +1427,6 @@ throw new IllegalArgumentException("this FST was not built with willPackFST=true"); } - final RAMOutputStream buffer = new RAMOutputStream(); - byte[] bufferBytes = new byte[64]; - Arc arc = new Arc(); final BytesReader r = getBytesReader(0); @@ -1536,7 +1533,7 @@ // this is an array'd node and bytesPerArc changes: writeNode: while(true) { // retry writing this node - assert buffer.getFilePointer() == 0; + //System.out.println(" cycle: retry"); readFirstRealTargetArc(node, arc, r); @@ -1546,9 +1543,9 @@ if (bytesPerArc == 0) { bytesPerArc = arc.bytesPerArc; } - buffer.writeByte(ARCS_AS_FIXED_ARRAY); - buffer.writeVInt(arc.numArcs); - buffer.writeVInt(bytesPerArc); + writer.writeByte(ARCS_AS_FIXED_ARRAY); + writer.writeVInt(arc.numArcs); + writer.writeVInt(bytesPerArc); //System.out.println("node " + node + ": " + arc.numArcs + " arcs"); } @@ -1557,7 +1554,7 @@ while(true) { // iterate over all arcs for this node //System.out.println(" cycle next arc"); - final int arcStartPos = (int) buffer.getFilePointer(); + final int arcStartPos = writer.getPosition(); nodeArcCount++; byte flags = 0; @@ -1604,7 +1601,7 @@ absPtr = topNodeMap.size() + (int) newNodeAddress.get(arc.target) + addressError; } - int delta = (int) (newNodeAddress.get(arc.target) + addressError - buffer.getFilePointer() - address - 2); + int delta = (int) (newNodeAddress.get(arc.target) + addressError - writer.getPosition() - 2); if (delta < 0) { //System.out.println("neg: " + delta); anyNegDelta = true; @@ -1620,23 +1617,23 @@ } assert flags != ARCS_AS_FIXED_ARRAY; - buffer.writeByte(flags); + writer.writeByte(flags); - fst.writeLabel(buffer, arc.label); + fst.writeLabel(writer, arc.label); if (arc.output != NO_OUTPUT) { - outputs.write(arc.output, buffer); + outputs.write(arc.output, writer); if (!retry) { fst.arcWithOutputCount++; } } if (arc.nextFinalOutput != NO_OUTPUT) { - outputs.writeFinalOutput(arc.nextFinalOutput, buffer); + outputs.writeFinalOutput(arc.nextFinalOutput, writer); } if (doWriteTarget) { - int delta = (int) (newNodeAddress.get(arc.target) + addressError - buffer.getFilePointer() - address); + int delta = (int) (newNodeAddress.get(arc.target) + addressError - writer.getPosition()); if (delta < 0) { anyNegDelta = true; //System.out.println("neg: " + delta); @@ -1645,7 +1642,7 @@ if (flag(flags, BIT_TARGET_DELTA)) { //System.out.println(" delta"); - buffer.writeVInt(delta); + writer.writeVInt(delta); if (!retry) { deltaCount++; } @@ -1657,7 +1654,7 @@ System.out.println(" abs"); } */ - buffer.writeVInt(absPtr); + writer.writeVInt(absPtr); if (!retry) { if (absPtr >= topNodeMap.size()) { absCount++; @@ -1669,7 +1666,7 @@ } if (useArcArray) { - final int arcBytes = (int) (buffer.getFilePointer() - arcStartPos); + final int arcBytes = writer.getPosition() - arcStartPos; //System.out.println(" " + arcBytes + " bytes"); maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes); // NOTE: this may in fact go "backwards", if @@ -1679,11 +1676,7 @@ // will retry (below) so it's OK to ovewrite // bytes: //wasted += bytesPerArc - arcBytes; - int skip = (int) (arcStartPos + bytesPerArc - buffer.getFilePointer()); - while(skip > 0) { - buffer.writeByte((byte) 0); - skip--; - } + writer.skipBytes(arcStartPos + bytesPerArc - writer.getPosition()); } if (arc.isLast()) { @@ -1708,19 +1701,12 @@ // Retry: bytesPerArc = maxBytesPerArc; - buffer.reset(); + writer.truncate(address); nodeArcCount = 0; retry = true; anyNegDelta = false; } - if (bufferBytes.length < (int) buffer.getFilePointer()) { - bufferBytes = ArrayUtil.grow(bufferBytes, (int) buffer.getFilePointer()); - } - buffer.writeTo(bufferBytes, 0); - writer.writeBytes(bufferBytes, 0, (int) buffer.getFilePointer()); - buffer.reset(); - negDelta |= anyNegDelta; fst.arcCount += nodeArcCount; Index: lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java =================================================================== --- lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java (revision 1432491) +++ lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java (working copy) @@ -42,7 +42,7 @@ int pos = 0; while(pos < numBytes) { - int op = random().nextInt(7); + int op = random().nextInt(8); if (VERBOSE) { System.out.println(" cycle pos=" + pos); } @@ -97,21 +97,21 @@ case 3: { // reverse bytes - if (pos > 0) { - int len = _TestUtil.nextInt(random(), 1, Math.min(100, pos)); + if (pos > 1) { + int len = _TestUtil.nextInt(random(), 2, Math.min(100, pos)); int start; if (len == pos) { start = 0; } else { start = random().nextInt(pos - len); } - int end = start + len; + int end = start + len - 1; if (VERBOSE) { - System.out.println(" reverse start=" + start + " end=" + end + " len=" + len); + System.out.println(" reverse start=" + start + " end=" + end + " len=" + len + " pos=" + pos); } bytes.reverse(start, end); - while(start < end) { + while(start <= end) { byte b = expected[end]; expected[end] = expected[start]; expected[start] = b; @@ -159,17 +159,49 @@ { // skip int len = random().nextInt(Math.min(100, numBytes - pos)); - pos += len; - bytes.skip(len); + if (VERBOSE) { System.out.println(" skip len=" + len); } + + pos += len; + bytes.skipBytes(len); + + // NOTE: must fill in zeros in case truncate was + // used, else we get false fails: + if (len > 0) { + byte[] zeros = new byte[len]; + bytes.writeBytes(pos-len, zeros, 0, len); + } } break; + + case 7: + { + // absWriteByte + if (pos > 0) { + int dest = random().nextInt(pos); + byte b = (byte) random().nextInt(256); + expected[dest] = b; + bytes.writeByte(dest, b); + } + break; + } } assertEquals(pos, bytes.getPosition()); + if (pos > 0 && random().nextInt(50) == 17) { + // truncate + int len = _TestUtil.nextInt(random(), 1, Math.min(pos, 100)); + bytes.truncate(pos - len); + pos -= len; + Arrays.fill(expected, pos, pos+len, (byte) 0); + if (VERBOSE) { + System.out.println(" truncate len=" + len + " newPos=" + pos); + } + } + if ((pos > 0 && random().nextInt(200) == 17)) { verify(bytes, expected, pos); } Index: lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java =================================================================== --- lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (revision 1432491) +++ lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (working copy) @@ -483,11 +483,14 @@ break; } } - long t = System.currentTimeMillis() - tStart; - System.out.println((t / 1000.0) + " sec to build"); + long tMid = System.currentTimeMillis(); + System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms"); + assert builder.getTermCount() == ord; FST fst = builder.finish(); + long tEnd = System.currentTimeMillis(); + System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack"); if (fst == null) { System.out.println("FST was fully pruned!"); System.exit(0); Index: lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream