Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java (revision 1432289) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java (working copy) @@ -21,8 +21,9 @@ import java.util.*; import org.apache.lucene.util.*; +import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.FST.Arc; +import org.apache.lucene.util.fst.FSTBytesReader; /** * Finite state automata based implementation of "autocomplete" functionality. @@ -139,7 +140,7 @@ try { List> rootArcs = new ArrayList>(); Arc arc = automaton.getFirstArc(new Arc()); - FST.BytesReader fstReader = automaton.getBytesReader(0); + FSTBytesReader fstReader = automaton.getBytesReader(0); automaton.readFirstTargetArc(arc, arc, fstReader); while (true) { rootArcs.add(new Arc().copyFrom(arc)); @@ -173,7 +174,7 @@ // Get the UTF-8 bytes representation of the input key. try { final FST.Arc scratch = new FST.Arc(); - FST.BytesReader fstReader = automaton.getBytesReader(0); + FSTBytesReader fstReader = automaton.getBytesReader(0); for (; rootArcIndex < rootArcs.length; rootArcIndex++) { final FST.Arc rootArc = rootArcs[rootArcIndex]; final FST.Arc arc = scratch.copyFrom(rootArc); @@ -338,7 +339,7 @@ final int max = utf8.offset + utf8.length; // Cannot save as instance var since multiple threads // can use FSTCompletion at once... - final FST.BytesReader fstReader = automaton.getBytesReader(0); + final FSTBytesReader fstReader = automaton.getBytesReader(0); for (int i = utf8.offset; i < max; i++) { if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc, fstReader) == null) { // No matching prefixes, return an empty result. @@ -362,7 +363,7 @@ } assert output.offset == 0; output.bytes[output.length++] = (byte) arc.label; - FST.BytesReader fstReader = automaton.getBytesReader(0); + FSTBytesReader fstReader = automaton.getBytesReader(0); automaton.readFirstTargetArc(arc, arc, fstReader); while (true) { if (arc.label == FST.END_LABEL) { Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java (revision 1432289) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java (working copy) @@ -40,12 +40,12 @@ import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.FST.Arc; -import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.FSTBytesReader; import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util.MinResult; import org.apache.lucene.util.fst.Util; -import org.apache.lucene.util.fst.Util.MinResult; /** * Suggester based on a weighted FST: it first traverses the prefix, @@ -200,7 +200,7 @@ private Long lookupPrefix(BytesRef scratch, Arc arc) throws /*Bogus*/IOException { assert 0 == fst.outputs.getNoOutput().longValue(); long output = 0; - BytesReader bytesReader = fst.getBytesReader(0); + FSTBytesReader bytesReader = fst.getBytesReader(0); fst.getFirstArc(arc); Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java (revision 1432289) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java (working copy) @@ -17,15 +17,16 @@ * limitations under the License. */ +import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.io.IOException; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; import org.apache.lucene.util.fst.Util; // TODO: move to core? nobody else uses it yet though... @@ -77,7 +78,7 @@ new IntsRef())); final FST.Arc scratchArc = new FST.Arc(); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); while (queue.size() != 0) { final Path path = queue.remove(queue.size() - 1); Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 1432289) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (working copy) @@ -54,8 +54,8 @@ import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.ByteSequenceOutputs; -import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; import org.apache.lucene.util.fst.PairOutputs.Pair; import org.apache.lucene.util.fst.PairOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs; @@ -587,7 +587,7 @@ //System.out.println(" prefixPaths: " + prefixPaths.size()); - BytesReader bytesReader = fst.getBytesReader(0); + FSTBytesReader bytesReader = fst.getBytesReader(0); FST.Arc> scratchArc = new FST.Arc>(); Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java (revision 1432289) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java (working copy) @@ -34,6 +34,7 @@ import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; /** * Matches single or multi word synonyms in a token stream. @@ -245,7 +246,7 @@ private final FST fst; - private final FST.BytesReader fstReader; + private final FSTBytesReader fstReader; private final BytesRef scratchBytes = new BytesRef(); Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java (revision 1432289) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java (working copy) @@ -26,6 +26,7 @@ import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.fst.CharSequenceOutputs; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; import org.apache.lucene.util.fst.Outputs; /** @@ -41,7 +42,7 @@ private final Outputs outputs = CharSequenceOutputs.getSingleton(); private final FST map; - private final FST.BytesReader fstReader; + private final FSTBytesReader fstReader; private final RollingCharBuffer buffer = new RollingCharBuffer(); private final FST.Arc scratchArc = new FST.Arc(); private final Map> cachedRootArcs; Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java (revision 1432289) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java (working copy) @@ -27,6 +27,7 @@ import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.CharSequenceOutputs; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; import org.apache.lucene.util.fst.Outputs; import org.apache.lucene.util.fst.Util; @@ -49,7 +50,7 @@ try { // Pre-cache root arcs: final FST.Arc scratchArc = new FST.Arc(); - final FST.BytesReader fstReader = map.getBytesReader(0); + final FSTBytesReader fstReader = map.getBytesReader(0); map.getFirstArc(scratchArc); if (FST.targetHasArcs(scratchArc)) { map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader); Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java (revision 1432289) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java (working copy) @@ -43,6 +43,7 @@ import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; // TODO: somehow factor out a reusable viterbi search here, // so other decompounders/tokenizers can reuse... @@ -142,10 +143,10 @@ private final CharacterDefinition characterDefinition; private final FST.Arc arc = new FST.Arc(); - private final FST.BytesReader fstReader; + private final FSTBytesReader fstReader; private final IntsRef wordIdRef = new IntsRef(); - private final FST.BytesReader userFSTReader; + private final FSTBytesReader userFSTReader; private final TokenInfoFST userFST; private final RollingCharBuffer buffer = new RollingCharBuffer(); Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java (revision 1432289) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java (working copy) @@ -19,8 +19,9 @@ import java.io.IOException; +import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.FST.Arc; +import org.apache.lucene.util.fst.FSTBytesReader; /** * Thin wrapper around an FST with root-arc caching for Japanese. @@ -48,13 +49,13 @@ rootCache = cacheRootArcs(); } - @SuppressWarnings("unchecked") + @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc[] cacheRootArcs() throws IOException { FST.Arc rootCache[] = new FST.Arc[1+(cacheCeiling-0x3040)]; FST.Arc firstArc = new FST.Arc(); fst.getFirstArc(firstArc); FST.Arc arc = new FST.Arc(); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); // TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs) for (int i = 0; i < rootCache.length; i++) { if (fst.findTargetArc(0x3040 + i, firstArc, arc, fstReader) != null) { @@ -64,7 +65,7 @@ return rootCache; } - public FST.Arc findTargetArc(int ch, FST.Arc follow, FST.Arc arc, boolean useCache, FST.BytesReader fstReader) throws IOException { + public FST.Arc findTargetArc(int ch, FST.Arc follow, FST.Arc arc, boolean useCache, FSTBytesReader fstReader) throws IOException { if (useCache && ch >= 0x3040 && ch <= cacheCeiling) { assert ch != FST.END_LABEL; final Arc result = rootCache[ch - 0x3040]; @@ -83,7 +84,7 @@ return fst.getFirstArc(arc); } - public FST.BytesReader getBytesReader(int pos) { + public FSTBytesReader getBytesReader(int pos) { return fst.getBytesReader(pos); } Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java (revision 1432289) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java (working copy) @@ -31,6 +31,7 @@ import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; import org.apache.lucene.util.fst.PositiveIntOutputs; /** @@ -139,7 +140,7 @@ TreeMap result = new TreeMap(); // index, [length, length...] boolean found = false; // true if we found any results - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); FST.Arc arc = new FST.Arc(); int end = off + len; Index: lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java (revision 1432289) +++ lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java (working copy) @@ -203,7 +203,7 @@ final FST.Arc arc = fst.getFirstArc(new FST.Arc()); final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); for(int i=0;i<=term.length;i++) { final int label; @@ -240,7 +240,7 @@ in.offset = 0; final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); while(true) { // read all arcs: Index: lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java =================================================================== --- lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (revision 1432289) +++ lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (working copy) @@ -60,7 +60,6 @@ import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput; import org.apache.lucene.util.fst.FST.Arc; -import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.PairOutputs.Pair; import org.apache.lucene.util.packed.PackedInts; @@ -483,6 +482,8 @@ break; } } + long t = System.currentTimeMillis() - tStart; + System.out.println((t / 1000.0) + " sec to build"); assert builder.getTermCount() == ord; FST fst = builder.finish(); @@ -513,6 +514,12 @@ return; } + /* + IndexInput in = dir.openInput("fst.bin", IOContext.DEFAULT); + fst = new FST(in, outputs); + in.close(); + */ + System.out.println("\nNow verify..."); while(true) { @@ -576,7 +583,7 @@ } } - // java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.10.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out + // java -cp ../build/codecs/classes/java:../test-framework/lib/randomizedtesting-runner-2.0.8.jar:../build/core/classes/test:../build/core/classes/test-framework:../build/core/classes/java:../build/test-framework/classes/java:../test-framework/lib/junit-4.10.jar org.apache.lucene.util.fst.TestFSTs /xold/tmp/allTerms3.txt out public static void main(String[] args) throws IOException { int prune = 0; int limit = Integer.MAX_VALUE; @@ -1022,7 +1029,7 @@ throws IOException { if (FST.targetHasArcs(arc)) { int childCount = 0; - FST.BytesReader fstReader = fst.getBytesReader(0); + FSTBytesReader fstReader = fst.getBytesReader(0); for (arc = fst.readFirstTargetArc(arc, arc, fstReader);; arc = fst.readNextArc(arc, fstReader), childCount++) { @@ -1292,7 +1299,7 @@ //Util.toDot(fst, w, false, false); //w.close(); - BytesReader reader = fst.getBytesReader(0); + FSTBytesReader reader = fst.getBytesReader(0); //System.out.println("testing: " + allPrefixes.size() + " prefixes"); for (String prefix : allPrefixes) { @@ -1413,7 +1420,7 @@ //Util.toDot(fst, w, false, false); //w.close(); - BytesReader reader = fst.getBytesReader(0); + FSTBytesReader reader = fst.getBytesReader(0); //System.out.println("testing: " + allPrefixes.size() + " prefixes"); for (String prefix : allPrefixes) { Index: lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java =================================================================== --- lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java (revision 0) +++ lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java (working copy) @@ -0,0 +1,328 @@ +package org.apache.lucene.util.fst; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Arrays; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +public class TestBytesStore extends LuceneTestCase { + + public void testRandom() throws Exception { + + final int iters = atLeast(10); + for(int iter=0;iter 1) { + int numOps = _TestUtil.nextInt(random(), 100, 200); + for(int op=0;op[] arcs = new FST.Arc[1]; Index: lucene/core/src/java/org/apache/lucene/util/fst/ForwardBytesReader.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/ForwardBytesReader.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/util/fst/ForwardBytesReader.java (working copy) @@ -0,0 +1,62 @@ +package org.apache.lucene.util.fst; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// TODO: can we use just ByteArrayDataInput...? need to +// add a .skipBytes to DataInput.. hmm and .setPosition + +/** Reads from a single byte[]. */ +final class ForwardBytesReader extends FSTBytesReader { + private final byte[] bytes; + private int pos; + + public ForwardBytesReader(byte[] bytes) { + this.bytes = bytes; + } + + @Override + public byte readByte() { + return bytes[pos++]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + System.arraycopy(bytes, pos, b, offset, len); + pos += len; + } + + @Override + public void skipBytes(int count) { + pos += count; + } + + @Override + public int getPosition() { + return pos; + } + + @Override + public void setPosition(int pos) { + this.pos = pos; + } + + @Override + public boolean reversed() { + return false; + } +} Property changes on: lucene/core/src/java/org/apache/lucene/util/fst/ForwardBytesReader.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java (working copy) @@ -0,0 +1,430 @@ +package org.apache.lucene.util.fst; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + +// TODO: merge with PagedBytes, except PagedBytes doesn't +// let you read while writing which FST needs + +class BytesStore extends DataOutput { + + private final List blocks = new ArrayList(); + + private final int blockSize; + private final int blockBits; + private final int blockMask; + + private byte[] current; + private int nextWrite; + + public BytesStore(int blockBits) { + this.blockBits = blockBits; + blockSize = 1 << blockBits; + blockMask = blockSize-1; + nextWrite = blockSize; + } + + /** Pulls bytes from the provided IndexInput. */ + public BytesStore(DataInput in, int numBytes, int maxBlockSize) throws IOException { + int blockSize = 2; + int blockBits = 1; + while(blockSize < numBytes && blockSize < maxBlockSize) { + blockSize *= 2; + blockBits++; + } + this.blockBits = blockBits; + this.blockSize = blockSize; + this.blockMask = blockSize-1; + int left = numBytes; + while(left > 0) { + final int chunk = Math.min(blockSize, left); + byte[] block = new byte[chunk]; + in.readBytes(block, 0, block.length); + blocks.add(block); + left -= chunk; + } + + // So .getPosition still works + nextWrite = blocks.get(blocks.size()-1).length; + } + + @Override + public void writeByte(byte b) { + if (nextWrite == blockSize) { + current = new byte[blockSize]; + blocks.add(current); + nextWrite = 0; + } + current[nextWrite++] = b; + } + + @Override + public void writeBytes(byte[] b, int offset, int len) { + while (len > 0) { + int chunk = blockSize - nextWrite; + if (len <= chunk) { + System.arraycopy(b, offset, current, nextWrite, len); + nextWrite += len; + break; + } else { + if (chunk > 0) { + System.arraycopy(b, offset, current, nextWrite, chunk); + offset += chunk; + len -= chunk; + } + current = new byte[blockSize]; + blocks.add(current); + nextWrite = 0; + } + } + } + + /** Absolute writeBytes without changing the current + * position. Note: this cannot "grow" the bytes, so you + * must only call it on already written parts. */ + void writeBytes(int dest, byte[] b, int offset, int len) { + //System.out.println(" BS.writeBytes dest=" + dest + " offset=" + offset + " len=" + len); + assert dest + len <= getPosition(): "dest=" + dest + " pos=" + getPosition() + " len=" + len; + + // Note: weird: must go "backwards" because copyBytes + // calls us with overlapping src/dest. If we + // go forwards then we overwrite bytes before we can + // copy them: + + /* + int blockIndex = dest >> blockBits; + int upto = dest & blockMask; + byte[] block = blocks.get(blockIndex); + while (len > 0) { + int chunk = blockSize - upto; + System.out.println(" cycle chunk=" + chunk + " len=" + len); + if (len <= chunk) { + System.arraycopy(b, offset, block, upto, len); + break; + } else { + System.arraycopy(b, offset, block, upto, chunk); + offset += chunk; + len -= chunk; + blockIndex++; + block = blocks.get(blockIndex); + upto = 0; + } + } + */ + + final int end = dest + len; + int blockIndex = end >> blockBits; + int downTo = end & blockMask; + if (downTo == 0) { + blockIndex--; + downTo = blockSize; + } + byte[] block = blocks.get(blockIndex); + + while (len > 0) { + //System.out.println(" cycle downTo=" + downTo + " len=" + len); + if (len <= downTo) { + //System.out.println(" final: offset=" + offset + " len=" + len + " dest=" + (downTo-len)); + System.arraycopy(b, offset, block, downTo-len, len); + break; + } else { + len -= downTo; + //System.out.println(" partial: offset=" + (offset + len) + " len=" + downTo + " dest=0"); + System.arraycopy(b, offset + len, block, 0, downTo); + blockIndex--; + block = blocks.get(blockIndex); + downTo = blockSize; + } + } + } + + /** Absolute copy bytes self to self, without changing the + * position. Note: this cannot "grow" the bytes, so must + * only call it on already written parts. */ + public void copyBytes(int src, int dest, int len) { + //System.out.println("BS.copyBytes src=" + src + " dest=" + dest + " len=" + len); + assert src < dest; + + // Note: weird: must go "backwards" because copyBytes + // calls us with overlapping src/dest. If we + // go forwards then we overwrite bytes before we can + // copy them: + + /* + int blockIndex = src >> blockBits; + int upto = src & blockMask; + byte[] block = blocks.get(blockIndex); + while (len > 0) { + int chunk = blockSize - upto; + System.out.println(" cycle: chunk=" + chunk + " len=" + len); + if (len <= chunk) { + writeBytes(dest, block, upto, len); + break; + } else { + writeBytes(dest, block, upto, chunk); + blockIndex++; + block = blocks.get(blockIndex); + upto = 0; + len -= chunk; + dest += chunk; + } + } + */ + + int end = src + len; + + int blockIndex = end >> blockBits; + int downTo = end & blockMask; + if (downTo == 0) { + blockIndex--; + downTo = blockSize; + } + byte[] block = blocks.get(blockIndex); + + while (len > 0) { + //System.out.println(" cycle downTo=" + downTo); + if (len <= downTo) { + //System.out.println(" finish"); + writeBytes(dest, block, downTo-len, len); + break; + } else { + //System.out.println(" partial"); + len -= downTo; + writeBytes(dest + len, block, 0, downTo); + blockIndex--; + block = blocks.get(blockIndex); + downTo = blockSize; + } + } + } + + /** Writes an int at the absolute position without + * changing the current pointer. */ + public void writeInt(int pos, int value) { + int blockIndex = pos >> blockBits; + int upto = pos & blockMask; + byte[] block = blocks.get(blockIndex); + int shift = 24; + for(int i=0;i<4;i++) { + block[upto++] = (byte) (value >> shift); + shift -= 8; + if (upto == blockSize) { + upto = 0; + blockIndex++; + block = blocks.get(blockIndex); + } + } + } + + /** Reverse the last numBytes. */ + public void reverse(int srcPos, int destPos) { + assert srcPos < destPos; + //System.out.println("reverse src=" + srcPos + " dest=" + destPos); + + int srcBlockIndex = srcPos >> blockBits; + int src = srcPos & blockMask; + byte[] srcBlock = blocks.get(srcBlockIndex); + + int destBlockIndex = destPos >> blockBits; + int dest = destPos & blockMask; + byte[] destBlock = blocks.get(destBlockIndex); + //System.out.println(" srcBlock=" + srcBlockIndex + " destBlock=" + destBlockIndex); + + int limit = (destPos - srcPos + 1)/2; + for(int i=0;i scratchArc = new FST.Arc(); protected int upto; @@ -145,7 +145,7 @@ // Arcs are fixed array -- use binary search to find // the target. - final FST.BytesReader in = fst.getBytesReader(0); + final FSTBytesReader in = fst.getBytesReader(0); int low = arc.arcIdx; int high = arc.numArcs-1; int mid = 0; @@ -153,8 +153,8 @@ boolean found = false; while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart; - in.skip(arc.bytesPerArc*mid+1); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc*mid+1); final int midLabel = fst.readLabel(in); final int cmp = midLabel - targetLabel; //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); @@ -284,7 +284,7 @@ // Arcs are fixed array -- use binary search to find // the target. - final FST.BytesReader in = fst.getBytesReader(0); + final FSTBytesReader in = fst.getBytesReader(0); int low = arc.arcIdx; int high = arc.numArcs-1; int mid = 0; @@ -292,8 +292,8 @@ boolean found = false; while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart; - in.skip(arc.bytesPerArc*mid+1); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc*mid+1); final int midLabel = fst.readLabel(in); final int cmp = midLabel - targetLabel; //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); @@ -434,7 +434,7 @@ FST.Arc arc = getArc(upto-1); int targetLabel = getTargetLabel(); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); while(true) { //System.out.println(" cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel)); Index: lucene/core/src/java/org/apache/lucene/util/fst/FST.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/FST.java (revision 1432289) +++ lucene/core/src/java/org/apache/lucene/util/fst/FST.java (working copy) @@ -51,9 +51,6 @@ // job, ie, once we are at a 'suffix only', just store the // completion labels as a string not as a series of arcs. -// TODO: maybe make an explicit thread state that holds -// reusable stuff eg BytesReader, a scratch arc - // NOTE: while the FST is able to represent a non-final // dead-end state (NON_FINAL_END_NODE=0), the layers above // (FSTEnum, Util) have problems with this!! @@ -139,8 +136,7 @@ // produces this output T emptyOutput; - // Not private to avoid synthetic access$NNN methods: - byte[] bytes; + private final BytesStore bytes; private int startNode = -1; @@ -254,8 +250,6 @@ return (flags & bit) != 0; } - private final BytesWriter writer; - private GrowableWriter nodeAddress; // TODO: we could be smarter here, and prune periodically @@ -269,17 +263,19 @@ this.inputType = inputType; this.outputs = outputs; this.allowArrayArcs = allowArrayArcs; - bytes = new byte[128]; + // 32 KB blocks: + bytes = new BytesStore(15); + // pad: ensure no node gets address 0 which is reserved to mean + // the stop state w/ no arcs + bytes.writeByte((byte) 0); NO_OUTPUT = outputs.getNoOutput(); if (willPackFST) { - nodeAddress = new GrowableWriter(PackedInts.bitsRequired(bytes.length - 1), 8, acceptableOverheadRatio); + nodeAddress = new GrowableWriter(15, 8, acceptableOverheadRatio); inCounts = new GrowableWriter(1, 8, acceptableOverheadRatio); } else { nodeAddress = null; inCounts = null; } - - writer = new DefaultBytesWriter(); emptyOutput = null; packed = false; @@ -289,23 +285,29 @@ /** Load a previously saved FST. */ public FST(DataInput in, Outputs outputs) throws IOException { this.outputs = outputs; - writer = null; // NOTE: only reads most recent format; we don't have // back-compat promise for FSTs (they are experimental): CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_PACKED); packed = in.readByte() == 1; if (in.readByte() == 1) { // accepts empty string + // 1 KB blocks: + BytesStore emptyBytes = new BytesStore(10); int numBytes = in.readVInt(); - bytes = new byte[numBytes]; - in.readBytes(bytes, 0, numBytes); - + emptyBytes.copyBytes(in, numBytes); + // De-serialize empty-string output: - BytesReader reader; + FSTBytesReader reader; if (packed) { - reader = new ForwardBytesReader(bytes, 0); + reader = emptyBytes.getForwardReader(); } else { - reader = new ReverseBytesReader(bytes, bytes.length-1); + reader = emptyBytes.getReverseReader(); + // NoOutputs uses 0 bytes when writing its output, + // so we have to check here else BytesStore gets + // angry: + if (numBytes > 0) { + reader.setPosition(numBytes-1); + } } emptyOutput = outputs.readFinalOutput(reader); } else { @@ -335,8 +337,9 @@ arcCount = in.readVInt(); arcWithOutputCount = in.readVInt(); - bytes = new byte[in.readVInt()]; - in.readBytes(bytes, 0, bytes.length); + int numBytes = in.readVInt(); + bytes = new BytesStore(in, numBytes, Integer.MAX_VALUE); + NO_OUTPUT = outputs.getNoOutput(); cacheRootArcs(); @@ -353,7 +356,7 @@ /** Returns bytes used to represent the FST */ public int sizeInBytes() { - int size = bytes.length; + int size = bytes.getPosition(); if (packed) { size += nodeRefToAddress.ramBytesUsed(); } else if (nodeAddress != null) { @@ -370,10 +373,8 @@ if (this.startNode != -1) { throw new IllegalStateException("already finished"); } - byte[] finalBytes = new byte[writer.getPosition()]; - System.arraycopy(bytes, 0, finalBytes, 0, writer.getPosition()); - bytes = finalBytes; this.startNode = startNode; + bytes.finish(); cacheRootArcs(); } @@ -394,7 +395,7 @@ cachedRootArcs = (Arc[]) new Arc[0x80]; final Arc arc = new Arc(); getFirstArc(arc); - final BytesReader in = getBytesReader(0); + final FSTBytesReader in = getBytesReader(0); if (targetHasArcs(arc)) { readFirstRealTargetArc(arc.target, arc, in); while(true) { @@ -485,8 +486,9 @@ out.writeVInt(nodeCount); out.writeVInt(arcCount); out.writeVInt(arcWithOutputCount); - out.writeVInt(bytes.length); - out.writeBytes(bytes, 0, bytes.length); + int numBytes = bytes.getPosition(); + out.writeVInt(numBytes); + bytes.writeTo(out); } /** @@ -526,17 +528,16 @@ } } - private void writeLabel(int v) throws IOException { + private void writeLabel(DataOutput out, int v) throws IOException { assert v >= 0: "v=" + v; if (inputType == INPUT_TYPE.BYTE1) { assert v <= 255: "v=" + v; - writer.writeByte((byte) v); + out.writeByte((byte) v); } else if (inputType == INPUT_TYPE.BYTE2) { assert v <= 65535: "v=" + v; - writer.writeShort((short) v); + out.writeShort((short) v); } else { - //writeInt(v); - writer.writeVInt(v); + out.writeVInt(v); } } @@ -563,7 +564,7 @@ // serializes new node by appending its bytes to the end // of the current byte[] int addNode(Builder.UnCompiledNode nodeIn) throws IOException { - //System.out.println("FST.addNode pos=" + writer.posWrite + " numArcs=" + nodeIn.numArcs); + //System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs); if (nodeIn.numArcs == 0) { if (nodeIn.isFinal) { return FINAL_END_NODE; @@ -572,23 +573,24 @@ } } - int startAddress = writer.getPosition(); + int startAddress = bytes.getPosition(); //System.out.println(" startAddr=" + startAddress); final boolean doFixedArray = shouldExpand(nodeIn); final int fixedArrayStart; if (doFixedArray) { + //System.out.println(" fixedArray"); if (bytesPerArc.length < nodeIn.numArcs) { bytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, 1)]; } // write a "false" first arc: - writer.writeByte(ARCS_AS_FIXED_ARRAY); - writer.writeVInt(nodeIn.numArcs); + bytes.writeByte(ARCS_AS_FIXED_ARRAY); + bytes.writeVInt(nodeIn.numArcs); // placeholder -- we'll come back and write the number // of bytes per arc (int) here: // TODO: we could make this a vInt instead - writer.writeInt(0); - fixedArrayStart = writer.getPosition(); + bytes.writeInt(0); + fixedArrayStart = bytes.getPosition(); //System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart); } else { fixedArrayStart = 0; @@ -598,12 +600,13 @@ final int lastArc = nodeIn.numArcs-1; - int lastArcStart = writer.getPosition(); + int lastArcStart = bytes.getPosition(); int maxBytesPerArc = 0; for(int arcIdx=0;arcIdx arc = nodeIn.arcs[arcIdx]; final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; int flags = 0; + //System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" + target.node); if (arcIdx == lastArc) { flags += BIT_LAST_ARC; @@ -637,34 +640,34 @@ flags += BIT_ARC_HAS_OUTPUT; } - writer.writeByte((byte) flags); - writeLabel(arc.label); + bytes.writeByte((byte) flags); + writeLabel(bytes, arc.label); - // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + writer.posWrite + " output=" + outputs.outputToString(arc.output)); + // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output)); if (arc.output != NO_OUTPUT) { - outputs.write(arc.output, writer); + outputs.write(arc.output, bytes); //System.out.println(" write output"); arcWithOutputCount++; } if (arc.nextFinalOutput != NO_OUTPUT) { //System.out.println(" write final output"); - outputs.writeFinalOutput(arc.nextFinalOutput, writer); + outputs.writeFinalOutput(arc.nextFinalOutput, bytes); } if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { assert target.node > 0; //System.out.println(" write target"); - writer.writeInt(target.node); + bytes.writeInt(target.node); } // just write the arcs "like normal" on first pass, // but record how many bytes each one took, and max // byte size: if (doFixedArray) { - bytesPerArc[arcIdx] = writer.getPosition() - lastArcStart; - lastArcStart = writer.getPosition(); + bytesPerArc[arcIdx] = bytes.getPosition() - lastArcStart; + lastArcStart = bytes.getPosition(); maxBytesPerArc = Math.max(maxBytesPerArc, bytesPerArc[arcIdx]); //System.out.println(" bytes=" + bytesPerArc[arcIdx]); } @@ -676,48 +679,38 @@ // such cases if (doFixedArray) { - //System.out.println(" doFixedArray"); assert maxBytesPerArc > 0; // 2nd pass just "expands" all arcs to take up a fixed // byte size final int sizeNeeded = fixedArrayStart + nodeIn.numArcs * maxBytesPerArc; assert ((long) fixedArrayStart) + ((long) nodeIn.numArcs) * maxBytesPerArc < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; - bytes = ArrayUtil.grow(bytes, sizeNeeded); + //System.out.println("write int @pos=" + (fixedArrayStart-4) + " numArcs=" + nodeIn.numArcs); // TODO: we could make this a vInt instead - bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24); - bytes[fixedArrayStart-3] = (byte) (maxBytesPerArc >> 16); - bytes[fixedArrayStart-2] = (byte) (maxBytesPerArc >> 8); - bytes[fixedArrayStart-1] = (byte) maxBytesPerArc; + bytes.writeInt(fixedArrayStart-4, maxBytesPerArc); // expand the arcs in place, backwards - int srcPos = writer.getPosition(); + int srcPos = bytes.getPosition(); int destPos = fixedArrayStart + nodeIn.numArcs*maxBytesPerArc; - writer.setPosition(destPos); - for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) { - //System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos); - destPos -= maxBytesPerArc; - srcPos -= bytesPerArc[arcIdx]; - if (srcPos != destPos) { - assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " bytesPerArc[arcIdx]=" + bytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs; - System.arraycopy(bytes, srcPos, bytes, destPos, bytesPerArc[arcIdx]); + assert destPos >= srcPos; + if (destPos > srcPos) { + bytes.skip(destPos - srcPos); + for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) { + destPos -= maxBytesPerArc; + srcPos -= bytesPerArc[arcIdx]; + //System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos); + if (srcPos != destPos) { + //System.out.println(" copy len=" + bytesPerArc[arcIdx]); + assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " bytesPerArc[arcIdx]=" + bytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs; + bytes.copyBytes(srcPos, destPos, bytesPerArc[arcIdx]); + } } } } - // reverse bytes in-place; we do this so that the - // "BIT_TARGET_NEXT" opto can work, ie, it reads the - // node just before the current one - final int endAddress = writer.getPosition() - 1; + final int thisNodeAddress = bytes.getPosition()-1; - int left = startAddress; - int right = endAddress; - while (left < right) { - final byte b = bytes[left]; - bytes[left++] = bytes[right]; - bytes[right--] = b; - } - //System.out.println(" endAddress=" + endAddress); + bytes.reverse(startAddress, thisNodeAddress); nodeCount++; final int node; @@ -727,14 +720,15 @@ nodeAddress = nodeAddress.resize(ArrayUtil.oversize(nodeAddress.size() + 1, nodeAddress.getBitsPerValue())); inCounts = inCounts.resize(ArrayUtil.oversize(inCounts.size() + 1, inCounts.getBitsPerValue())); } - nodeAddress.set(nodeCount, endAddress); + nodeAddress.set(nodeCount, thisNodeAddress); // System.out.println(" write nodeAddress[" + nodeCount + "] = " + endAddress); node = nodeCount; } else { - node = endAddress; + node = thisNodeAddress; } lastFrozenNode = node; + //System.out.println(" ret node=" + node + " address=" + thisNodeAddress + " nodeAddress=" + nodeAddress); return node; } @@ -763,7 +757,7 @@ * * @return Returns the second argument * (arc). */ - public Arc readLastTargetArc(Arc follow, Arc arc, FST.BytesReader in) throws IOException { + public Arc readLastTargetArc(Arc follow, Arc arc, FSTBytesReader in) throws IOException { //System.out.println("readLast"); if (!targetHasArcs(follow)) { //System.out.println(" end node"); @@ -774,7 +768,7 @@ arc.flags = BIT_LAST_ARC; return arc; } else { - in.pos = getNodeAddress(follow.target); + in.setPosition(getNodeAddress(follow.target)); arc.node = follow.target; final byte b = in.readByte(); if (b == ARCS_AS_FIXED_ARRAY) { @@ -786,7 +780,7 @@ arc.bytesPerArc = in.readInt(); } //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); - arc.posArcsStart = in.pos; + arc.posArcsStart = in.getPosition(); arc.arcIdx = arc.numArcs - 2; } else { arc.flags = b; @@ -808,14 +802,14 @@ if (packed) { in.readVInt(); } else { - in.skip(4); + in.skipBytes(4); } } arc.flags = in.readByte(); } - // Undo the byte flags we read: - in.skip(-1); - arc.nextArc = in.pos; + // Undo the byte flags we read: + in.skipBytes(-1); + arc.nextArc = in.getPosition(); } readNextRealArc(arc, in); assert arc.isLast(); @@ -830,7 +824,7 @@ * * @return Returns the second argument (arc). */ - public Arc readFirstTargetArc(Arc follow, Arc arc, BytesReader in) throws IOException { + public Arc readFirstTargetArc(Arc follow, Arc arc, FSTBytesReader in) throws IOException { //int pos = address; //System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + follow.isFinal()); if (follow.isFinal()) { @@ -853,10 +847,9 @@ } } - public Arc readFirstRealTargetArc(int node, Arc arc, final BytesReader in) throws IOException { - assert in.bytes == bytes; + public Arc readFirstRealTargetArc(int node, Arc arc, final FSTBytesReader in) throws IOException { final int address = getNodeAddress(node); - in.pos = address; + in.setPosition(address); //System.out.println(" readFirstRealTargtArc address=" //+ address); //System.out.println(" flags=" + arc.flags); @@ -872,7 +865,7 @@ arc.bytesPerArc = in.readInt(); } arc.arcIdx = -1; - arc.nextArc = arc.posArcsStart = in.pos; + arc.nextArc = arc.posArcsStart = in.getPosition(); //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos); } else { //arc.flags = b; @@ -889,17 +882,17 @@ * @return Returns true if arc points to a state in an * expanded array format. */ - boolean isExpandedTarget(Arc follow, FST.BytesReader in) throws IOException { + boolean isExpandedTarget(Arc follow, FSTBytesReader in) throws IOException { if (!targetHasArcs(follow)) { return false; } else { - in.pos = getNodeAddress(follow.target); + in.setPosition(getNodeAddress(follow.target)); return in.readByte() == ARCS_AS_FIXED_ARRAY; } } /** In-place read; returns the arc. */ - public Arc readNextArc(Arc arc, BytesReader in) throws IOException { + public Arc readNextArc(Arc arc, FSTBytesReader in) throws IOException { if (arc.label == END_LABEL) { // This was a fake inserted "final" arc if (arc.nextArc <= 0) { @@ -913,12 +906,16 @@ /** Peeks at next arc's label; does not alter arc. Do * not call this if arc.isLast()! */ - public int readNextArcLabel(Arc arc, BytesReader in) throws IOException { + public int readNextArcLabel(Arc arc, FSTBytesReader in) throws IOException { assert !arc.isLast(); if (arc.label == END_LABEL) { - //System.out.println(" nextArc fake " + arc.nextArc); - int pos = in.pos = getNodeAddress(arc.nextArc); + //System.out.println(" nextArc fake " + + //arc.nextArc); + + int pos = getNodeAddress(arc.nextArc); + in.setPosition(pos); + final byte b = in.readByte(); if (b == ARCS_AS_FIXED_ARRAY) { //System.out.println(" nextArc fake array"); @@ -929,18 +926,18 @@ in.readInt(); } } else { - in.pos = pos; + in.setPosition(pos); } } else { if (arc.bytesPerArc != 0) { //System.out.println(" nextArc real array"); // arcs are at fixed entries - in.pos = arc.posArcsStart; - in.skip((1+arc.arcIdx)*arc.bytesPerArc); + in.setPosition(arc.posArcsStart); + in.skipBytes((1+arc.arcIdx)*arc.bytesPerArc); } else { // arcs are packed //System.out.println(" nextArc real packed"); - in.pos = arc.nextArc; + in.setPosition(arc.nextArc); } } // skip flags @@ -950,8 +947,7 @@ /** Never returns null, but you should never call this if * arc.isLast() is true. */ - public Arc readNextRealArc(Arc arc, final BytesReader in) throws IOException { - assert in.bytes == bytes; + public Arc readNextRealArc(Arc arc, final FSTBytesReader in) throws IOException { // TODO: can't assert this because we call from readFirstArc // assert !flag(arc.flags, BIT_LAST_ARC); @@ -961,10 +957,11 @@ // arcs are at fixed entries arc.arcIdx++; assert arc.arcIdx < arc.numArcs; - in.skip(arc.posArcsStart, arc.arcIdx*arc.bytesPerArc); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.arcIdx*arc.bytesPerArc); } else { // arcs are packed - in.pos = arc.nextArc; + in.setPosition(arc.nextArc); } arc.flags = in.readByte(); arc.label = readLabel(in); @@ -987,9 +984,9 @@ } else { arc.target = NON_FINAL_END_NODE; } - arc.nextArc = in.pos; + arc.nextArc = in.getPosition(); } else if (arc.flag(BIT_TARGET_NEXT)) { - arc.nextArc = in.pos; + arc.nextArc = in.getPosition(); // TODO: would be nice to make this lazy -- maybe // caller doesn't need the target and is scanning arcs... if (nodeAddress == null) { @@ -998,17 +995,18 @@ // must scan seekToNextNode(in); } else { - in.skip(arc.posArcsStart, arc.bytesPerArc * arc.numArcs); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc * arc.numArcs); } } - arc.target = in.pos; + arc.target = in.getPosition(); } else { arc.target = arc.node - 1; assert arc.target > 0; } } else { if (packed) { - final int pos = in.pos; + final int pos = in.getPosition(); final int code = in.readVInt(); if (arc.flag(BIT_TARGET_DELTA)) { // Address is delta-coded from current address: @@ -1021,21 +1019,20 @@ } else { // Absolute arc.target = code; - //System.out.println(" abs code=" + code + " derefLen=" + nodeRefToAddress.length); + //System.out.println(" abs code=" + code); } } else { arc.target = in.readInt(); } - arc.nextArc = in.pos; + arc.nextArc = in.getPosition(); } return arc; } /** Finds an arc leaving the incoming arc, replacing the arc in place. * This returns null if the arc was not found, else the incoming arc. */ - public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc, BytesReader in) throws IOException { + public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc, FSTBytesReader in) throws IOException { assert cachedRootArcs != null; - assert in.bytes == bytes; if (labelToMatch == END_LABEL) { if (follow.isFinal()) { @@ -1070,7 +1067,7 @@ return null; } - in.pos = getNodeAddress(follow.target); + in.setPosition(getNodeAddress(follow.target)); arc.node = follow.target; @@ -1084,13 +1081,14 @@ } else { arc.bytesPerArc = in.readInt(); } - arc.posArcsStart = in.pos; + arc.posArcsStart = in.getPosition(); int low = 0; int high = arc.numArcs-1; while (low <= high) { //System.out.println(" cycle"); int mid = (low + high) >>> 1; - in.skip(arc.posArcsStart, arc.bytesPerArc*mid + 1); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc*mid + 1); int midLabel = readLabel(in); final int cmp = midLabel - labelToMatch; if (cmp < 0) { @@ -1128,7 +1126,7 @@ } } - private void seekToNextNode(BytesReader in) throws IOException { + private void seekToNextNode(FSTBytesReader in) throws IOException { while(true) { @@ -1196,142 +1194,28 @@ public abstract int getPosition(); } - // Non-static: writes to FST's byte[] - class DefaultBytesWriter extends BytesWriter { - int posWrite; - - public DefaultBytesWriter() { - // pad: ensure no node gets address 0 which is reserved to mean - // the stop state w/ no arcs - posWrite = 1; - } - - @Override - public void writeByte(byte b) { - assert posWrite <= bytes.length; - if (bytes.length == posWrite) { - assert bytes.length < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; - bytes = ArrayUtil.grow(bytes); - } - assert posWrite < bytes.length: "posWrite=" + posWrite + " bytes.length=" + bytes.length; - bytes[posWrite++] = b; - } - - @Override - public int getPosition() { - return posWrite; - } - - @Override - public void setPosition(int posWrite) { - this.posWrite = posWrite; - if (bytes.length < posWrite) { - assert bytes.length < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; - bytes = ArrayUtil.grow(bytes, posWrite); - } - } - - @Override - public void writeBytes(byte[] b, int offset, int length) { - final int size = posWrite + length; - assert bytes.length < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; - bytes = ArrayUtil.grow(bytes, size); - System.arraycopy(b, offset, bytes, posWrite, length); - posWrite += length; - } - } - - /** Returns a {@link BytesReader} for this FST, positioned at + /** Returns a {@link FSTBytesReader} for this FST, positioned at * position 0. */ - public BytesReader getBytesReader() { + public FSTBytesReader getBytesReader() { return getBytesReader(0); } - /** Returns a {@link BytesReader} for this FST, positioned at + /** Returns a {@link FSTBytesReader} for this FST, positioned at * the provided position. */ - public BytesReader getBytesReader(int pos) { + public FSTBytesReader getBytesReader(int pos) { // TODO: maybe re-use via ThreadLocal? + FSTBytesReader in; if (packed) { - return new ForwardBytesReader(bytes, pos); + in = bytes.getForwardReader(); } else { - return new ReverseBytesReader(bytes, pos); + in = bytes.getReverseReader(); } - } - - /** Reads the bytes from this FST. Use {@link - * #getBytesReader(int)} to obtain an instance for this - * FST; re-use across calls (but only within a single - * thread) for better performance. */ - public static abstract class BytesReader extends DataInput { - protected int pos; - protected final byte[] bytes; - protected BytesReader(byte[] bytes, int pos) { - this.bytes = bytes; - this.pos = pos; + if (pos != 0) { + in.setPosition(pos); } - abstract void skip(int byteCount); - abstract void skip(int base, int byteCount); + return in; } - final static class ReverseBytesReader extends BytesReader { - - public ReverseBytesReader(byte[] bytes, int pos) { - super(bytes, pos); - } - - @Override - public byte readByte() { - return bytes[pos--]; - } - - @Override - public void readBytes(byte[] b, int offset, int len) { - for(int i=0;i { final Arc arc; final IntsRef chain; @@ -1451,14 +1335,13 @@ */ // Creates a packed FST - private FST(INPUT_TYPE inputType, PackedInts.Reader nodeRefToAddress, Outputs outputs) { + private FST(INPUT_TYPE inputType, Outputs outputs) { packed = true; this.inputType = inputType; - bytes = new byte[128]; - this.nodeRefToAddress = nodeRefToAddress; + // 32 KB blocks: + bytes = new BytesStore(15); this.outputs = outputs; NO_OUTPUT = outputs.getNoOutput(); - writer = new DefaultBytesWriter(); // NOTE: bogus because this is only used during // building; we need to break out mutable FST from @@ -1495,9 +1378,12 @@ throw new IllegalArgumentException("this FST was not built with willPackFST=true"); } + final RAMOutputStream buffer = new RAMOutputStream(); + byte[] bufferBytes = new byte[64]; + Arc arc = new Arc(); - final BytesReader r = getBytesReader(0); + final FSTBytesReader r = getBytesReader(0); final int topN = Math.min(maxDerefNodes, inCounts.size()); @@ -1529,17 +1415,13 @@ //System.out.println("map node=" + n.node + " inCount=" + n.count + " to newID=" + downTo); } - final FST fst = new FST(inputType, null, outputs); - - final BytesWriter writer = fst.writer; - // +1 because node ords start at 1 (0 is reserved as stop node): final GrowableWriter newNodeAddress = new GrowableWriter( - PackedInts.bitsRequired(bytes.length), 1 + nodeCount, acceptableOverheadRatio); + PackedInts.bitsRequired(this.bytes.getPosition()), 1 + nodeCount, acceptableOverheadRatio); // Fill initial coarse guess: for(int node=1;node<=nodeCount;node++) { - newNodeAddress.set(node, 1 + bytes.length - nodeAddress.get(node)); + newNodeAddress.set(node, 1 + this.bytes.getPosition() - nodeAddress.get(node)); } int absCount; @@ -1547,6 +1429,8 @@ int topCount; int nextCount; + FST fst; + // Iterate until we converge: while(true) { @@ -1556,7 +1440,10 @@ // for assert: boolean negDelta = false; - writer.setPosition(0); + fst = new FST(inputType, outputs); + + final BytesStore writer = fst.bytes; + // Skip 0 byte since 0 is reserved target: writer.writeByte((byte) 0); @@ -1578,6 +1465,7 @@ for(int node=nodeCount;node>=1;node--) { fst.nodeCount++; final int address = writer.getPosition(); + //System.out.println(" node: " + node + " address=" + address); if (address != newNodeAddress.get(node)) { addressError = address - (int) newNodeAddress.get(node); @@ -1599,7 +1487,8 @@ // this is an array'd node and bytesPerArc changes: writeNode: while(true) { // retry writing this node - + assert buffer.getFilePointer() == 0; + //System.out.println(" cycle: retry"); readFirstRealTargetArc(node, arc, r); final boolean useArcArray = arc.bytesPerArc != 0; @@ -1608,18 +1497,18 @@ if (bytesPerArc == 0) { bytesPerArc = arc.bytesPerArc; } - writer.writeByte(ARCS_AS_FIXED_ARRAY); - writer.writeVInt(arc.numArcs); - writer.writeVInt(bytesPerArc); + buffer.writeByte(ARCS_AS_FIXED_ARRAY); + buffer.writeVInt(arc.numArcs); + buffer.writeVInt(bytesPerArc); //System.out.println("node " + node + ": " + arc.numArcs + " arcs"); } int maxBytesPerArc = 0; //int wasted = 0; while(true) { // iterate over all arcs for this node + //System.out.println(" cycle next arc"); - //System.out.println(" arc label=" + arc.label + " target=" + arc.target + " pos=" + writer.posWrite); - final int arcStartPos = writer.getPosition(); + final int arcStartPos = (int) buffer.getFilePointer(); nodeArcCount++; byte flags = 0; @@ -1666,7 +1555,7 @@ absPtr = topNodeMap.size() + (int) newNodeAddress.get(arc.target) + addressError; } - int delta = (int) newNodeAddress.get(arc.target) + addressError - writer.getPosition() - 2; + int delta = (int) (newNodeAddress.get(arc.target) + addressError - buffer.getFilePointer() - address - 2); if (delta < 0) { //System.out.println("neg: " + delta); anyNegDelta = true; @@ -1681,22 +1570,22 @@ absPtr = 0; } - writer.writeByte(flags); - fst.writeLabel(arc.label); + buffer.writeByte(flags); + fst.writeLabel(buffer, arc.label); if (arc.output != NO_OUTPUT) { - outputs.write(arc.output, writer); + outputs.write(arc.output, buffer); if (!retry) { fst.arcWithOutputCount++; } } if (arc.nextFinalOutput != NO_OUTPUT) { - outputs.writeFinalOutput(arc.nextFinalOutput, writer); + outputs.writeFinalOutput(arc.nextFinalOutput, buffer); } if (doWriteTarget) { - int delta = (int) newNodeAddress.get(arc.target) + addressError - writer.getPosition(); + int delta = (int) (newNodeAddress.get(arc.target) + addressError - buffer.getFilePointer() - address); if (delta < 0) { anyNegDelta = true; //System.out.println("neg: " + delta); @@ -1705,7 +1594,7 @@ if (flag(flags, BIT_TARGET_DELTA)) { //System.out.println(" delta"); - writer.writeVInt(delta); + buffer.writeVInt(delta); if (!retry) { deltaCount++; } @@ -1717,7 +1606,7 @@ System.out.println(" abs"); } */ - writer.writeVInt(absPtr); + buffer.writeVInt(absPtr); if (!retry) { if (absPtr >= topNodeMap.size()) { absCount++; @@ -1729,7 +1618,7 @@ } if (useArcArray) { - final int arcBytes = writer.getPosition() - arcStartPos; + final int arcBytes = (int) (buffer.getFilePointer() - arcStartPos); //System.out.println(" " + arcBytes + " bytes"); maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes); // NOTE: this may in fact go "backwards", if @@ -1739,7 +1628,11 @@ // will retry (below) so it's OK to ovewrite // bytes: //wasted += bytesPerArc - arcBytes; - writer.setPosition(arcStartPos + bytesPerArc); + int skip = (int) (arcStartPos + bytesPerArc - buffer.getFilePointer()); + while(skip > 0) { + buffer.writeByte((byte) 0); + skip--; + } } if (arc.isLast()) { @@ -1764,11 +1657,19 @@ // Retry: bytesPerArc = maxBytesPerArc; - writer.setPosition(address); + buffer.reset(); nodeArcCount = 0; retry = true; anyNegDelta = false; } + + if (bufferBytes.length < (int) buffer.getFilePointer()) { + bufferBytes = ArrayUtil.grow(bufferBytes, (int) buffer.getFilePointer()); + } + buffer.writeTo(bufferBytes, 0); + writer.writeBytes(bufferBytes, 0, (int) buffer.getFilePointer()); + buffer.reset(); + negDelta |= anyNegDelta; fst.arcCount += nodeArcCount; @@ -1799,7 +1700,6 @@ } fst.nodeRefToAddress = nodeRefToAddressIn; - fst.startNode = (int) newNodeAddress.get(startNode); //System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode); @@ -1810,11 +1710,8 @@ assert fst.nodeCount == nodeCount: "fst.nodeCount=" + fst.nodeCount + " nodeCount=" + nodeCount; assert fst.arcCount == arcCount; assert fst.arcWithOutputCount == arcWithOutputCount: "fst.arcWithOutputCount=" + fst.arcWithOutputCount + " arcWithOutputCount=" + arcWithOutputCount; - - final byte[] finalBytes = new byte[writer.getPosition()]; - //System.out.println("resize " + fst.bytes.length + " down to " + writer.posWrite); - System.arraycopy(fst.bytes, 0, finalBytes, 0, writer.getPosition()); - fst.bytes = finalBytes; + + fst.bytes.finish(); fst.cacheRootArcs(); //final int size = fst.sizeInBytes(); Index: lucene/core/src/java/org/apache/lucene/util/fst/ReverseBytesReader.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/ReverseBytesReader.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/util/fst/ReverseBytesReader.java (working copy) @@ -0,0 +1,61 @@ +package org.apache.lucene.util.fst; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Reads in reverse from a single byte[]. */ +final class ReverseBytesReader extends FSTBytesReader { + private final byte[] bytes; + private int pos; + + public ReverseBytesReader(byte[] bytes) { + this.bytes = bytes; + } + + @Override + public byte readByte() { + return bytes[pos--]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + for(int i=0;i T get(FST fst, IntsRef input) throws IOException { + final FSTBytesReader fstReader = fst.getBytesReader(0); + // TODO: would be nice not to alloc this on every lookup final FST.Arc arc = fst.getFirstArc(new FST.Arc()); - final FST.BytesReader fstReader = fst.getBytesReader(0); - // Accumulate output as we go T output = fst.outputs.getNoOutput(); for(int i=0;i T get(FST fst, BytesRef input) throws IOException { assert fst.inputType == FST.INPUT_TYPE.BYTE1; - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); // TODO: would be nice not to alloc this on every lookup final FST.Arc arc = fst.getFirstArc(new FST.Arc()); @@ -101,7 +100,7 @@ * fit this. */ public static IntsRef getByOutput(FST fst, long targetOutput) throws IOException { - final FST.BytesReader in = fst.getBytesReader(0); + final FSTBytesReader in = fst.getBytesReader(0); // TODO: would be nice not to alloc this on every lookup FST.Arc arc = fst.getFirstArc(new FST.Arc()); @@ -147,8 +146,8 @@ boolean exact = false; while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart; - in.skip(arc.bytesPerArc*mid); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc*mid); final byte flags = in.readByte(); fst.readLabel(in); final long minArcOutput; @@ -273,7 +272,7 @@ public static class TopNSearcher { private final FST fst; - private final FST.BytesReader bytesReader; + private final FSTBytesReader bytesReader; private final int topN; private final int maxQueueDepth; @@ -374,7 +373,7 @@ //System.out.println("search topN=" + topN); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); final T NO_OUTPUT = fst.outputs.getNoOutput(); // TODO: we could enable FST to sorting arcs by weight @@ -595,7 +594,7 @@ emitDotState(out, "initial", "point", "white", ""); final T NO_OUTPUT = fst.outputs.getNoOutput(); - final FST.BytesReader r = fst.getBytesReader(0); + final FSTBytesReader r = fst.getBytesReader(0); // final FST.Arc scratchArc = new FST.Arc(); @@ -855,10 +854,10 @@ * @param fst the fst to operate on * @param follow the arc to follow reading the label from * @param arc the arc to read into in place - * @param in the fst's {@link BytesReader} + * @param in the fst's {@link FSTBytesReader} */ public static Arc readCeilArc(int label, FST fst, Arc follow, - Arc arc, BytesReader in) throws IOException { + Arc arc, FSTBytesReader in) throws IOException { // TODO maybe this is a useful in the FST class - we could simplify some other code like FSTEnum? if (label == FST.END_LABEL) { if (follow.isFinal()) { @@ -893,8 +892,8 @@ // " targetLabel=" + targetLabel); while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart; - in.skip(arc.bytesPerArc * mid + 1); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc * mid + 1); final int midLabel = fst.readLabel(in); final int cmp = midLabel - label; // System.out.println(" cycle low=" + low + " high=" + high + " mid=" + Index: lucene/core/src/java/org/apache/lucene/util/fst/FSTBytesReader.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/FSTBytesReader.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/util/fst/FSTBytesReader.java (working copy) @@ -0,0 +1,36 @@ +package org.apache.lucene.util.fst; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.DataInput; + +/** Reads bytes stored in an FST. */ +public abstract class FSTBytesReader extends DataInput { + /** Get current read position. */ + public abstract int getPosition(); + + /** Set current read position. */ + public abstract void setPosition(int pos); + + /** Returns true if this reader uses reversed bytes + * under-the-hood. */ + public abstract boolean reversed(); + + /** Skips bytes. */ + public abstract void skipBytes(int count); +} Property changes on: lucene/core/src/java/org/apache/lucene/util/fst/FSTBytesReader.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (revision 1432289) +++ lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (working copy) @@ -34,7 +34,7 @@ this.fst = fst; } - private boolean nodesEqual(Builder.UnCompiledNode node, int address, FST.BytesReader in) throws IOException { + private boolean nodesEqual(Builder.UnCompiledNode node, int address, FSTBytesReader in) throws IOException { fst.readFirstRealTargetArc(address, scratchArc, in); if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) { return false; @@ -87,12 +87,12 @@ // hash code for a frozen node private int hash(int node) throws IOException { final int PRIME = 31; - final FST.BytesReader in = fst.getBytesReader(0); + final FSTBytesReader in = fst.getBytesReader(0); //System.out.println("hash frozen node=" + node); int h = 0; fst.readFirstRealTargetArc(node, scratchArc, in); while(true) { - //System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal()); + //System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal() + " pos=" + in.getPosition()); h = PRIME * h + scratchArc.label; h = PRIME * h + scratchArc.target; h = PRIME * h + scratchArc.output.hashCode(); @@ -111,7 +111,7 @@ public int add(Builder.UnCompiledNode nodeIn) throws IOException { // System.out.println("hash: add count=" + count + " vs " + table.length); - final FST.BytesReader in = fst.getBytesReader(0); + final FSTBytesReader in = fst.getBytesReader(0); final int h = hash(nodeIn); int pos = h & mask; int c = 0; @@ -120,7 +120,6 @@ if (v == 0) { // freeze & add final int node = fst.addNode(nodeIn); - //System.out.println(" now freeze node=" + node); assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h; count++; table[pos] = node;