Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java (revision 1431560) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java (working copy) @@ -27,6 +27,7 @@ import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.CharSequenceOutputs; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; import org.apache.lucene.util.fst.Outputs; import org.apache.lucene.util.fst.Util; @@ -49,7 +50,7 @@ try { // Pre-cache root arcs: final FST.Arc scratchArc = new FST.Arc(); - final FST.BytesReader fstReader = map.getBytesReader(0); + final FSTBytesReader fstReader = map.getBytesReader(0); map.getFirstArc(scratchArc); if (FST.targetHasArcs(scratchArc)) { map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader); Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java (revision 1431560) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java (working copy) @@ -26,6 +26,7 @@ import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.fst.CharSequenceOutputs; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; import org.apache.lucene.util.fst.Outputs; /** @@ -41,7 +42,7 @@ private final Outputs outputs = CharSequenceOutputs.getSingleton(); private final FST map; - private final FST.BytesReader fstReader; + private final FSTBytesReader fstReader; private final RollingCharBuffer buffer = new RollingCharBuffer(); private final FST.Arc scratchArc = new FST.Arc(); private final Map> 
cachedRootArcs; Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java (revision 1431560) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java (working copy) @@ -34,6 +34,7 @@ import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; /** * Matches single or multi word synonyms in a token stream. @@ -245,7 +246,7 @@ private final FST fst; - private final FST.BytesReader fstReader; + private final FSTBytesReader fstReader; private final BytesRef scratchBytes = new BytesRef(); Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java (revision 1431560) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java (working copy) @@ -31,6 +31,7 @@ import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; import org.apache.lucene.util.fst.PositiveIntOutputs; /** @@ -139,7 +140,7 @@ TreeMap result = new TreeMap(); // index, [length, length...] 
boolean found = false; // true if we found any results - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); FST.Arc arc = new FST.Arc(); int end = off + len; Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java (revision 1431560) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java (working copy) @@ -19,8 +19,9 @@ import java.io.IOException; +import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.FST.Arc; +import org.apache.lucene.util.fst.FSTBytesReader; /** * Thin wrapper around an FST with root-arc caching for Japanese. @@ -48,13 +49,13 @@ rootCache = cacheRootArcs(); } - @SuppressWarnings("unchecked") + @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc[] cacheRootArcs() throws IOException { FST.Arc rootCache[] = new FST.Arc[1+(cacheCeiling-0x3040)]; FST.Arc firstArc = new FST.Arc(); fst.getFirstArc(firstArc); FST.Arc arc = new FST.Arc(); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); // TODO: jump to 3040, readNextRealArc to ceiling? 
(just be careful we don't add bugs) for (int i = 0; i < rootCache.length; i++) { if (fst.findTargetArc(0x3040 + i, firstArc, arc, fstReader) != null) { @@ -64,7 +65,7 @@ return rootCache; } - public FST.Arc findTargetArc(int ch, FST.Arc follow, FST.Arc arc, boolean useCache, FST.BytesReader fstReader) throws IOException { + public FST.Arc findTargetArc(int ch, FST.Arc follow, FST.Arc arc, boolean useCache, FSTBytesReader fstReader) throws IOException { if (useCache && ch >= 0x3040 && ch <= cacheCeiling) { assert ch != FST.END_LABEL; final Arc result = rootCache[ch - 0x3040]; @@ -83,7 +84,7 @@ return fst.getFirstArc(arc); } - public FST.BytesReader getBytesReader(int pos) { + public FSTBytesReader getBytesReader(int pos) { return fst.getBytesReader(pos); } Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java (revision 1431560) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java (working copy) @@ -43,6 +43,7 @@ import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; // TODO: somehow factor out a reusable viterbi search here, // so other decompounders/tokenizers can reuse... 
@@ -142,10 +143,10 @@ private final CharacterDefinition characterDefinition; private final FST.Arc arc = new FST.Arc(); - private final FST.BytesReader fstReader; + private final FSTBytesReader fstReader; private final IntsRef wordIdRef = new IntsRef(); - private final FST.BytesReader userFSTReader; + private final FSTBytesReader userFSTReader; private final TokenInfoFST userFST; private final RollingCharBuffer buffer = new RollingCharBuffer(); Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java (revision 1431560) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java (working copy) @@ -21,8 +21,9 @@ import java.util.*; import org.apache.lucene.util.*; +import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.FST.Arc; +import org.apache.lucene.util.fst.FSTBytesReader; /** * Finite state automata based implementation of "autocomplete" functionality. @@ -139,7 +140,7 @@ try { List> rootArcs = new ArrayList>(); Arc arc = automaton.getFirstArc(new Arc()); - FST.BytesReader fstReader = automaton.getBytesReader(0); + FSTBytesReader fstReader = automaton.getBytesReader(0); automaton.readFirstTargetArc(arc, arc, fstReader); while (true) { rootArcs.add(new Arc().copyFrom(arc)); @@ -173,7 +174,7 @@ // Get the UTF-8 bytes representation of the input key. 
try { final FST.Arc scratch = new FST.Arc(); - FST.BytesReader fstReader = automaton.getBytesReader(0); + FSTBytesReader fstReader = automaton.getBytesReader(0); for (; rootArcIndex < rootArcs.length; rootArcIndex++) { final FST.Arc rootArc = rootArcs[rootArcIndex]; final FST.Arc arc = scratch.copyFrom(rootArc); @@ -338,7 +339,7 @@ final int max = utf8.offset + utf8.length; // Cannot save as instance var since multiple threads // can use FSTCompletion at once... - final FST.BytesReader fstReader = automaton.getBytesReader(0); + final FSTBytesReader fstReader = automaton.getBytesReader(0); for (int i = utf8.offset; i < max; i++) { if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc, fstReader) == null) { // No matching prefixes, return an empty result. @@ -362,7 +363,7 @@ } assert output.offset == 0; output.bytes[output.length++] = (byte) arc.label; - FST.BytesReader fstReader = automaton.getBytesReader(0); + FSTBytesReader fstReader = automaton.getBytesReader(0); automaton.readFirstTargetArc(arc, arc, fstReader); while (true) { if (arc.label == FST.END_LABEL) { Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java (revision 1431560) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java (working copy) @@ -40,12 +40,12 @@ import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.FST.Arc; -import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.FSTBytesReader; import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util.MinResult; import org.apache.lucene.util.fst.Util; -import 
org.apache.lucene.util.fst.Util.MinResult; /** * Suggester based on a weighted FST: it first traverses the prefix, @@ -200,7 +200,7 @@ private Long lookupPrefix(BytesRef scratch, Arc arc) throws /*Bogus*/IOException { assert 0 == fst.outputs.getNoOutput().longValue(); long output = 0; - BytesReader bytesReader = fst.getBytesReader(0); + FSTBytesReader bytesReader = fst.getBytesReader(0); fst.getFirstArc(arc); Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 1431560) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (working copy) @@ -54,8 +54,8 @@ import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.ByteSequenceOutputs; -import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; import org.apache.lucene.util.fst.PairOutputs.Pair; import org.apache.lucene.util.fst.PairOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs; @@ -587,7 +587,7 @@ //System.out.println(" prefixPaths: " + prefixPaths.size()); - BytesReader bytesReader = fst.getBytesReader(0); + FSTBytesReader bytesReader = fst.getBytesReader(0); FST.Arc> scratchArc = new FST.Arc>(); Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java (revision 1431560) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java (working copy) @@ -17,15 +17,16 @@ * limitations under the License. 
*/ +import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.io.IOException; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; import org.apache.lucene.util.fst.Util; // TODO: move to core? nobody else uses it yet though... @@ -77,7 +78,7 @@ new IntsRef())); final FST.Arc scratchArc = new FST.Arc(); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); while (queue.size() != 0) { final Path path = queue.remove(queue.size() - 1); Index: lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java =================================================================== --- lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (revision 1431560) +++ lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (working copy) @@ -60,7 +60,6 @@ import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput; import org.apache.lucene.util.fst.FST.Arc; -import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.PairOutputs.Pair; import org.apache.lucene.util.packed.PackedInts; @@ -1022,7 +1021,7 @@ throws IOException { if (FST.targetHasArcs(arc)) { int childCount = 0; - FST.BytesReader fstReader = fst.getBytesReader(0); + FSTBytesReader fstReader = fst.getBytesReader(0); for (arc = fst.readFirstTargetArc(arc, arc, fstReader);; arc = fst.readNextArc(arc, fstReader), childCount++) { @@ -1292,7 +1291,7 @@ //Util.toDot(fst, w, false, false); //w.close(); - BytesReader reader = fst.getBytesReader(0); + FSTBytesReader reader = fst.getBytesReader(0); //System.out.println("testing: " + allPrefixes.size() + " prefixes"); for (String prefix : allPrefixes) { @@ -1413,7 +1412,7 @@ 
//Util.toDot(fst, w, false, false); //w.close(); - BytesReader reader = fst.getBytesReader(0); + FSTBytesReader reader = fst.getBytesReader(0); //System.out.println("testing: " + allPrefixes.size() + " prefixes"); for (String prefix : allPrefixes) { Index: lucene/core/src/java/org/apache/lucene/util/fst/Util.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/Util.java (revision 1431560) +++ lucene/core/src/java/org/apache/lucene/util/fst/Util.java (working copy) @@ -23,7 +23,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.FST.Arc; -import org.apache.lucene.util.fst.FST.BytesReader; /** Static helper methods. * @@ -39,7 +38,7 @@ // TODO: would be nice not to alloc this on every lookup final FST.Arc arc = fst.getFirstArc(new FST.Arc()); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); // Accumulate output as we go T output = fst.outputs.getNoOutput(); @@ -64,7 +63,7 @@ public static T get(FST fst, BytesRef input) throws IOException { assert fst.inputType == FST.INPUT_TYPE.BYTE1; - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); // TODO: would be nice not to alloc this on every lookup final FST.Arc arc = fst.getFirstArc(new FST.Arc()); @@ -101,7 +100,7 @@ * fit this. 
*/ public static IntsRef getByOutput(FST fst, long targetOutput) throws IOException { - final FST.BytesReader in = fst.getBytesReader(0); + final FSTBytesReader in = fst.getBytesReader(0); // TODO: would be nice not to alloc this on every lookup FST.Arc arc = fst.getFirstArc(new FST.Arc()); @@ -147,8 +146,8 @@ boolean exact = false; while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart; - in.skip(arc.bytesPerArc*mid); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc*mid); final byte flags = in.readByte(); fst.readLabel(in); final long minArcOutput; @@ -273,7 +272,7 @@ public static class TopNSearcher { private final FST fst; - private final FST.BytesReader bytesReader; + private final FSTBytesReader bytesReader; private final int topN; private final int maxQueueDepth; @@ -374,7 +373,7 @@ //System.out.println("search topN=" + topN); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); final T NO_OUTPUT = fst.outputs.getNoOutput(); // TODO: we could enable FST to sorting arcs by weight @@ -595,7 +594,7 @@ emitDotState(out, "initial", "point", "white", ""); final T NO_OUTPUT = fst.outputs.getNoOutput(); - final FST.BytesReader r = fst.getBytesReader(0); + final FSTBytesReader r = fst.getBytesReader(0); // final FST.Arc scratchArc = new FST.Arc(); @@ -858,7 +857,7 @@ * @param in the fst's {@link BytesReader} */ public static Arc readCeilArc(int label, FST fst, Arc follow, - Arc arc, BytesReader in) throws IOException { + Arc arc, FSTBytesReader in) throws IOException { // TODO maybe this is a useful in the FST class - we could simplify some other code like FSTEnum? 
if (label == FST.END_LABEL) { if (follow.isFinal()) { @@ -893,8 +892,8 @@ // " targetLabel=" + targetLabel); while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart; - in.skip(arc.bytesPerArc * mid + 1); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc * mid + 1); final int midLabel = fst.readLabel(in); final int cmp = midLabel - label; // System.out.println(" cycle low=" + low + " high=" + high + " mid=" + Index: lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java (revision 1431560) +++ lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java (working copy) @@ -17,11 +17,11 @@ * limitations under the License. */ +import java.io.IOException; + import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.RamUsageEstimator; -import java.io.IOException; - /** Can next() and advance() through the terms in an FST * * @lucene.experimental @@ -35,7 +35,7 @@ @SuppressWarnings({"rawtypes","unchecked"}) protected T[] output = (T[]) new Object[10]; protected final T NO_OUTPUT; - protected final FST.BytesReader fstReader; + protected final FSTBytesReader fstReader; protected final FST.Arc scratchArc = new FST.Arc(); protected int upto; @@ -145,7 +145,7 @@ // Arcs are fixed array -- use binary search to find // the target. 
- final FST.BytesReader in = fst.getBytesReader(0); + final FSTBytesReader in = fst.getBytesReader(0); int low = arc.arcIdx; int high = arc.numArcs-1; int mid = 0; @@ -153,8 +153,8 @@ boolean found = false; while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart; - in.skip(arc.bytesPerArc*mid+1); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc*mid+1); final int midLabel = fst.readLabel(in); final int cmp = midLabel - targetLabel; //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); @@ -284,7 +284,7 @@ // Arcs are fixed array -- use binary search to find // the target. - final FST.BytesReader in = fst.getBytesReader(0); + final FSTBytesReader in = fst.getBytesReader(0); int low = arc.arcIdx; int high = arc.numArcs-1; int mid = 0; @@ -292,8 +292,8 @@ boolean found = false; while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart; - in.skip(arc.bytesPerArc*mid+1); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc*mid+1); final int midLabel = fst.readLabel(in); final int cmp = midLabel - targetLabel; //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); @@ -434,7 +434,7 @@ FST.Arc arc = getArc(upto-1); int targetLabel = getTargetLabel(); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); while(true) { //System.out.println(" cycle target=" + (targetLabel == -1 ? 
"-1" : (char) targetLabel)); Index: lucene/core/src/java/org/apache/lucene/util/fst/FSTBytesReader.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/FSTBytesReader.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/util/fst/FSTBytesReader.java (working copy) @@ -0,0 +1,36 @@ +package org.apache.lucene.util.fst; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.DataInput; + +/** Reads bytes stored in an FST; adds an explicit, settable read position on top of {@link DataInput}. */ +public abstract class FSTBytesReader extends DataInput { + /** Get current read position; callers save this value and later hand it back to {@link #setPosition}. */ + public abstract int getPosition(); + + /** Set current read position; the next read starts at {@code pos}. */ + public abstract void setPosition(int pos); + + /** Returns true if this reader uses reversed bytes + * under-the-hood. */ + public abstract boolean reversed(); + + /** Skips {@code count} bytes. Callers in FST also pass a negative count ({@code skipBytes(-1)}) to step back over a byte just read. 
*/ + public abstract void skipBytes(int count); +} Property changes on: lucene/core/src/java/org/apache/lucene/util/fst/FSTBytesReader.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (revision 1431560) +++ lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (working copy) @@ -34,7 +34,7 @@ this.fst = fst; } - private boolean nodesEqual(Builder.UnCompiledNode node, int address, FST.BytesReader in) throws IOException { + private boolean nodesEqual(Builder.UnCompiledNode node, int address, FSTBytesReader in) throws IOException { fst.readFirstRealTargetArc(address, scratchArc, in); if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) { return false; @@ -87,7 +87,7 @@ // hash code for a frozen node private int hash(int node) throws IOException { final int PRIME = 31; - final FST.BytesReader in = fst.getBytesReader(0); + final FSTBytesReader in = fst.getBytesReader(0); //System.out.println("hash frozen node=" + node); int h = 0; fst.readFirstRealTargetArc(node, scratchArc, in); @@ -111,7 +111,7 @@ public int add(Builder.UnCompiledNode nodeIn) throws IOException { // System.out.println("hash: add count=" + count + " vs " + table.length); - final FST.BytesReader in = fst.getBytesReader(0); + final FSTBytesReader in = fst.getBytesReader(0); final int h = hash(nodeIn); int pos = h & mask; int c = 0; @@ -120,7 +120,6 @@ if (v == 0) { // freeze & add final int node = fst.addNode(nodeIn); - //System.out.println(" now freeze node=" + node); assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h; count++; table[pos] = node; Index: lucene/core/src/java/org/apache/lucene/util/fst/FST.java 
=================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/FST.java (revision 1431560) +++ lucene/core/src/java/org/apache/lucene/util/fst/FST.java (working copy) @@ -51,9 +51,6 @@ // job, ie, once we are at a 'suffix only', just store the // completion labels as a string not as a series of arcs. -// TODO: maybe make an explicit thread state that holds -// reusable stuff eg BytesReader, a scratch arc - // NOTE: while the FST is able to represent a non-final // dead-end state (NON_FINAL_END_NODE=0), the layers above // (FSTEnum, Util) have problems with this!! @@ -95,6 +92,9 @@ private final static byte ARCS_AS_FIXED_ARRAY = BIT_ARC_HAS_FINAL_OUTPUT; + private final RAMOutputStream buffer; + private byte[] bufferBytes; + /** * @see #shouldExpand(UnCompiledNode) */ @@ -139,8 +139,7 @@ // produces this output T emptyOutput; - // Not private to avoid synthetic access$NNN methods: - byte[] bytes; + private final BytesStore bytes; private int startNode = -1; @@ -254,8 +253,6 @@ return (flags & bit) != 0; } - private final BytesWriter writer; - private GrowableWriter nodeAddress; // TODO: we could be smarter here, and prune periodically @@ -269,17 +266,18 @@ this.inputType = inputType; this.outputs = outputs; this.allowArrayArcs = allowArrayArcs; - bytes = new byte[128]; + buffer = new RAMOutputStream(); + bufferBytes = new byte[64]; + // 32 KB blocks: + bytes = new BytesStore(15); NO_OUTPUT = outputs.getNoOutput(); if (willPackFST) { - nodeAddress = new GrowableWriter(PackedInts.bitsRequired(bytes.length - 1), 8, acceptableOverheadRatio); + nodeAddress = new GrowableWriter(12, 8, acceptableOverheadRatio); inCounts = new GrowableWriter(1, 8, acceptableOverheadRatio); } else { nodeAddress = null; inCounts = null; } - - writer = new DefaultBytesWriter(); emptyOutput = null; packed = false; @@ -289,23 +287,30 @@ /** Load a previously saved FST. 
*/ public FST(DataInput in, Outputs outputs) throws IOException { this.outputs = outputs; - writer = null; + buffer = null; // NOTE: only reads most recent format; we don't have // back-compat promise for FSTs (they are experimental): CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_PACKED); packed = in.readByte() == 1; if (in.readByte() == 1) { // accepts empty string + // 1 KB blocks: + BytesStore emptyBytes = new BytesStore(10); int numBytes = in.readVInt(); - bytes = new byte[numBytes]; - in.readBytes(bytes, 0, numBytes); - + emptyBytes.copyBytes(in, numBytes); + // De-serialize empty-string output: - BytesReader reader; + FSTBytesReader reader; if (packed) { - reader = new ForwardBytesReader(bytes, 0); + reader = emptyBytes.getForwardReader(); } else { - reader = new ReverseBytesReader(bytes, bytes.length-1); + reader = emptyBytes.getReverseReader(); + // NoOutputs uses 0 bytes when writing its output, + // so we have to check here else BytesStore gets + // angry: + if (numBytes > 0) { + reader.setPosition(numBytes-1); + } } emptyOutput = outputs.readFinalOutput(reader); } else { @@ -335,8 +340,9 @@ arcCount = in.readVInt(); arcWithOutputCount = in.readVInt(); - bytes = new byte[in.readVInt()]; - in.readBytes(bytes, 0, bytes.length); + int numBytes = in.readVInt(); + bytes = new BytesStore(in, numBytes); + NO_OUTPUT = outputs.getNoOutput(); cacheRootArcs(); @@ -353,7 +359,7 @@ /** Returns bytes used to represent the FST */ public int sizeInBytes() { - int size = bytes.length; + int size = bytes.getPosition(); if (packed) { size += nodeRefToAddress.ramBytesUsed(); } else if (nodeAddress != null) { @@ -370,10 +376,8 @@ if (this.startNode != -1) { throw new IllegalStateException("already finished"); } - byte[] finalBytes = new byte[writer.getPosition()]; - System.arraycopy(bytes, 0, finalBytes, 0, writer.getPosition()); - bytes = finalBytes; this.startNode = startNode; + bytes.finish(); cacheRootArcs(); } @@ -394,7 +398,7 @@ cachedRootArcs = 
(Arc[]) new Arc[0x80]; final Arc arc = new Arc(); getFirstArc(arc); - final BytesReader in = getBytesReader(0); + final FSTBytesReader in = getBytesReader(0); if (targetHasArcs(arc)) { readFirstRealTargetArc(arc.target, arc, in); while(true) { @@ -485,8 +489,9 @@ out.writeVInt(nodeCount); out.writeVInt(arcCount); out.writeVInt(arcWithOutputCount); - out.writeVInt(bytes.length); - out.writeBytes(bytes, 0, bytes.length); + int numBytes = bytes.getPosition(); + out.writeVInt(numBytes); + bytes.writeTo(out); } /** @@ -526,17 +531,16 @@ } } - private void writeLabel(int v) throws IOException { + private void writeLabel(DataOutput out, int v) throws IOException { assert v >= 0: "v=" + v; if (inputType == INPUT_TYPE.BYTE1) { assert v <= 255: "v=" + v; - writer.writeByte((byte) v); + out.writeByte((byte) v); } else if (inputType == INPUT_TYPE.BYTE2) { assert v <= 65535: "v=" + v; - writer.writeShort((short) v); + out.writeShort((short) v); } else { - //writeInt(v); - writer.writeVInt(v); + out.writeVInt(v); } } @@ -563,7 +567,7 @@ // serializes new node by appending its bytes to the end // of the current byte[] int addNode(Builder.UnCompiledNode nodeIn) throws IOException { - //System.out.println("FST.addNode pos=" + writer.posWrite + " numArcs=" + nodeIn.numArcs); + //System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs); if (nodeIn.numArcs == 0) { if (nodeIn.isFinal) { return FINAL_END_NODE; @@ -572,8 +576,7 @@ } } - int startAddress = writer.getPosition(); - //System.out.println(" startAddr=" + startAddress); + assert buffer.getFilePointer() == 0; final boolean doFixedArray = shouldExpand(nodeIn); final int fixedArrayStart; @@ -582,13 +585,13 @@ bytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, 1)]; } // write a "false" first arc: - writer.writeByte(ARCS_AS_FIXED_ARRAY); - writer.writeVInt(nodeIn.numArcs); + buffer.writeByte(ARCS_AS_FIXED_ARRAY); + buffer.writeVInt(nodeIn.numArcs); // placeholder -- we'll come back and write 
the number // of bytes per arc (int) here: // TODO: we could make this a vInt instead - writer.writeInt(0); - fixedArrayStart = writer.getPosition(); + buffer.writeInt(0); + fixedArrayStart = (int) buffer.getFilePointer(); //System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart); } else { fixedArrayStart = 0; @@ -598,7 +601,7 @@ final int lastArc = nodeIn.numArcs-1; - int lastArcStart = writer.getPosition(); + int lastArcStart = (int) buffer.getFilePointer(); int maxBytesPerArc = 0; for(int arcIdx=0;arcIdx arc = nodeIn.arcs[arcIdx]; @@ -637,34 +640,34 @@ flags += BIT_ARC_HAS_OUTPUT; } - writer.writeByte((byte) flags); - writeLabel(arc.label); + buffer.writeByte((byte) flags); + writeLabel(buffer, arc.label); - // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + writer.posWrite + " output=" + outputs.outputToString(arc.output)); + // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output)); if (arc.output != NO_OUTPUT) { - outputs.write(arc.output, writer); + outputs.write(arc.output, buffer); //System.out.println(" write output"); arcWithOutputCount++; } if (arc.nextFinalOutput != NO_OUTPUT) { //System.out.println(" write final output"); - outputs.writeFinalOutput(arc.nextFinalOutput, writer); + outputs.writeFinalOutput(arc.nextFinalOutput, buffer); } if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { assert target.node > 0; //System.out.println(" write target"); - writer.writeInt(target.node); + buffer.writeInt(target.node); } // just write the arcs "like normal" on first pass, // but record how many bytes each one took, and max // byte size: if (doFixedArray) { - bytesPerArc[arcIdx] = writer.getPosition() - lastArcStart; - lastArcStart = writer.getPosition(); + bytesPerArc[arcIdx] = (int) (buffer.getFilePointer() - lastArcStart); + lastArcStart 
= (int) buffer.getFilePointer(); maxBytesPerArc = Math.max(maxBytesPerArc, bytesPerArc[arcIdx]); //System.out.println(" bytes=" + bytesPerArc[arcIdx]); } @@ -675,50 +678,69 @@ // outputs, then we should selectively disable array for // such cases + final int bufferUpto = (int) buffer.getFilePointer(); + + // Pull bytes into bufferBytes byte[]: + final int sizeNeeded; if (doFixedArray) { + sizeNeeded = fixedArrayStart + nodeIn.numArcs * maxBytesPerArc; + assert sizeNeeded >= bufferUpto; + } else { + sizeNeeded = bufferUpto; + } + if (bufferBytes.length < sizeNeeded) { + bufferBytes = ArrayUtil.grow(bufferBytes, sizeNeeded); + } + buffer.writeTo(bufferBytes, 0); + + if (doFixedArray) { //System.out.println(" doFixedArray"); assert maxBytesPerArc > 0; // 2nd pass just "expands" all arcs to take up a fixed // byte size - final int sizeNeeded = fixedArrayStart + nodeIn.numArcs * maxBytesPerArc; assert ((long) fixedArrayStart) + ((long) nodeIn.numArcs) * maxBytesPerArc < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; - bytes = ArrayUtil.grow(bytes, sizeNeeded); // TODO: we could make this a vInt instead - bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24); - bytes[fixedArrayStart-3] = (byte) (maxBytesPerArc >> 16); - bytes[fixedArrayStart-2] = (byte) (maxBytesPerArc >> 8); - bytes[fixedArrayStart-1] = (byte) maxBytesPerArc; + bufferBytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24); + bufferBytes[fixedArrayStart-3] = (byte) (maxBytesPerArc >> 16); + bufferBytes[fixedArrayStart-2] = (byte) (maxBytesPerArc >> 8); + bufferBytes[fixedArrayStart-1] = (byte) maxBytesPerArc; // expand the arcs in place, backwards - int srcPos = writer.getPosition(); + int srcPos = bufferUpto; int destPos = fixedArrayStart + nodeIn.numArcs*maxBytesPerArc; - writer.setPosition(destPos); for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) { //System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos); destPos -= maxBytesPerArc; srcPos -= 
bytesPerArc[arcIdx]; if (srcPos != destPos) { assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " bytesPerArc[arcIdx]=" + bytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs; - System.arraycopy(bytes, srcPos, bytes, destPos, bytesPerArc[arcIdx]); + System.arraycopy(bufferBytes, srcPos, bufferBytes, destPos, bytesPerArc[arcIdx]); } } } + // TODO: we could just write to the BytesStore and add a + // .reverseBytes() ... this would save the double + // buffering + // reverse bytes in-place; we do this so that the // "BIT_TARGET_NEXT" opto can work, ie, it reads the // node just before the current one - final int endAddress = writer.getPosition() - 1; + final int endAddress = sizeNeeded - 1; - int left = startAddress; + int left = 0; int right = endAddress; while (left < right) { - final byte b = bytes[left]; - bytes[left++] = bytes[right]; - bytes[right--] = b; + final byte b = bufferBytes[left]; + bufferBytes[left++] = bufferBytes[right]; + bufferBytes[right--] = b; } //System.out.println(" endAddress=" + endAddress); + bytes.writeBytes(bufferBytes, sizeNeeded); + final int thisNodeAddress = (bytes.getPosition()-1); + nodeCount++; final int node; if (nodeAddress != null) { @@ -727,14 +749,16 @@ nodeAddress = nodeAddress.resize(ArrayUtil.oversize(nodeAddress.size() + 1, nodeAddress.getBitsPerValue())); inCounts = inCounts.resize(ArrayUtil.oversize(inCounts.size() + 1, inCounts.getBitsPerValue())); } - nodeAddress.set(nodeCount, endAddress); + nodeAddress.set(nodeCount, thisNodeAddress); // System.out.println(" write nodeAddress[" + nodeCount + "] = " + endAddress); node = nodeCount; } else { - node = endAddress; + node = thisNodeAddress; } lastFrozenNode = node; + buffer.reset(); + return node; } @@ -763,7 +787,7 @@ * * @return Returns the second argument * (arc). 
*/ - public Arc readLastTargetArc(Arc follow, Arc arc, FST.BytesReader in) throws IOException { + public Arc readLastTargetArc(Arc follow, Arc arc, FSTBytesReader in) throws IOException { //System.out.println("readLast"); if (!targetHasArcs(follow)) { //System.out.println(" end node"); @@ -774,7 +798,7 @@ arc.flags = BIT_LAST_ARC; return arc; } else { - in.pos = getNodeAddress(follow.target); + in.setPosition(getNodeAddress(follow.target)); arc.node = follow.target; final byte b = in.readByte(); if (b == ARCS_AS_FIXED_ARRAY) { @@ -786,7 +810,7 @@ arc.bytesPerArc = in.readInt(); } //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); - arc.posArcsStart = in.pos; + arc.posArcsStart = in.getPosition(); arc.arcIdx = arc.numArcs - 2; } else { arc.flags = b; @@ -808,14 +832,14 @@ if (packed) { in.readVInt(); } else { - in.skip(4); + in.skipBytes(4); } } arc.flags = in.readByte(); } - // Undo the byte flags we read: - in.skip(-1); - arc.nextArc = in.pos; + // Undo the byte flags we read: + in.skipBytes(-1); + arc.nextArc = in.getPosition(); } readNextRealArc(arc, in); assert arc.isLast(); @@ -830,7 +854,7 @@ * * @return Returns the second argument (arc). 
*/ - public Arc readFirstTargetArc(Arc follow, Arc arc, BytesReader in) throws IOException { + public Arc readFirstTargetArc(Arc follow, Arc arc, FSTBytesReader in) throws IOException { //int pos = address; //System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + follow.isFinal()); if (follow.isFinal()) { @@ -853,10 +877,9 @@ } } - public Arc readFirstRealTargetArc(int node, Arc arc, final BytesReader in) throws IOException { - assert in.bytes == bytes; + public Arc readFirstRealTargetArc(int node, Arc arc, final FSTBytesReader in) throws IOException { final int address = getNodeAddress(node); - in.pos = address; + in.setPosition(address); //System.out.println(" readFirstRealTargtArc address=" //+ address); //System.out.println(" flags=" + arc.flags); @@ -872,7 +895,7 @@ arc.bytesPerArc = in.readInt(); } arc.arcIdx = -1; - arc.nextArc = arc.posArcsStart = in.pos; + arc.nextArc = arc.posArcsStart = in.getPosition(); //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos); } else { //arc.flags = b; @@ -889,17 +912,17 @@ * @return Returns true if arc points to a state in an * expanded array format. */ - boolean isExpandedTarget(Arc follow, FST.BytesReader in) throws IOException { + boolean isExpandedTarget(Arc follow, FSTBytesReader in) throws IOException { if (!targetHasArcs(follow)) { return false; } else { - in.pos = getNodeAddress(follow.target); + in.setPosition(getNodeAddress(follow.target)); return in.readByte() == ARCS_AS_FIXED_ARRAY; } } /** In-place read; returns the arc. */ - public Arc readNextArc(Arc arc, BytesReader in) throws IOException { + public Arc readNextArc(Arc arc, FSTBytesReader in) throws IOException { if (arc.label == END_LABEL) { // This was a fake inserted "final" arc if (arc.nextArc <= 0) { @@ -913,12 +936,16 @@ /** Peeks at next arc's label; does not alter arc. Do * not call this if arc.isLast()! 
*/ - public int readNextArcLabel(Arc arc, BytesReader in) throws IOException { + public int readNextArcLabel(Arc arc, FSTBytesReader in) throws IOException { assert !arc.isLast(); if (arc.label == END_LABEL) { - //System.out.println(" nextArc fake " + arc.nextArc); - int pos = in.pos = getNodeAddress(arc.nextArc); + //System.out.println(" nextArc fake " + + //arc.nextArc); + + int pos = getNodeAddress(arc.nextArc); + in.setPosition(pos); + final byte b = in.readByte(); if (b == ARCS_AS_FIXED_ARRAY) { //System.out.println(" nextArc fake array"); @@ -929,18 +956,18 @@ in.readInt(); } } else { - in.pos = pos; + in.setPosition(pos); } } else { if (arc.bytesPerArc != 0) { //System.out.println(" nextArc real array"); // arcs are at fixed entries - in.pos = arc.posArcsStart; - in.skip((1+arc.arcIdx)*arc.bytesPerArc); + in.setPosition(arc.posArcsStart); + in.skipBytes((1+arc.arcIdx)*arc.bytesPerArc); } else { // arcs are packed //System.out.println(" nextArc real packed"); - in.pos = arc.nextArc; + in.setPosition(arc.nextArc); } } // skip flags @@ -950,8 +977,7 @@ /** Never returns null, but you should never call this if * arc.isLast() is true. 
*/ - public Arc readNextRealArc(Arc arc, final BytesReader in) throws IOException { - assert in.bytes == bytes; + public Arc readNextRealArc(Arc arc, final FSTBytesReader in) throws IOException { // TODO: can't assert this because we call from readFirstArc // assert !flag(arc.flags, BIT_LAST_ARC); @@ -961,10 +987,11 @@ // arcs are at fixed entries arc.arcIdx++; assert arc.arcIdx < arc.numArcs; - in.skip(arc.posArcsStart, arc.arcIdx*arc.bytesPerArc); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.arcIdx*arc.bytesPerArc); } else { // arcs are packed - in.pos = arc.nextArc; + in.setPosition(arc.nextArc); } arc.flags = in.readByte(); arc.label = readLabel(in); @@ -987,9 +1014,9 @@ } else { arc.target = NON_FINAL_END_NODE; } - arc.nextArc = in.pos; + arc.nextArc = in.getPosition(); } else if (arc.flag(BIT_TARGET_NEXT)) { - arc.nextArc = in.pos; + arc.nextArc = in.getPosition(); // TODO: would be nice to make this lazy -- maybe // caller doesn't need the target and is scanning arcs... if (nodeAddress == null) { @@ -998,17 +1025,18 @@ // must scan seekToNextNode(in); } else { - in.skip(arc.posArcsStart, arc.bytesPerArc * arc.numArcs); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc * arc.numArcs); } } - arc.target = in.pos; + arc.target = in.getPosition(); } else { arc.target = arc.node - 1; assert arc.target > 0; } } else { if (packed) { - final int pos = in.pos; + final int pos = in.getPosition(); final int code = in.readVInt(); if (arc.flag(BIT_TARGET_DELTA)) { // Address is delta-coded from current address: @@ -1026,16 +1054,15 @@ } else { arc.target = in.readInt(); } - arc.nextArc = in.pos; + arc.nextArc = in.getPosition(); } return arc; } /** Finds an arc leaving the incoming arc, replacing the arc in place. * This returns null if the arc was not found, else the incoming arc. 
*/ - public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc, BytesReader in) throws IOException { + public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc, FSTBytesReader in) throws IOException { assert cachedRootArcs != null; - assert in.bytes == bytes; if (labelToMatch == END_LABEL) { if (follow.isFinal()) { @@ -1070,7 +1097,7 @@ return null; } - in.pos = getNodeAddress(follow.target); + in.setPosition(getNodeAddress(follow.target)); arc.node = follow.target; @@ -1084,13 +1111,14 @@ } else { arc.bytesPerArc = in.readInt(); } - arc.posArcsStart = in.pos; + arc.posArcsStart = in.getPosition(); int low = 0; int high = arc.numArcs-1; while (low <= high) { //System.out.println(" cycle"); int mid = (low + high) >>> 1; - in.skip(arc.posArcsStart, arc.bytesPerArc*mid + 1); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc*mid + 1); int midLabel = readLabel(in); final int cmp = midLabel - labelToMatch; if (cmp < 0) { @@ -1128,7 +1156,7 @@ } } - private void seekToNextNode(BytesReader in) throws IOException { + private void seekToNextNode(FSTBytesReader in) throws IOException { while(true) { @@ -1196,142 +1224,28 @@ public abstract int getPosition(); } - // Non-static: writes to FST's byte[] - class DefaultBytesWriter extends BytesWriter { - int posWrite; - - public DefaultBytesWriter() { - // pad: ensure no node gets address 0 which is reserved to mean - // the stop state w/ no arcs - posWrite = 1; - } - - @Override - public void writeByte(byte b) { - assert posWrite <= bytes.length; - if (bytes.length == posWrite) { - assert bytes.length < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; - bytes = ArrayUtil.grow(bytes); - } - assert posWrite < bytes.length: "posWrite=" + posWrite + " bytes.length=" + bytes.length; - bytes[posWrite++] = b; - } - - @Override - public int getPosition() { - return posWrite; - } - - @Override - public void setPosition(int posWrite) { - this.posWrite = posWrite; - if (bytes.length < posWrite) { - assert 
bytes.length < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; - bytes = ArrayUtil.grow(bytes, posWrite); - } - } - - @Override - public void writeBytes(byte[] b, int offset, int length) { - final int size = posWrite + length; - assert bytes.length < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; - bytes = ArrayUtil.grow(bytes, size); - System.arraycopy(b, offset, bytes, posWrite, length); - posWrite += length; - } - } - - /** Returns a {@link BytesReader} for this FST, positioned at + /** Returns a {@link FSTBytesReader} for this FST, positioned at * position 0. */ - public BytesReader getBytesReader() { + public FSTBytesReader getBytesReader() { return getBytesReader(0); } - /** Returns a {@link BytesReader} for this FST, positioned at + /** Returns a {@link FSTBytesReader} for this FST, positioned at * the provided position. */ - public BytesReader getBytesReader(int pos) { + public FSTBytesReader getBytesReader(int pos) { // TODO: maybe re-use via ThreadLocal? + FSTBytesReader in; if (packed) { - return new ForwardBytesReader(bytes, pos); + in = bytes.getForwardReader(); } else { - return new ReverseBytesReader(bytes, pos); + in = bytes.getReverseReader(); } - } - - /** Reads the bytes from this FST. Use {@link - * #getBytesReader(int)} to obtain an instance for this - * FST; re-use across calls (but only within a single - * thread) for better performance. 
*/ - public static abstract class BytesReader extends DataInput { - protected int pos; - protected final byte[] bytes; - protected BytesReader(byte[] bytes, int pos) { - this.bytes = bytes; - this.pos = pos; + if (pos != 0) { + in.setPosition(pos); } - abstract void skip(int byteCount); - abstract void skip(int base, int byteCount); + return in; } - final static class ReverseBytesReader extends BytesReader { - - public ReverseBytesReader(byte[] bytes, int pos) { - super(bytes, pos); - } - - @Override - public byte readByte() { - return bytes[pos--]; - } - - @Override - public void readBytes(byte[] b, int offset, int len) { - for(int i=0;i { final Arc arc; final IntsRef chain; @@ -1451,14 +1365,14 @@ */ // Creates a packed FST - private FST(INPUT_TYPE inputType, PackedInts.Reader nodeRefToAddress, Outputs outputs) { + private FST(INPUT_TYPE inputType, Outputs outputs) { packed = true; this.inputType = inputType; - bytes = new byte[128]; - this.nodeRefToAddress = nodeRefToAddress; + // 32 KB blocks: + bytes = new BytesStore(15); + buffer = null; this.outputs = outputs; NO_OUTPUT = outputs.getNoOutput(); - writer = new DefaultBytesWriter(); // NOTE: bogus because this is only used during // building; we need to break out mutable FST from @@ -1497,7 +1411,7 @@ Arc arc = new Arc(); - final BytesReader r = getBytesReader(0); + final FSTBytesReader r = getBytesReader(0); final int topN = Math.min(maxDerefNodes, inCounts.size()); @@ -1529,17 +1443,13 @@ //System.out.println("map node=" + n.node + " inCount=" + n.count + " to newID=" + downTo); } - final FST fst = new FST(inputType, null, outputs); - - final BytesWriter writer = fst.writer; - // +1 because node ords start at 1 (0 is reserved as stop node): final GrowableWriter newNodeAddress = new GrowableWriter( - PackedInts.bitsRequired(bytes.length), 1 + nodeCount, acceptableOverheadRatio); + PackedInts.bitsRequired(this.bytes.getPosition()), 1 + nodeCount, acceptableOverheadRatio); // Fill initial coarse guess: for(int 
node=1;node<=nodeCount;node++) { - newNodeAddress.set(node, 1 + bytes.length - nodeAddress.get(node)); + newNodeAddress.set(node, 1 + this.bytes.getPosition() - nodeAddress.get(node)); } int absCount; @@ -1547,6 +1457,8 @@ int topCount; int nextCount; + FST fst; + // Iterate until we converge: while(true) { @@ -1556,7 +1468,10 @@ // for assert: boolean negDelta = false; - writer.setPosition(0); + fst = new FST(inputType, outputs); + + final BytesStore writer = fst.bytes; + // Skip 0 byte since 0 is reserved target: writer.writeByte((byte) 0); @@ -1578,6 +1493,7 @@ for(int node=nodeCount;node>=1;node--) { fst.nodeCount++; final int address = writer.getPosition(); + //System.out.println(" node: " + node + " address=" + address); if (address != newNodeAddress.get(node)) { addressError = address - (int) newNodeAddress.get(node); @@ -1599,7 +1515,7 @@ // this is an array'd node and bytesPerArc changes: writeNode: while(true) { // retry writing this node - + assert buffer.getFilePointer() == 0; readFirstRealTargetArc(node, arc, r); final boolean useArcArray = arc.bytesPerArc != 0; @@ -1608,9 +1524,9 @@ if (bytesPerArc == 0) { bytesPerArc = arc.bytesPerArc; } - writer.writeByte(ARCS_AS_FIXED_ARRAY); - writer.writeVInt(arc.numArcs); - writer.writeVInt(bytesPerArc); + buffer.writeByte(ARCS_AS_FIXED_ARRAY); + buffer.writeVInt(arc.numArcs); + buffer.writeVInt(bytesPerArc); //System.out.println("node " + node + ": " + arc.numArcs + " arcs"); } @@ -1618,8 +1534,7 @@ //int wasted = 0; while(true) { // iterate over all arcs for this node - //System.out.println(" arc label=" + arc.label + " target=" + arc.target + " pos=" + writer.posWrite); - final int arcStartPos = writer.getPosition(); + final int arcStartPos = (int) buffer.getFilePointer(); nodeArcCount++; byte flags = 0; @@ -1666,7 +1581,7 @@ absPtr = topNodeMap.size() + (int) newNodeAddress.get(arc.target) + addressError; } - int delta = (int) newNodeAddress.get(arc.target) + addressError - writer.getPosition() - 2; + int 
delta = (int) (newNodeAddress.get(arc.target) + addressError - buffer.getFilePointer() - address - 2); if (delta < 0) { //System.out.println("neg: " + delta); anyNegDelta = true; @@ -1681,22 +1596,22 @@ absPtr = 0; } - writer.writeByte(flags); - fst.writeLabel(arc.label); + buffer.writeByte(flags); + fst.writeLabel(buffer, arc.label); if (arc.output != NO_OUTPUT) { - outputs.write(arc.output, writer); + outputs.write(arc.output, buffer); if (!retry) { fst.arcWithOutputCount++; } } if (arc.nextFinalOutput != NO_OUTPUT) { - outputs.writeFinalOutput(arc.nextFinalOutput, writer); + outputs.writeFinalOutput(arc.nextFinalOutput, buffer); } if (doWriteTarget) { - int delta = (int) newNodeAddress.get(arc.target) + addressError - writer.getPosition(); + int delta = (int) (newNodeAddress.get(arc.target) + addressError - buffer.getFilePointer() - address); if (delta < 0) { anyNegDelta = true; //System.out.println("neg: " + delta); @@ -1705,7 +1620,7 @@ if (flag(flags, BIT_TARGET_DELTA)) { //System.out.println(" delta"); - writer.writeVInt(delta); + buffer.writeVInt(delta); if (!retry) { deltaCount++; } @@ -1717,7 +1632,7 @@ System.out.println(" abs"); } */ - writer.writeVInt(absPtr); + buffer.writeVInt(absPtr); if (!retry) { if (absPtr >= topNodeMap.size()) { absCount++; @@ -1729,7 +1644,7 @@ } if (useArcArray) { - final int arcBytes = writer.getPosition() - arcStartPos; + final int arcBytes = (int) (buffer.getFilePointer() - arcStartPos); //System.out.println(" " + arcBytes + " bytes"); maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes); // NOTE: this may in fact go "backwards", if @@ -1739,7 +1654,11 @@ // will retry (below) so it's OK to ovewrite // bytes: //wasted += bytesPerArc - arcBytes; - writer.setPosition(arcStartPos + bytesPerArc); + int skip = (int) (arcStartPos + bytesPerArc - buffer.getFilePointer()); + while(skip > 0) { + buffer.writeByte((byte) 0); + skip--; + } } if (arc.isLast()) { @@ -1764,11 +1683,19 @@ // Retry: bytesPerArc = maxBytesPerArc; - 
writer.setPosition(address); + buffer.reset(); nodeArcCount = 0; retry = true; anyNegDelta = false; } + + if (bufferBytes.length < (int) buffer.getFilePointer()) { + bufferBytes = ArrayUtil.grow(bufferBytes, (int) buffer.getFilePointer()); + } + buffer.writeTo(bufferBytes, 0); + writer.writeBytes(bufferBytes, 0, (int) buffer.getFilePointer()); + buffer.reset(); + negDelta |= anyNegDelta; fst.arcCount += nodeArcCount; @@ -1799,7 +1726,6 @@ } fst.nodeRefToAddress = nodeRefToAddressIn; - fst.startNode = (int) newNodeAddress.get(startNode); //System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode); @@ -1810,11 +1736,8 @@ assert fst.nodeCount == nodeCount: "fst.nodeCount=" + fst.nodeCount + " nodeCount=" + nodeCount; assert fst.arcCount == arcCount; assert fst.arcWithOutputCount == arcWithOutputCount: "fst.arcWithOutputCount=" + fst.arcWithOutputCount + " arcWithOutputCount=" + arcWithOutputCount; - - final byte[] finalBytes = new byte[writer.getPosition()]; - //System.out.println("resize " + fst.bytes.length + " down to " + writer.posWrite); - System.arraycopy(fst.bytes, 0, finalBytes, 0, writer.getPosition()); - fst.bytes = finalBytes; + + fst.bytes.finish(); fst.cacheRootArcs(); //final int size = fst.sizeInBytes(); Index: lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java (working copy) @@ -0,0 +1,248 @@ +package org.apache.lucene.util.fst; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + +// TODO: merge with PagedBytes, except PagedBytes doesn't +// let you read while writing which FST needs + +class BytesStore extends DataOutput { + private final static int MAX_BLOCK_SIZE = Integer.MAX_VALUE; + + private final List blocks = new ArrayList(); + + private final int blockSize; + private final int blockBits; + private final int blockMask; + + private byte[] current; + private int nextWrite; + + public BytesStore(int blockBits) { + this.blockBits = blockBits; + blockSize = 1 << blockBits; + blockMask = blockSize-1; + nextWrite = blockSize; + } + + /** Pulls bytes from the provided IndexInput. 
*/ + public BytesStore(DataInput in, int numBytes) throws IOException { + int blockSize = 2; + int blockBits = 1; + while(blockSize < numBytes && blockSize < MAX_BLOCK_SIZE) { + blockSize *= 2; + blockBits++; + } + this.blockBits = blockBits; + this.blockSize = blockSize; + this.blockMask = blockSize-1; + int left = numBytes; + while(left > 0) { + final int chunk = Math.min(blockSize, left); + byte[] block = new byte[chunk]; + in.readBytes(block, 0, block.length); + blocks.add(block); + left -= chunk; + } + } + + @Override + public void writeByte(byte b) { + if (nextWrite == blockSize) { + current = new byte[blockSize]; + blocks.add(current); + nextWrite = 0; + } + current[nextWrite++] = b; + } + + @Override + public void writeBytes(byte[] b, int offset, int len) { + while (len > 0) { + int chunk = blockSize - nextWrite; + if (len <= chunk) { + System.arraycopy(b, offset, current, nextWrite, len); + nextWrite += len; + break; + } else { + if (chunk > 0) { + System.arraycopy(b, offset, current, nextWrite, chunk); + offset += chunk; + len -= chunk; + } + current = new byte[blockSize]; + blocks.add(current); + nextWrite = 0; + } + } + } + + public void skip(int len) { + System.out.println("skip len=" + len); + while (len > 0) { + int chunk = blockSize - nextWrite; + if (len <= chunk) { + nextWrite += len; + break; + } else { + len -= chunk; + current = new byte[blockSize]; + blocks.add(current); + nextWrite = 0; + } + } + } + + public int getPosition() { + return (blocks.size()-1) * blockSize + nextWrite; + } + + public void finish() { + if (current != null) { + byte[] lastBuffer = new byte[nextWrite]; + System.arraycopy(current, 0, lastBuffer, 0, nextWrite); + blocks.set(blocks.size()-1, lastBuffer); + current = null; + } + } + + public void writeTo(DataOutput out) throws IOException { + for(byte[] block : blocks) { + out.writeBytes(block, 0, block.length); + } + } + + public FSTBytesReader getForwardReader() { + return new FSTBytesReader() { + private byte[] 
current; + private int nextBuffer; + private int nextRead = blockSize; + + @Override + public byte readByte() { + if (nextRead == blockSize) { + current = blocks.get(nextBuffer++); + nextRead = 0; + } + return current[nextRead++]; + } + + @Override + public void skipBytes(int count) { + setPosition(getPosition() + count); + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + while(len > 0) { + int chunkLeft = blockSize - nextRead; + if (len <= chunkLeft) { + System.arraycopy(current, nextRead, b, offset, len); + nextRead += len; + break; + } else { + if (chunkLeft > 0) { + System.arraycopy(current, nextRead, b, offset, chunkLeft); + offset += chunkLeft; + len -= chunkLeft; + } + current = blocks.get(nextBuffer++); + nextRead = 0; + } + } + } + + @Override + public int getPosition() { + return (nextBuffer-1)*blockSize + nextRead; + } + + @Override + public void setPosition(int pos) { + int bufferIndex = pos >> blockBits; + nextBuffer = bufferIndex+1; + current = blocks.get(bufferIndex); + nextRead = pos & blockMask; + assert getPosition() == pos; + } + + @Override + public boolean reversed() { + return false; + } + }; + } + + public FSTBytesReader getReverseReader() { + return new FSTBytesReader() { + private byte[] current = blocks.size() == 0 ? 
null : blocks.get(0); + private int nextBuffer = -1; + private int nextRead = 0; + + @Override + public byte readByte() { + if (nextRead == -1) { + current = blocks.get(nextBuffer--); + nextRead = blockSize-1; + } + return current[nextRead--]; + } + + @Override + public void skipBytes(int count) { + setPosition(getPosition() - count); + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + for(int i=0;i> blockBits; + nextBuffer = bufferIndex-1; + current = blocks.get(bufferIndex); + nextRead = pos & blockMask; + assert getPosition() == pos: "pos=" + pos + " getPos()=" + getPosition(); + } + + @Override + public boolean reversed() { + return true; + } + }; + } +} Property changes on: lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java (revision 1431560) +++ lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java (working copy) @@ -53,6 +53,7 @@ import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTBytesReader; import org.apache.lucene.util.fst.Outputs; import org.apache.lucene.util.fst.Util; @@ -568,7 +569,7 @@ private final BytesRef term = new BytesRef(); - private final FST.BytesReader fstReader; + private final FSTBytesReader fstReader; // TODO: can we share this with the frame in STE? 
private final class Frame { @@ -1262,7 +1263,7 @@ private boolean eof; final BytesRef term = new BytesRef(); - private final FST.BytesReader fstReader; + private final FSTBytesReader fstReader; @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc[] arcs = new FST.Arc[1]; Index: lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java (revision 1431560) +++ lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java (working copy) @@ -203,7 +203,7 @@ final FST.Arc arc = fst.getFirstArc(new FST.Arc()); final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); for(int i=0;i<=term.length;i++) { final int label; @@ -240,7 +240,7 @@ in.offset = 0; final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FSTBytesReader fstReader = fst.getBytesReader(0); while(true) { // read all arcs: