Index: lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java =================================================================== --- lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java (revision 1127319) +++ lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java (working copy) @@ -929,45 +929,6 @@ dir.close(); } - // LUCENE-1044: Simulate checksum error in segments_N - public void testSegmentsChecksumError() throws IOException { - Directory dir = newDirectory(); - - IndexWriter writer = null; - - writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer())); - - // add 100 documents - for (int i = 0; i < 100; i++) { - addDoc(writer); - } - - // close - writer.close(); - - long gen = SegmentInfos.getCurrentSegmentGeneration(dir); - assertTrue("segment generation should be > 0 but got " + gen, gen > 0); - - final String segmentsFileName = SegmentInfos.getCurrentSegmentFileName(dir); - IndexInput in = dir.openInput(segmentsFileName); - IndexOutput out = dir.createOutput(IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", 1+gen)); - out.copyBytes(in, in.length()-1); - byte b = in.readByte(); - out.writeByte((byte) (1+b)); - out.close(); - in.close(); - - IndexReader reader = null; - try { - reader = IndexReader.open(dir, true); - } catch (IOException e) { - e.printStackTrace(System.out); - fail("segmentInfos failed to retry fallback to correct segments_N file"); - } - reader.close(); - dir.close(); - } - // Simulate a corrupt index by removing last byte of // latest segments file and make sure we get an // IOException trying to open the index: Index: lucene/backwards/src/test/org/apache/lucene/store/TestCopyBytes.java =================================================================== --- lucene/backwards/src/test/org/apache/lucene/store/TestCopyBytes.java (revision 1127319) +++ 
lucene/backwards/src/test/org/apache/lucene/store/TestCopyBytes.java (working copy) @@ -1,106 +0,0 @@ -package org.apache.lucene.store; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util._TestUtil; - -import org.junit.Test; - -public class TestCopyBytes extends LuceneTestCase { - - private byte value(int idx) { - return (byte) ((idx%256) * (1+(idx/256))); - } - - - @Test - public void testCopyBytes() throws Exception { - for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) { - Directory dir = newDirectory(); - if (VERBOSE) { - System.out.println("TEST: iter=" + iter + " dir=" + dir); - } - - // make random file - IndexOutput out = dir.createOutput("test"); - byte[] bytes = new byte[_TestUtil.nextInt(random, 1, 77777)]; - final int size = _TestUtil.nextInt(random, 1, 1777777); - int upto = 0; - int byteUpto = 0; - while(upto < size) { - bytes[byteUpto++] = value(upto); - upto++; - if (byteUpto == bytes.length) { - out.writeBytes(bytes, 0, bytes.length); - byteUpto = 0; - } - } - - out.writeBytes(bytes, 0, byteUpto); - assertEquals(size, out.getFilePointer()); - out.close(); - assertEquals(size, dir.fileLength("test")); - - // copy from test 
-> test2 - final IndexInput in = dir.openInput("test"); - - out = dir.createOutput("test2"); - - upto = 0; - while(upto < size) { - if (random.nextBoolean()) { - out.writeByte(in.readByte()); - upto++; - } else { - final int chunk = Math.min(_TestUtil.nextInt(random, 1, bytes.length), size-upto); - out.copyBytes(in, chunk); - upto += chunk; - } - } - assertEquals(size, upto); - out.close(); - in.close(); - - // verify - IndexInput in2 = dir.openInput("test2"); - upto = 0; - while(upto < size) { - if (random.nextBoolean()) { - final byte v = in2.readByte(); - assertEquals(value(upto), v); - upto++; - } else { - final int limit = Math.min(_TestUtil.nextInt(random, 1, bytes.length), size-upto); - in2.readBytes(bytes, 0, limit); - for(int byteIdx=0;byteIdx= 0 && x <= 255; + br.bytes[i] = (byte) x; + } + br.length = ir.length; + return br; + } + + private static IntsRef toIntsRef(String s, int inputMode) { + return toIntsRef(s, inputMode, new IntsRef(10)); + } + + private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) { + if (inputMode == 0) { + // utf8 + return toIntsRef(new BytesRef(s), ir); + } else { + // utf32 + return toIntsRefUTF32(s, ir); + } + } + + private static IntsRef toIntsRefUTF32(String s, IntsRef ir) { + final int charLength = s.length(); + int charIdx = 0; + int intIdx = 0; + while(charIdx < charLength) { + if (intIdx == ir.ints.length) { + ir.grow(intIdx+1); + } + final int utf32 = s.codePointAt(charIdx); + ir.ints[intIdx] = utf32; + charIdx += Character.charCount(utf32); + intIdx++; + } + ir.length = intIdx; + return ir; + } + + private static IntsRef toIntsRef(BytesRef br, IntsRef ir) { + if (br.length > ir.ints.length) { + ir.grow(br.length); + } + for(int i=0;i outputs = NoOutputs.getSingleton(); + final Object NO_OUTPUT = outputs.getNoOutput(); + final List> pairs = new ArrayList>(terms.length); + for(IntsRef term : terms) { + pairs.add(new FSTTester.InputOutput(term, NO_OUTPUT)); + } + new FSTTester(random, dir, inputMode, pairs, 
outputs).doTest(); + } + + // PositiveIntOutput (ord) + { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + final List> pairs = new ArrayList>(terms.length); + for(int idx=0;idx(terms[idx], outputs.get(idx))); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // PositiveIntOutput (random monotonically increasing positive number) + { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); + final List> pairs = new ArrayList>(terms.length); + long lastOutput = 0; + for(int idx=0;idx(terms[idx], outputs.get(value))); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // PositiveIntOutput (random positive number) + { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); + final List> pairs = new ArrayList>(terms.length); + for(int idx=0;idx(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // Pair + { + final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(random.nextBoolean()); + final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(random.nextBoolean()); + final PairOutputs outputs = new PairOutputs(o1, o2); + final List>> pairs = new ArrayList>>(terms.length); + long lastOutput = 0; + for(int idx=0;idx>(terms[idx], + outputs.get(o1.get(idx), + o2.get(value)))); + } + new FSTTester>(random, dir, inputMode, pairs, outputs).doTest(); + } + + // Sequence-of-bytes + { + final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); + final BytesRef NO_OUTPUT = outputs.getNoOutput(); + final List> pairs = new ArrayList>(terms.length); + for(int idx=0;idx(terms[idx], output)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // Sequence-of-ints + { + final IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); + final List> pairs = new ArrayList>(terms.length); + 
for(int idx=0;idx(terms[idx], output)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // Up to two positive ints, shared, generally but not + // monotonically increasing + { + if (VERBOSE) { + System.out.println("TEST: now test UpToTwoPositiveIntOutputs"); + } + final UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true); + final List> pairs = new ArrayList>(terms.length); + long lastOutput = 0; + for(int idx=0;idx(terms[idx], output)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + } + + private static class FSTTester { + + final Random random; + final List> pairs; + final int inputMode; + final Outputs outputs; + final Directory dir; + + public FSTTester(Random random, Directory dir, int inputMode, List> pairs, Outputs outputs) { + this.random = random; + this.dir = dir; + this.inputMode = inputMode; + this.pairs = pairs; + this.outputs = outputs; + } + + private static class InputOutput implements Comparable> { + public final IntsRef input; + public final T output; + + public InputOutput(IntsRef input, T output) { + this.input = input; + this.output = output; + } + + public int compareTo(InputOutput other) { + if (other instanceof InputOutput) { + return input.compareTo((other).input); + } else { + throw new IllegalArgumentException(); + } + } + } + + public void doTest() throws IOException { + // no pruning + doTest(0, 0); + + if (!(outputs instanceof UpToTwoPositiveIntOutputs)) { + // simple pruning + doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0); + + // leafy pruning + doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size())); + } + } + + // runs the term, returning the output, or null if term + // isn't accepted. 
if prefixLength is non-null it must be + // length 1 int array; prefixLength[0] is set to the length + // of the term prefix that matches + private T run(FST fst, IntsRef term, int[] prefixLength) throws IOException { + assert prefixLength == null || prefixLength.length == 1; + final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + final T NO_OUTPUT = fst.outputs.getNoOutput(); + T output = NO_OUTPUT; + + for(int i=0;i<=term.length;i++) { + final int label; + if (i == term.length) { + label = FST.END_LABEL; + } else { + label = term.ints[term.offset+i]; + } + //System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal()); + if (fst.findTargetArc(label, arc, arc) == null) { + if (prefixLength != null) { + prefixLength[0] = i; + return output; + } else { + return null; + } + } + output = fst.outputs.add(output, arc.output); + } + + if (prefixLength != null) { + prefixLength[0] = term.length; + } + + return output; + } + + private T randomAcceptedWord(FST fst, IntsRef in) throws IOException { + FST.Arc arc = fst.getFirstArc(new FST.Arc()); + + final List> arcs = new ArrayList>(); + in.length = 0; + in.offset = 0; + final T NO_OUTPUT = fst.outputs.getNoOutput(); + T output = NO_OUTPUT; + + while(true) { + // read all arcs: + fst.readFirstTargetArc(arc, arc); + arcs.add(new FST.Arc().copyFrom(arc)); + while(!arc.isLast()) { + fst.readNextArc(arc); + arcs.add(new FST.Arc().copyFrom(arc)); + } + + // pick one + arc = arcs.get(random.nextInt(arcs.size())); + arcs.clear(); + + // accumulate output + output = fst.outputs.add(output, arc.output); + + // append label + if (arc.label == FST.END_LABEL) { + break; + } + + if (in.ints.length == in.length) { + in.grow(1+in.length); + } + in.ints[in.length++] = arc.label; + } + + return output; + } + + + FST doTest(int prune1, int prune2) throws IOException { + if (VERBOSE) { + System.out.println("TEST: prune1=" + prune1 + " 
prune2=" + prune2); + } + + final Builder builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, + prune1, prune2, + prune1==0 && prune2==0, outputs); + + for(InputOutput pair : pairs) { + if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) { + final UpToTwoPositiveIntOutputs _outputs = (UpToTwoPositiveIntOutputs) outputs; + final UpToTwoPositiveIntOutputs.TwoLongs twoLongs = (UpToTwoPositiveIntOutputs.TwoLongs) pair.output; + @SuppressWarnings("unchecked") final Builder builderObject = (Builder) builder; + builderObject.add(pair.input, _outputs.get(twoLongs.first)); + builderObject.add(pair.input, _outputs.get(twoLongs.second)); + } else { + builder.add(pair.input, pair.output); + } + } + FST fst = builder.finish(); + + if (random.nextBoolean() && fst != null) { + IndexOutput out = dir.createOutput("fst.bin"); + fst.save(out); + out.close(); + IndexInput in = dir.openInput("fst.bin"); + try { + fst = new FST(in, outputs); + } finally { + in.close(); + dir.deleteFile("fst.bin"); + } + } + + if (VERBOSE && pairs.size() <= 20 && fst != null) { + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); + Util.toDot(fst, w, false, false); + w.close(); + System.out.println("SAVED out.dot"); + } + + if (VERBOSE) { + if (fst == null) { + System.out.println(" fst has 0 nodes (fully pruned)"); + } else { + System.out.println(" fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs"); + } + } + + if (prune1 == 0 && prune2 == 0) { + verifyUnPruned(inputMode, fst); + } else { + verifyPruned(inputMode, fst, prune1, prune2); + } + + return fst; + } + + // FST is complete + private void verifyUnPruned(int inputMode, FST fst) throws IOException { + + if (pairs.size() == 0) { + assertNull(fst); + return; + } + + if (VERBOSE) { + System.out.println("TEST: now verify " + pairs.size() + " terms"); + for(InputOutput pair : pairs) { + assertNotNull(pair); + assertNotNull(pair.input); + 
assertNotNull(pair.output); + System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output)); + } + } + + assertNotNull(fst); + + // visit valid paris in order -- make sure all words + // are accepted, and FSTEnum's next() steps through + // them correctly + if (VERBOSE) { + System.out.println("TEST: check valid terms/next()"); + } + { + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); + for(InputOutput pair : pairs) { + IntsRef term = pair.input; + if (VERBOSE) { + System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output)); + } + Object output = run(fst, term, null); + + assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output); + assertEquals(pair.output, output); + + // verify enum's next + IntsRefFSTEnum.InputOutput t = fstEnum.next(); + assertNotNull(t); + assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input); + assertEquals(pair.output, t.output); + } + assertNull(fstEnum.next()); + } + + final Map termsMap = new HashMap(); + for(InputOutput pair : pairs) { + termsMap.put(pair.input, pair.output); + } + + // find random matching word and make sure it's valid + if (VERBOSE) { + System.out.println("TEST: verify random accepted terms"); + } + final IntsRef scratch = new IntsRef(10); + for(int iter=0;iter<500*RANDOM_MULTIPLIER;iter++) { + T output = randomAcceptedWord(fst, scratch); + assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch)); + assertEquals(termsMap.get(scratch), output); + } + + // test IntsRefFSTEnum.seek: + if (VERBOSE) { + System.out.println("TEST: verify seek"); + } + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); + for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { + if (VERBOSE) { + System.out.println("TEST: iter=" + iter); + } + if 
(random.nextBoolean()) { + // seek to term that doesn't exist: + while(true) { + final IntsRef term = toIntsRef(getRandomString(), inputMode); + int pos = Collections.binarySearch(pairs, new InputOutput(term, null)); + if (pos < 0) { + pos = -(pos+1); + // ok doesn't exist + //System.out.println(" seek " + inputToString(inputMode, term)); + final IntsRefFSTEnum.InputOutput seekResult; + if (random.nextBoolean()) { + if (VERBOSE) { + System.out.println(" do non-exist seekFloor term=" + inputToString(inputMode, term)); + } + seekResult = fstEnum.seekFloor(term); + pos--; + } else { + if (VERBOSE) { + System.out.println(" do non-exist seekCeil term=" + inputToString(inputMode, term)); + } + seekResult = fstEnum.seekCeil(term); + } + + if (pos != -1 && pos < pairs.size()) { + //System.out.println(" got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output)); + assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult); + if (VERBOSE) { + System.out.println(" got " + inputToString(inputMode, seekResult.input)); + } + assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input); + assertEquals(pairs.get(pos).output, seekResult.output); + } else { + // seeked before start or beyond end + //System.out.println("seek=" + seekTerm); + assertNull("expected null but got " + (seekResult==null ? 
"null" : inputToString(inputMode, seekResult.input)), seekResult); + if (VERBOSE) { + System.out.println(" got null"); + } + } + + break; + } + } + } else { + // seek to term that does exist: + InputOutput pair = pairs.get(random.nextInt(pairs.size())); + final IntsRefFSTEnum.InputOutput seekResult; + if (random.nextBoolean()) { + if (VERBOSE) { + System.out.println(" do exists seekFloor " + inputToString(inputMode, pair.input)); + } + seekResult = fstEnum.seekFloor(pair.input); + } else { + if (VERBOSE) { + System.out.println(" do exists seekCeil " + inputToString(inputMode, pair.input)); + } + seekResult = fstEnum.seekCeil(pair.input); + } + assertNotNull(seekResult); + assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input); + assertEquals(pair.output, seekResult.output); + } + } + + if (VERBOSE) { + System.out.println("TEST: mixed next/seek"); + } + + // test mixed next/seek + for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { + if (VERBOSE) { + System.out.println("TEST: iter " + iter); + } + // reset: + fstEnum = new IntsRefFSTEnum(fst); + int upto = -1; + while(true) { + boolean isDone = false; + if (upto == pairs.size()-1 || random.nextBoolean()) { + // next + upto++; + if (VERBOSE) { + System.out.println(" do next"); + } + isDone = fstEnum.next() == null; + } else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) { + int attempt = 0; + for(;attempt<10;attempt++) { + IntsRef term = toIntsRef(getRandomString(), inputMode); + if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) { + int pos = Collections.binarySearch(pairs, new InputOutput(term, null)); + assert pos < 0; + upto = -(pos+1); + + if (random.nextBoolean()) { + upto--; + assertTrue(upto != -1); + if (VERBOSE) { + System.out.println(" do non-exist seekFloor(" + inputToString(inputMode, term) + ")"); + } + isDone = fstEnum.seekFloor(term) == null; + } else { + 
if (VERBOSE) { + System.out.println(" do non-exist seekCeil(" + inputToString(inputMode, term) + ")"); + } + isDone = fstEnum.seekCeil(term) == null; + } + + break; + } + } + if (attempt == 10) { + continue; + } + + } else { + final int inc = random.nextInt(pairs.size() - upto - 1); + upto += inc; + if (upto == -1) { + upto = 0; + } + + if (random.nextBoolean()) { + if (VERBOSE) { + System.out.println(" do advanceCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")"); + } + isDone = fstEnum.seekCeil(pairs.get(upto).input) == null; + } else { + if (VERBOSE) { + System.out.println(" do advanceFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")"); + } + isDone = fstEnum.seekFloor(pairs.get(upto).input) == null; + } + } + if (VERBOSE) { + if (!isDone) { + System.out.println(" got " + inputToString(inputMode, fstEnum.current().input)); + } else { + System.out.println(" got null"); + } + } + + if (upto == pairs.size()) { + assertTrue(isDone); + break; + } else { + assertFalse(isDone); + assertEquals(pairs.get(upto).input, fstEnum.current().input); + assertEquals(pairs.get(upto).output, fstEnum.current().output); + + /* + if (upto < pairs.size()-1) { + int tryCount = 0; + while(tryCount < 10) { + final IntsRef t = toIntsRef(getRandomString(), inputMode); + if (pairs.get(upto).input.compareTo(t) < 0) { + final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0; + if (VERBOSE) { + System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected); + } + assertEquals(expected, fstEnum.beforeNext(t)); + break; + } + tryCount++; + } + } + */ + } + } + } + } + + private static class CountMinOutput { + int count; + T output; + T finalOutput; + boolean isLeaf = true; + boolean isFinal; + } + + // FST is pruned + private void verifyPruned(int inputMode, FST fst, int prune1, int prune2) throws 
IOException { + + if (VERBOSE) { + System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs); + for(InputOutput pair : pairs) { + System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output)); + } + } + + // To validate the FST, we brute-force compute all prefixes + // in the terms, matched to their "common" outputs, prune that + // set according to the prune thresholds, then assert the FST + // matches that same set. + + // NOTE: Crazy RAM intensive!! + + //System.out.println("TEST: tally prefixes"); + + // build all prefixes + final Map> prefixes = new HashMap>(); + final IntsRef scratch = new IntsRef(10); + for(InputOutput pair: pairs) { + scratch.copy(pair.input); + for(int idx=0;idx<=pair.input.length;idx++) { + scratch.length = idx; + CountMinOutput cmo = prefixes.get(scratch); + if (cmo == null) { + cmo = new CountMinOutput(); + cmo.count = 1; + cmo.output = pair.output; + prefixes.put(new IntsRef(scratch), cmo); + } else { + cmo.count++; + cmo.output = outputs.common(cmo.output, pair.output); + } + if (idx == pair.input.length) { + cmo.isFinal = true; + cmo.finalOutput = cmo.output; + } + } + } + + if (VERBOSE) { + System.out.println("TEST: now prune"); + } + + // prune 'em + final Iterator>> it = prefixes.entrySet().iterator(); + while(it.hasNext()) { + Map.Entry> ent = it.next(); + final IntsRef prefix = ent.getKey(); + final CountMinOutput cmo = ent.getValue(); + if (VERBOSE) { + System.out.println(" term=" + inputToString(inputMode, prefix) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal); + } + final boolean keep; + if (prune1 > 0) { + keep = cmo.count >= prune1; + } else { + assert prune2 > 0; + if (prune2 > 1 && cmo.count >= prune2) { + keep = true; + } else if (prefix.length > 0) { + // consult our parent + scratch.length = prefix.length-1; + System.arraycopy(prefix.ints, prefix.offset, 
scratch.ints, 0, scratch.length); + final CountMinOutput cmo2 = prefixes.get(scratch); + //System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count)); + keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1))); + } else if (cmo.count >= prune2) { + keep = true; + } else { + keep = false; + } + } + + if (!keep) { + it.remove(); + //System.out.println(" remove"); + } else { + // clear isLeaf for all ancestors + //System.out.println(" keep"); + scratch.copy(prefix); + scratch.length--; + while(scratch.length >= 0) { + final CountMinOutput cmo2 = prefixes.get(scratch); + if (cmo2 != null) { + //System.out.println(" clear isLeaf " + inputToString(inputMode, scratch)); + cmo2.isLeaf = false; + } + scratch.length--; + } + } + } + + //System.out.println("TEST: after prune"); + /* + for(Map.Entry ent : prefixes.entrySet()) { + System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal); + if (ent.getValue().isFinal) { + System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput)); + } + } + */ + + if (prefixes.size() <= 1) { + assertNull(fst); + return; + } + + assertNotNull(fst); + + // make sure FST only enums valid prefixes + if (VERBOSE) { + System.out.println("TEST: check pruned enum"); + } + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); + IntsRefFSTEnum.InputOutput current; + while((current = fstEnum.next()) != null) { + if (VERBOSE) { + System.out.println(" fstEnum.next term=" + inputToString(inputMode, current.input) + " output=" + outputs.outputToString(current.output)); + } + final CountMinOutput cmo = prefixes.get(current.input); + assertNotNull(cmo); + assertTrue(cmo.isLeaf || cmo.isFinal); + //if (cmo.isFinal && !cmo.isLeaf) { + if (cmo.isFinal) { + assertEquals(cmo.finalOutput, current.output); + } else { + assertEquals(cmo.output, current.output); + } + } + + // make 
sure all non-pruned prefixes are present in the FST + if (VERBOSE) { + System.out.println("TEST: verify all prefixes"); + } + final int[] stopNode = new int[1]; + for(Map.Entry> ent : prefixes.entrySet()) { + if (ent.getKey().length > 0) { + final CountMinOutput cmo = ent.getValue(); + final T output = run(fst, ent.getKey(), stopNode); + if (VERBOSE) { + System.out.println("TEST: verify term=" + inputToString(inputMode, ent.getKey()) + " output=" + outputs.outputToString(cmo.output)); + } + // if (cmo.isFinal && !cmo.isLeaf) { + if (cmo.isFinal) { + assertEquals(cmo.finalOutput, output); + } else { + assertEquals(cmo.output, output); + } + assertEquals(ent.getKey().length, stopNode[0]); + } + } + } + } + + public void testRandomWords() throws IOException { + testRandomWords(1000, 5 * RANDOM_MULTIPLIER); + //testRandomWords(20, 100); + } + + private String inputModeToString(int mode) { + if (mode == 0) { + return "utf8"; + } else { + return "utf32"; + } + } + + private void testRandomWords(int maxNumWords, int numIter) throws IOException { + for(int iter=0;iter termsSet = new HashSet(); + IntsRef[] terms = new IntsRef[numWords]; + while(termsSet.size() < numWords) { + final String term = getRandomString(); + termsSet.add(toIntsRef(term, inputMode)); + } + doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()])); + } + } + } + + static String getRandomString() { + final String term; + if (random.nextBoolean()) { + term = _TestUtil.randomRealisticUnicodeString(random); + } else { + // we want to mix in limited-alphabet symbols so + // we get more sharing of the nodes given how few + // terms we are testing... 
+ term = simpleRandomString(random); + } + return term; + } + + @Nightly + public void testBigSet() throws IOException { + testRandomWords(50000, RANDOM_MULTIPLIER); + } + + private static String inputToString(int inputMode, IntsRef term) { + if (inputMode == 0) { + // utf8 + return toBytesRef(term).utf8ToString() + " " + term; + } else { + // utf32 + return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term; + } + } + + private static IntsRef toIntsRef(String s) { + final int charCount = s.length(); + IntsRef ir = new IntsRef(charCount); + for(int charIDX=0;charIDX= 0 && ch < 65536); + chars[charIDX] = (char) ch; + } + return new String(chars); + } + + // Build FST for all unique terms in the test line docs + // file, up until a time limit + public void testRealTerms() throws Exception { + + /* + if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) { + // no + CodecProvider.getDefault().setDefaultFieldCodec("Standard"); + } + */ + + final LineFileDocs docs = new LineFileDocs(random); + final int RUN_TIME_SEC = LuceneTestCase.TEST_NIGHTLY ? 100 : 1; + final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64); + final File tempDir = _TestUtil.getTempDir("fstlines"); + final MockDirectoryWrapper dir = new MockDirectoryWrapper(random, FSDirectory.open(tempDir)); + final IndexWriter writer = new IndexWriter(dir, conf); + writer.setInfoStream(VERBOSE ? 
System.out : null); + final long stopTime = System.currentTimeMillis() + RUN_TIME_SEC * 1000; + Document doc; + int docCount = 0; + while((doc = docs.nextDoc()) != null && System.currentTimeMillis() < stopTime) { + writer.addDocument(doc); + docCount++; + } + IndexReader r = IndexReader.open(writer, true); + writer.close(); + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); + Builder builder = new Builder(FST.INPUT_TYPE.BYTE2, 0, 0, true, outputs); + + boolean storeOrd = false; + if (VERBOSE) { + if (storeOrd) { + System.out.println("FST stores ord"); + } else { + System.out.println("FST stores docFreq"); + } + } + TermEnum termEnum = r.terms(new Term("body", "")); + if (VERBOSE) { + System.out.println("TEST: got termEnum=" + termEnum); + } + int ord = 0; + while(true) { + final Term term = termEnum.term(); + if (term == null || !"body".equals(term.field())) { + break; + } + + // No ord in 3.x: + /* + if (ord == 0) { + try { + termsEnum.ord(); + } catch (UnsupportedOperationException uoe) { + if (VERBOSE) { + System.out.println("TEST: codec doesn't support ord; FST stores docFreq"); + } + storeOrd = false; + } + } + */ + final int output; + if (storeOrd) { + output = ord; + } else { + output = termEnum.docFreq(); + } + //System.out.println("ADD: " + term.text() + " ch[0]=" + (term.text().length() == 0 ? 
-1 : term.text().charAt(0))); + builder.add(toIntsRef(term.text()), outputs.get(output)); + ord++; + if (ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) { + System.out.println(ord + " terms..."); + } + termEnum.next(); + } + final FST fst = builder.finish(); + if (VERBOSE) { + System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes"); + } + + if (ord > 0) { + // Now confirm BytesRefFSTEnum and TermEnum act the + // same: + final IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); + for(int iter=0;iter<1000*RANDOM_MULTIPLIER;iter++) { + final String randomTerm = getRandomString(); + + if (VERBOSE) { + System.out.println("TEST: seek " + randomTerm + " ch[0]=" + (randomTerm.length() == 0 ? -1 : randomTerm.charAt(0))); + } + + termEnum = r.terms(new Term("body", randomTerm)); + final IntsRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(toIntsRef(randomTerm)); + + if (termEnum.term() == null || !"body".equals(termEnum.term().field())) { + assertNull("got " + (fstSeekResult == null ? 
"null" : toString(fstSeekResult.input) + " but expected null"), fstSeekResult); + } else { + assertSame(termEnum, fstEnum, storeOrd); + for(int nextIter=0;nextIter<10;nextIter++) { + if (VERBOSE) { + System.out.println("TEST: next"); + //if (storeOrd) { + //System.out.println(" ord=" + termEnum.ord()); + //} + } + termEnum.next(); + if (termEnum.term() != null && "body".equals(termEnum.term().field())) { + if (VERBOSE) { + System.out.println(" term=" + termEnum.term()); + } + assertNotNull(fstEnum.next()); + assertSame(termEnum, fstEnum, storeOrd); + } else { + if (VERBOSE) { + System.out.println(" end!"); + } + IntsRefFSTEnum.InputOutput nextResult = fstEnum.next(); + if (nextResult != null) { + System.out.println("expected null but got: input=" + toString(nextResult.input) + " output=" + outputs.outputToString(nextResult.output)); + fail(); + } + break; + } + } + } + } + } + + r.close(); + dir.close(); + } + + private void assertSame(TermEnum termEnum, IntsRefFSTEnum fstEnum, boolean storeOrd) throws Exception { + if (termEnum.term() == null || !"body".equals(termEnum.term().field())) { + if (fstEnum.current() != null) { + fail("fstEnum.current().input=" + toString(fstEnum.current().input)); + } + } else { + assertNotNull(fstEnum.current()); + assertEquals(termEnum.term() + " != " + toString(fstEnum.current().input), termEnum.term().text(), toString(fstEnum.current().input)); + if (storeOrd) { + // fst stored the ord + // No ord in 3.x + // assertEquals(termEnum.ord(), ((Long) fstEnum.current().output).longValue()); + } else { + // fst stored the docFreq + assertEquals(termEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue())); + } + } + } + + private static abstract class VisitTerms { + private final String dirOut; + private final String wordsFileIn; + private int inputMode; + private final Outputs outputs; + private final Builder builder; + + public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs outputs) { + 
this.dirOut = dirOut; + this.wordsFileIn = wordsFileIn; + this.inputMode = inputMode; + this.outputs = outputs; + + builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, outputs); + } + + protected abstract T getOutput(IntsRef input, int ord) throws IOException; + + public void run(int limit, boolean verify) throws IOException { + BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); + try { + final IntsRef intsRef = new IntsRef(10); + long tStart = System.currentTimeMillis(); + int ord = 0; + while(true) { + String w = is.readLine(); + if (w == null) { + break; + } + toIntsRef(w, inputMode, intsRef); + builder.add(intsRef, + getOutput(intsRef, ord)); + + ord++; + if (ord % 500000 == 0) { + System.out.println( + String.format(Locale.ENGLISH, + "%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord)); + } + if (ord >= limit) { + break; + } + } + + assert builder.getTermCount() == ord; + final FST fst = builder.finish(); + if (fst == null) { + System.out.println("FST was fully pruned!"); + System.exit(0); + } + + if (dirOut == null) + return; + + System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes()); + if (fst.getNodeCount() < 100) { + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); + Util.toDot(fst, w, false, false); + w.close(); + System.out.println("Wrote FST to out.dot"); + } + + Directory dir = FSDirectory.open(new File(dirOut)); + IndexOutput out = dir.createOutput("fst.bin"); + fst.save(out); + out.close(); + + System.out.println("Saved FST to fst.bin."); + + if (!verify) { + return; + } + + System.out.println("\nNow verify..."); + + is.close(); + is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); + + ord = 0; + tStart = 
System.currentTimeMillis(); + while(true) { + String w = is.readLine(); + if (w == null) { + break; + } + toIntsRef(w, inputMode, intsRef); + T expected = getOutput(intsRef, ord); + T actual = Util.get(fst, intsRef); + if (actual == null) { + throw new RuntimeException("unexpected null output on input=" + w); + } + if (!actual.equals(expected)) { + throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w); + } + + ord++; + if (ord % 500000 == 0) { + System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "..."); + } + if (ord >= limit) { + break; + } + } + + double totSec = ((System.currentTimeMillis() - tStart)/1000.0); + System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)"); + + } finally { + is.close(); + } + } + } + + // java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out + public static void main(String[] args) throws IOException { + int prune = 0; + int limit = Integer.MAX_VALUE; + int inputMode = 0; // utf8 + boolean storeOrds = false; + boolean storeDocFreqs = false; + boolean verify = true; + + String wordsFileIn = null; + String dirOut = null; + + int idx = 0; + while (idx < args.length) { + if (args[idx].equals("-prune")) { + prune = Integer.valueOf(args[1 + idx]); + idx++; + } else if (args[idx].equals("-limit")) { + limit = Integer.valueOf(args[1 + idx]); + idx++; + } else if (args[idx].equals("-utf8")) { + inputMode = 0; + } else if (args[idx].equals("-utf32")) { + inputMode = 1; + } else if (args[idx].equals("-docFreq")) { + storeDocFreqs = true; + } else if (args[idx].equals("-ords")) { + storeOrds = true; + } else if (args[idx].equals("-noverify")) { + verify = false; + } else if (args[idx].startsWith("-")) { + System.err.println("Unrecognized option: " + args[idx]); + System.exit(-1); + 
} else { + if (wordsFileIn == null) { + wordsFileIn = args[idx]; + } else if (dirOut == null) { + dirOut = args[idx]; + } else { + System.err.println("Too many arguments, expected: input [output]"); + System.exit(-1); + } + } + idx++; + } + + if (wordsFileIn == null) { + System.err.println("No input file."); + System.exit(-1); + } + + // ord benefits from share, docFreqs don't: + + if (storeOrds && storeDocFreqs) { + // Store both ord & docFreq: + final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true); + final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false); + final PairOutputs outputs = new PairOutputs(o1, o2); + new VisitTerms>(dirOut, wordsFileIn, inputMode, prune, outputs) { + Random rand; + @Override + public PairOutputs.Pair getOutput(IntsRef input, int ord) { + if (ord == 0) { + rand = new Random(17); + } + return new PairOutputs.Pair(o1.get(ord), + o2.get(_TestUtil.nextInt(rand, 1, 5000))); + } + }.run(limit, verify); + } else if (storeOrds) { + // Store only ords + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + @Override + public Long getOutput(IntsRef input, int ord) { + return outputs.get(ord); + } + }.run(limit, verify); + } else if (storeDocFreqs) { + // Store only docFreq + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false); + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + Random rand; + @Override + public Long getOutput(IntsRef input, int ord) { + if (ord == 0) { + rand = new Random(17); + } + return outputs.get(_TestUtil.nextInt(rand, 1, 5000)); + } + }.run(limit, verify); + } else { + // Store nothing + final NoOutputs outputs = NoOutputs.getSingleton(); + final Object NO_OUTPUT = outputs.getNoOutput(); + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + @Override + public Object getOutput(IntsRef input, int ord) { + return NO_OUTPUT; + } + }.run(limit, verify); + } + 
} + + public void testSingleString() throws Exception { + final Outputs outputs = NoOutputs.getSingleton(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + b.add(new BytesRef("foobar"), outputs.getNoOutput()); + final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(b.finish()); + assertNull(fstEnum.seekFloor(new BytesRef("foo"))); + assertNull(fstEnum.seekCeil(new BytesRef("foobaz"))); + } + + public void testSimple() throws Exception { + + // Get outputs -- passing true means FST will share + // (delta code) the outputs. This should result in + // smaller FST if the outputs grow monotonically. But + // if numbers are "random", false should give smaller + // final size: + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + + // Build an FST mapping BytesRef -> Long + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + + final BytesRef a = new BytesRef("a"); + final BytesRef b = new BytesRef("b"); + final BytesRef c = new BytesRef("c"); + + builder.add(a, outputs.get(17)); + builder.add(b, outputs.get(42)); + builder.add(c, outputs.get(13824324872317238L)); + + final FST fst = builder.finish(); + + assertEquals(13824324872317238L, (long) Util.get(fst, c)); + assertEquals(42, (long) Util.get(fst, b)); + assertEquals(17, (long) Util.get(fst, a)); + + BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); + BytesRefFSTEnum.InputOutput seekResult; + seekResult = fstEnum.seekFloor(a); + assertNotNull(seekResult); + assertEquals(17, (long) seekResult.output); + + // goes to a + seekResult = fstEnum.seekFloor(new BytesRef("aa")); + assertNotNull(seekResult); + assertEquals(17, (long) seekResult.output); + + // goes to b + seekResult = fstEnum.seekCeil(new BytesRef("aa")); + assertNotNull(seekResult); + assertEquals(b, seekResult.input); + assertEquals(42, (long) seekResult.output); + } + + /** + * Test state expansion (array format) on close-to-root states. 
Creates + * synthetic input that has one expanded state on each level. + * + * @see "https://issues.apache.org/jira/browse/LUCENE-2933" + */ + public void testExpandedCloseToRoot() throws Exception { + class SyntheticData { + FST compile(String[] lines) throws IOException { + final NoOutputs outputs = NoOutputs.getSingleton(); + final Object nothing = outputs.getNoOutput(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + + int line = 0; + final BytesRef term = new BytesRef(); + while (line < lines.length) { + String w = lines[line++]; + if (w == null) { + break; + } + term.copy(w); + b.add(term, nothing); + } + + return b.finish(); + } + + void generate(ArrayList out, StringBuilder b, char from, char to, + int depth) { + if (depth == 0 || from == to) { + String seq = b.toString() + "_" + out.size() + "_end"; + out.add(seq); + } else { + for (char c = from; c <= to; c++) { + b.append(c); + generate(out, b, from, c == to ? to : from, depth - 1); + b.deleteCharAt(b.length() - 1); + } + } + } + + public int verifyStateAndBelow(FST fst, Arc arc, int depth) + throws IOException { + if (fst.targetHasArcs(arc)) { + int childCount = 0; + for (arc = fst.readFirstTargetArc(arc, arc);; + arc = fst.readNextArc(arc), childCount++) + { + boolean expanded = fst.isExpandedTarget(arc); + int children = verifyStateAndBelow(fst, new FST.Arc().copyFrom(arc), depth + 1); + + assertEquals( + expanded, + (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && + children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) || + children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP); + if (arc.isLast()) break; + } + + return childCount; + } + return 0; + } + } + + // Sanity check. 
+ assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP); + assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0); + + SyntheticData s = new SyntheticData(); + + ArrayList out = new ArrayList(); + StringBuilder b = new StringBuilder(); + s.generate(out, b, 'a', 'i', 10); + String[] input = out.toArray(new String[out.size()]); + Arrays.sort(input); + FST fst = s.compile(input); + FST.Arc arc = fst.getFirstArc(new FST.Arc()); + s.verifyStateAndBelow(fst, arc, 1); + } + + // Make sure raw FST can differentiate between final vs + // non-final end nodes + public void testNonFinalStopNodes() throws Exception { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + final Long nothing = outputs.getNoOutput(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + + final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs); + + final Builder.UnCompiledNode rootNode = new Builder.UnCompiledNode(b, 0); + + // Add final stop node + { + final Builder.UnCompiledNode node = new Builder.UnCompiledNode(b, 0); + node.isFinal = true; + rootNode.addArc('a', node); + final Builder.CompiledNode frozen = new Builder.CompiledNode(); + frozen.address = fst.addNode(node); + rootNode.arcs[0].nextFinalOutput = outputs.get(17); + rootNode.arcs[0].isFinal = true; + rootNode.arcs[0].output = nothing; + rootNode.arcs[0].target = frozen; + } + + // Add non-final stop node + { + final Builder.UnCompiledNode node = new Builder.UnCompiledNode(b, 0); + rootNode.addArc('b', node); + final Builder.CompiledNode frozen = new Builder.CompiledNode(); + frozen.address = fst.addNode(node); + rootNode.arcs[1].nextFinalOutput = nothing; + rootNode.arcs[1].output = outputs.get(42); + rootNode.arcs[1].target = frozen; + } + + fst.finish(fst.addNode(rootNode)); + + checkStopNodes(fst, outputs); + + // Make sure it still works after save/load: + Directory dir = newDirectory(); + IndexOutput out = dir.createOutput("fst"); + fst.save(out); + out.close(); + 
+ IndexInput in = dir.openInput("fst"); + final FST fst2 = new FST(in, outputs); + checkStopNodes(fst2, outputs); + in.close(); + dir.close(); + } + + private void checkStopNodes(FST fst, PositiveIntOutputs outputs) throws Exception { + final Long nothing = outputs.getNoOutput(); + FST.Arc startArc = fst.getFirstArc(new FST.Arc()); + assertEquals(nothing, startArc.output); + assertEquals(nothing, startArc.nextFinalOutput); + + FST.Arc arc = fst.readFirstTargetArc(startArc, new FST.Arc()); + assertEquals('a', arc.label); + assertEquals(17, arc.nextFinalOutput.longValue()); + assertTrue(arc.isFinal()); + + arc = fst.readNextArc(arc); + assertEquals('b', arc.label); + assertFalse(arc.isFinal()); + assertEquals(42, arc.output.longValue()); + } +} Property changes on: lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/index/IndexFormatTooOldException.java =================================================================== --- lucene/src/java/org/apache/lucene/index/IndexFormatTooOldException.java (revision 0) +++ lucene/src/java/org/apache/lucene/index/IndexFormatTooOldException.java (revision 0) @@ -0,0 +1,37 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.index; + +/** + * This exception is thrown when Lucene detects + * an index that is too old for this Lucene version + */ +public class IndexFormatTooOldException extends CorruptIndexException { + + public IndexFormatTooOldException(String filename, String version) { + super("Format version is not supported" + (filename!=null ? (" in file '" + filename + "'") : "") + + ": " + version + ". This version of Lucene only supports indexes created with release 3.0 and later."); + } + + public IndexFormatTooOldException(String filename, int version, int minVersion, int maxVersion) { + super("Format version is not supported" + (filename!=null ? (" in file '" + filename + "'") : "") + + ": " + version + " (needs to be between " + minVersion + " and " + maxVersion + + "). This version of Lucene only supports indexes created with release 3.0 and later."); + } + +} Property changes on: lucene/src/java/org/apache/lucene/index/IndexFormatTooOldException.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/index/IndexFormatTooNewException.java =================================================================== --- lucene/src/java/org/apache/lucene/index/IndexFormatTooNewException.java (revision 0) +++ lucene/src/java/org/apache/lucene/index/IndexFormatTooNewException.java (revision 0) @@ -0,0 +1,31 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.index; + +/** + * This exception is thrown when Lucene detects + * an index that is newer than this Lucene version. + */ +public class IndexFormatTooNewException extends CorruptIndexException { + + public IndexFormatTooNewException(String filename, int version, int minVersion, int maxVersion) { + super("Format version is not supported" + (filename!=null ? (" in file '" + filename + "'") : "") + + ": " + version + " (needs to be between " + minVersion + " and " + maxVersion + ")"); + } + +} Property changes on: lucene/src/java/org/apache/lucene/index/IndexFormatTooNewException.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/store/IndexInput.java =================================================================== --- lucene/src/java/org/apache/lucene/store/IndexInput.java (revision 1127319) +++ lucene/src/java/org/apache/lucene/store/IndexInput.java (working copy) @@ -19,200 +19,15 @@ import java.io.IOException; import java.io.Closeable; -import java.util.Map; -import java.util.HashMap; /** Abstract base class for input from a file in a {@link Directory}. A * random-access input stream. Used for all Lucene index input operations. 
* @see Directory */ -public abstract class IndexInput implements Cloneable,Closeable { +public abstract class IndexInput extends DataInput implements Cloneable,Closeable { - private boolean preUTF8Strings; // true if we are reading old (modified UTF8) string format - protected byte[] copyBuf = null; - /** Reads and returns a single byte. - * @see IndexOutput#writeByte(byte) - */ - public abstract byte readByte() throws IOException; - - /** Reads a specified number of bytes into an array at the specified offset. - * @param b the array to read bytes into - * @param offset the offset in the array to start storing bytes - * @param len the number of bytes to read - * @see IndexOutput#writeBytes(byte[],int) - */ - public abstract void readBytes(byte[] b, int offset, int len) - throws IOException; - - /** Reads a specified number of bytes into an array at the - * specified offset with control over whether the read - * should be buffered (callers who have their own buffer - * should pass in "false" for useBuffer). Currently only - * {@link BufferedIndexInput} respects this parameter. - * @param b the array to read bytes into - * @param offset the offset in the array to start storing bytes - * @param len the number of bytes to read - * @param useBuffer set to false if the caller will handle - * buffering. - * @see IndexOutput#writeBytes(byte[],int) - */ - public void readBytes(byte[] b, int offset, int len, boolean useBuffer) - throws IOException { - // Default to ignoring useBuffer entirely - readBytes(b, offset, len); - } - - /** Reads four bytes and returns an int. - * @see IndexOutput#writeInt(int) - */ - public int readInt() throws IOException { - return ((readByte() & 0xFF) << 24) | ((readByte() & 0xFF) << 16) - | ((readByte() & 0xFF) << 8) | (readByte() & 0xFF); - } - - /** Reads an int stored in variable-length format. Reads between one and - * five bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. 
- * @see IndexOutput#writeVInt(int) - */ - public int readVInt() throws IOException { - /* This is the original code of this method, - * but a Hotspot bug (see LUCENE-2975) corrupts the for-loop if - * readByte() is inlined. So the loop was unwinded! - byte b = readByte(); - int i = b & 0x7F; - for (int shift = 7; (b & 0x80) != 0; shift += 7) { - b = readByte(); - i |= (b & 0x7F) << shift; - } - return i; - */ - byte b = readByte(); - int i = b & 0x7F; - if ((b & 0x80) == 0) return i; - b = readByte(); - i |= (b & 0x7F) << 7; - if ((b & 0x80) == 0) return i; - b = readByte(); - i |= (b & 0x7F) << 14; - if ((b & 0x80) == 0) return i; - b = readByte(); - i |= (b & 0x7F) << 21; - if ((b & 0x80) == 0) return i; - b = readByte(); - assert (b & 0x80) == 0; - return i | ((b & 0x7F) << 28); - } - - /** Reads eight bytes and returns a long. - * @see IndexOutput#writeLong(long) - */ - public long readLong() throws IOException { - return (((long)readInt()) << 32) | (readInt() & 0xFFFFFFFFL); - } - - /** Reads a long stored in variable-length format. Reads between one and - * nine bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. */ - public long readVLong() throws IOException { - /* This is the original code of this method, - * but a Hotspot bug (see LUCENE-2975) corrupts the for-loop if - * readByte() is inlined. So the loop was unwinded! 
- byte b = readByte(); - long i = b & 0x7F; - for (int shift = 7; (b & 0x80) != 0; shift += 7) { - b = readByte(); - i |= (b & 0x7FL) << shift; - } - return i; - */ - byte b = readByte(); - long i = b & 0x7FL; - if ((b & 0x80) == 0) return i; - b = readByte(); - i |= (b & 0x7FL) << 7; - if ((b & 0x80) == 0) return i; - b = readByte(); - i |= (b & 0x7FL) << 14; - if ((b & 0x80) == 0) return i; - b = readByte(); - i |= (b & 0x7FL) << 21; - if ((b & 0x80) == 0) return i; - b = readByte(); - i |= (b & 0x7FL) << 28; - if ((b & 0x80) == 0) return i; - b = readByte(); - i |= (b & 0x7FL) << 35; - if ((b & 0x80) == 0) return i; - b = readByte(); - i |= (b & 0x7FL) << 42; - if ((b & 0x80) == 0) return i; - b = readByte(); - i |= (b & 0x7FL) << 49; - if ((b & 0x80) == 0) return i; - b = readByte(); - assert (b & 0x80) == 0; - return i | ((b & 0x7FL) << 56); - } - - /** Call this if readString should read characters stored - * in the old modified UTF8 format (length in java chars - * and java's modified UTF8 encoding). This is used for - * indices written pre-2.4 See LUCENE-510 for details. */ - public void setModifiedUTF8StringsMode() { - preUTF8Strings = true; - } - - /** Reads a string. - * @see IndexOutput#writeString(String) - */ - public String readString() throws IOException { - if (preUTF8Strings) - return readModifiedUTF8String(); - int length = readVInt(); - final byte[] bytes = new byte[length]; - readBytes(bytes, 0, length); - return new String(bytes, 0, length, "UTF-8"); - } - - private String readModifiedUTF8String() throws IOException { - int length = readVInt(); - final char[] chars = new char[length]; - readChars(chars, 0, length); - return new String(chars, 0, length); - } - - /** Reads Lucene's old "modified UTF-8" encoded - * characters into an array. 
- * @param buffer the array to read characters into - * @param start the offset in the array to start storing characters - * @param length the number of characters to read - * @see IndexOutput#writeChars(String,int,int) - * @deprecated -- please use readString or readBytes - * instead, and construct the string - * from those utf8 bytes - */ - @Deprecated - public void readChars(char[] buffer, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - byte b = readByte(); - if ((b & 0x80) == 0) - buffer[i] = (char)(b & 0x7F); - else if ((b & 0xE0) != 0xE0) { - buffer[i] = (char)(((b & 0x1F) << 6) - | (readByte() & 0x3F)); - } else { - buffer[i] = (char)(((b & 0x0F) << 12) - | ((readByte() & 0x3F) << 6) - | (readByte() & 0x3F)); - } - } - } - /** * Expert * @@ -239,7 +54,6 @@ } } } - /** Closes the stream to further operations. */ public abstract void close() throws IOException; @@ -258,37 +72,6 @@ /** The number of bytes in the file. */ public abstract long length(); - /** Returns a clone of this stream. - * - *

Clones of a stream access the same data, and are positioned at the same - * point as the stream they were cloned from. - * - *

Expert: Subclasses must ensure that clones may be positioned at - * different points in the input from each other and from the stream they - * were cloned from. - */ - @Override - public Object clone() { - IndexInput clone = null; - try { - clone = (IndexInput)super.clone(); - } catch (CloneNotSupportedException e) {} - - return clone; - } - - public Map readStringStringMap() throws IOException { - final Map map = new HashMap(); - final int count = readInt(); - for(int i=0;inumBytes bytes to the given {@link IndexOutput}. *

Index: lucene/src/java/org/apache/lucene/store/IndexOutput.java =================================================================== --- lucene/src/java/org/apache/lucene/store/IndexOutput.java (revision 1127319) +++ lucene/src/java/org/apache/lucene/store/IndexOutput.java (working copy) @@ -19,166 +19,14 @@ import java.io.IOException; import java.io.Closeable; -import java.util.Map; -import org.apache.lucene.util.UnicodeUtil; /** Abstract base class for output to a file in a Directory. A random-access * output stream. Used for all Lucene index output operations. * @see Directory * @see IndexInput */ -public abstract class IndexOutput implements Closeable { +public abstract class IndexOutput extends DataOutput implements Closeable { - /** Writes a single byte. - * @see IndexInput#readByte() - */ - public abstract void writeByte(byte b) throws IOException; - - /** Writes an array of bytes. - * @param b the bytes to write - * @param length the number of bytes to write - * @see IndexInput#readBytes(byte[],int,int) - */ - public void writeBytes(byte[] b, int length) throws IOException { - writeBytes(b, 0, length); - } - - /** Writes an array of bytes. - * @param b the bytes to write - * @param offset the offset in the byte array - * @param length the number of bytes to write - * @see IndexInput#readBytes(byte[],int,int) - */ - public abstract void writeBytes(byte[] b, int offset, int length) throws IOException; - - /** Writes an int as four bytes. - * @see IndexInput#readInt() - */ - public void writeInt(int i) throws IOException { - writeByte((byte)(i >> 24)); - writeByte((byte)(i >> 16)); - writeByte((byte)(i >> 8)); - writeByte((byte) i); - } - - /** Writes an int in a variable-length format. Writes between one and - * five bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. 
- * @see IndexInput#readVInt() - */ - public void writeVInt(int i) throws IOException { - while ((i & ~0x7F) != 0) { - writeByte((byte)((i & 0x7f) | 0x80)); - i >>>= 7; - } - writeByte((byte)i); - } - - /** Writes a long as eight bytes. - * @see IndexInput#readLong() - */ - public void writeLong(long i) throws IOException { - writeInt((int) (i >> 32)); - writeInt((int) i); - } - - /** Writes an long in a variable-length format. Writes between one and nine - * bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. - * @see IndexInput#readVLong() - */ - public void writeVLong(long i) throws IOException { - while ((i & ~0x7F) != 0) { - writeByte((byte)((i & 0x7f) | 0x80)); - i >>>= 7; - } - writeByte((byte)i); - } - - /** Writes a string. - * @see IndexInput#readString() - */ - public void writeString(String s) throws IOException { - final UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result(); - UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); - writeVInt(utf8Result.length); - writeBytes(utf8Result.result, 0, utf8Result.length); - } - - /** Writes a sub sequence of characters from s as the old - * format (modified UTF-8 encoded bytes). 
- * @param s the source of the characters - * @param start the first character in the sequence - * @param length the number of characters in the sequence - * @deprecated -- please pre-convert to utf8 bytes - * instead or use {@link #writeString} - */ - @Deprecated - public void writeChars(String s, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - final int code = s.charAt(i); - if (code >= 0x01 && code <= 0x7F) - writeByte((byte)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { - writeByte((byte)(0xC0 | (code >> 6))); - writeByte((byte)(0x80 | (code & 0x3F))); - } else { - writeByte((byte)(0xE0 | (code >>> 12))); - writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); - writeByte((byte)(0x80 | (code & 0x3F))); - } - } - } - - /** Writes a sub sequence of characters from char[] as - * the old format (modified UTF-8 encoded bytes). - * @param s the source of the characters - * @param start the first character in the sequence - * @param length the number of characters in the sequence - * @deprecated -- please pre-convert to utf8 bytes instead or use {@link #writeString} - */ - @Deprecated - public void writeChars(char[] s, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - final int code = s[i]; - if (code >= 0x01 && code <= 0x7F) - writeByte((byte)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { - writeByte((byte)(0xC0 | (code >> 6))); - writeByte((byte)(0x80 | (code & 0x3F))); - } else { - writeByte((byte)(0xE0 | (code >>> 12))); - writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); - writeByte((byte)(0x80 | (code & 0x3F))); - } - } - } - - private static int COPY_BUFFER_SIZE = 16384; - private byte[] copyBuffer; - - /** Copy numBytes bytes from input to ourself. 
*/ - public void copyBytes(IndexInput input, long numBytes) throws IOException { - assert numBytes >= 0: "numBytes=" + numBytes; - long left = numBytes; - if (copyBuffer == null) - copyBuffer = new byte[COPY_BUFFER_SIZE]; - while(left > 0) { - final int toCopy; - if (left > COPY_BUFFER_SIZE) - toCopy = COPY_BUFFER_SIZE; - else - toCopy = (int) left; - input.readBytes(copyBuffer, 0, toCopy); - writeBytes(copyBuffer, 0, toCopy); - left -= toCopy; - } - } - /** Forces any buffered output to be written. */ public abstract void flush() throws IOException; @@ -210,16 +58,4 @@ * @param length file length */ public void setLength(long length) throws IOException {} - - public void writeStringStringMap(Map map) throws IOException { - if (map == null) { - writeInt(0); - } else { - writeInt(map.size()); - for(final Map.Entry entry: map.entrySet()) { - writeString(entry.getKey()); - writeString(entry.getValue()); - } - } - } } Index: lucene/src/java/org/apache/lucene/store/RAMOutputStream.java =================================================================== --- lucene/src/java/org/apache/lucene/store/RAMOutputStream.java (revision 1127319) +++ lucene/src/java/org/apache/lucene/store/RAMOutputStream.java (working copy) @@ -162,7 +162,7 @@ } @Override - public void copyBytes(IndexInput input, long numBytes) throws IOException { + public void copyBytes(DataInput input, long numBytes) throws IOException { assert numBytes >= 0: "numBytes=" + numBytes; while (numBytes > 0) { Index: lucene/src/java/org/apache/lucene/store/DataInput.java =================================================================== --- lucene/src/java/org/apache/lucene/store/DataInput.java (revision 0) +++ lucene/src/java/org/apache/lucene/store/DataInput.java (revision 0) @@ -0,0 +1,251 @@ +package org.apache.lucene.store; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +/** + * Abstract base class for performing read operations of Lucene's low-level + * data types. + */ +public abstract class DataInput implements Cloneable { + + private boolean preUTF8Strings; // true if we are reading old (modified UTF8) string format + + /** Call this if readString should read characters stored + * in the old modified UTF8 format (length in java chars + * and java's modified UTF8 encoding). This is used for + * indices written pre-2.4 See LUCENE-510 for details. */ + public void setModifiedUTF8StringsMode() { + preUTF8Strings = true; + } + + /** Reads and returns a single byte. + * @see DataOutput#writeByte(byte) + */ + public abstract byte readByte() throws IOException; + + /** Reads a specified number of bytes into an array at the specified offset. 
+ * @param b the array to read bytes into + * @param offset the offset in the array to start storing bytes + * @param len the number of bytes to read + * @see DataOutput#writeBytes(byte[],int) + */ + public abstract void readBytes(byte[] b, int offset, int len) + throws IOException; + + /** Reads a specified number of bytes into an array at the + * specified offset with control over whether the read + * should be buffered (callers who have their own buffer + * should pass in "false" for useBuffer). Currently only + * {@link BufferedIndexInput} respects this parameter. + * @param b the array to read bytes into + * @param offset the offset in the array to start storing bytes + * @param len the number of bytes to read + * @param useBuffer set to false if the caller will handle + * buffering. + * @see DataOutput#writeBytes(byte[],int) + */ + public void readBytes(byte[] b, int offset, int len, boolean useBuffer) + throws IOException + { + // Default to ignoring useBuffer entirely + readBytes(b, offset, len); + } + + /** Reads two bytes and returns a short. + * @see DataOutput#writeByte(byte) + */ + public short readShort() throws IOException { + return (short) (((readByte() & 0xFF) << 8) | (readByte() & 0xFF)); + } + + /** Reads four bytes and returns an int. + * @see DataOutput#writeInt(int) + */ + public int readInt() throws IOException { + return ((readByte() & 0xFF) << 24) | ((readByte() & 0xFF) << 16) + | ((readByte() & 0xFF) << 8) | (readByte() & 0xFF); + } + + /** Reads an int stored in variable-length format. Reads between one and + * five bytes. Smaller values take fewer bytes. Negative numbers are not + * supported. + * @see DataOutput#writeVInt(int) + */ + public int readVInt() throws IOException { + /* This is the original code of this method, + * but a Hotspot bug (see LUCENE-2975) corrupts the for-loop if + * readByte() is inlined. So the loop was unwinded! 
+ byte b = readByte(); + int i = b & 0x7F; + for (int shift = 7; (b & 0x80) != 0; shift += 7) { + b = readByte(); + i |= (b & 0x7F) << shift; + } + return i; + */ + byte b = readByte(); + int i = b & 0x7F; + if ((b & 0x80) == 0) return i; + b = readByte(); + i |= (b & 0x7F) << 7; + if ((b & 0x80) == 0) return i; + b = readByte(); + i |= (b & 0x7F) << 14; + if ((b & 0x80) == 0) return i; + b = readByte(); + i |= (b & 0x7F) << 21; + if ((b & 0x80) == 0) return i; + b = readByte(); + assert (b & 0x80) == 0; + return i | ((b & 0x7F) << 28); + } + + /** Reads eight bytes and returns a long. + * @see DataOutput#writeLong(long) + */ + public long readLong() throws IOException { + return (((long)readInt()) << 32) | (readInt() & 0xFFFFFFFFL); + } + + /** Reads a long stored in variable-length format. Reads between one and + * nine bytes. Smaller values take fewer bytes. Negative numbers are not + * supported. */ + public long readVLong() throws IOException { + /* This is the original code of this method, + * but a Hotspot bug (see LUCENE-2975) corrupts the for-loop if + * readByte() is inlined. So the loop was unwinded! 
+ byte b = readByte(); + long i = b & 0x7F; + for (int shift = 7; (b & 0x80) != 0; shift += 7) { + b = readByte(); + i |= (b & 0x7FL) << shift; + } + return i; + */ + byte b = readByte(); + long i = b & 0x7FL; + if ((b & 0x80) == 0) return i; + b = readByte(); + i |= (b & 0x7FL) << 7; + if ((b & 0x80) == 0) return i; + b = readByte(); + i |= (b & 0x7FL) << 14; + if ((b & 0x80) == 0) return i; + b = readByte(); + i |= (b & 0x7FL) << 21; + if ((b & 0x80) == 0) return i; + b = readByte(); + i |= (b & 0x7FL) << 28; + if ((b & 0x80) == 0) return i; + b = readByte(); + i |= (b & 0x7FL) << 35; + if ((b & 0x80) == 0) return i; + b = readByte(); + i |= (b & 0x7FL) << 42; + if ((b & 0x80) == 0) return i; + b = readByte(); + i |= (b & 0x7FL) << 49; + if ((b & 0x80) == 0) return i; + b = readByte(); + assert (b & 0x80) == 0; + return i | ((b & 0x7FL) << 56); + } + + /** Reads a string. + * @see DataOutput#writeString(String) + */ + public String readString() throws IOException { + if (preUTF8Strings) + return readModifiedUTF8String(); + int length = readVInt(); + final byte[] bytes = new byte[length]; + readBytes(bytes, 0, length); + return new String(bytes, 0, length, "UTF-8"); + } + + private String readModifiedUTF8String() throws IOException { + int length = readVInt(); + final char[] chars = new char[length]; + readChars(chars, 0, length); + return new String(chars, 0, length); + } + + /** Reads Lucene's old "modified UTF-8" encoded + * characters into an array. 
+ * @param buffer the array to read characters into + * @param start the offset in the array to start storing characters + * @param length the number of characters to read + * @see DataOutput#writeChars(String,int,int) + * @deprecated -- please use readString or readBytes + * instead, and construct the string + * from those utf8 bytes + */ + @Deprecated + public void readChars(char[] buffer, int start, int length) + throws IOException { + final int end = start + length; + for (int i = start; i < end; i++) { + byte b = readByte(); + if ((b & 0x80) == 0) + buffer[i] = (char)(b & 0x7F); + else if ((b & 0xE0) != 0xE0) { + buffer[i] = (char)(((b & 0x1F) << 6) + | (readByte() & 0x3F)); + } else { + buffer[i] = (char)(((b & 0x0F) << 12) + | ((readByte() & 0x3F) << 6) + | (readByte() & 0x3F)); + } + } + } + + /** Returns a clone of this stream. + * + *

Clones of a stream access the same data, and are positioned at the same + * point as the stream they were cloned from. + * + *

Expert: Subclasses must ensure that clones may be positioned at + * different points in the input from each other and from the stream they + * were cloned from. + */ + @Override + public Object clone() { + DataInput clone = null; + try { + clone = (DataInput)super.clone(); + } catch (CloneNotSupportedException e) {} + + return clone; + } + + public Map readStringStringMap() throws IOException { + final Map map = new HashMap(); + final int count = readInt(); + for(int i=0;i> 24)); + writeByte((byte)(i >> 16)); + writeByte((byte)(i >> 8)); + writeByte((byte) i); + } + + /** Writes an int in a variable-length format. Writes between one and + * five bytes. Smaller values take fewer bytes. Negative numbers are not + * supported. + * @see DataInput#readVInt() + */ + public final void writeVInt(int i) throws IOException { + while ((i & ~0x7F) != 0) { + writeByte((byte)((i & 0x7f) | 0x80)); + i >>>= 7; + } + writeByte((byte)i); + } + + /** Writes a long as eight bytes. + * @see DataInput#readLong() + */ + public void writeLong(long i) throws IOException { + writeInt((int) (i >> 32)); + writeInt((int) i); + } + + /** Writes an long in a variable-length format. Writes between one and nine + * bytes. Smaller values take fewer bytes. Negative numbers are not + * supported. + * @see DataInput#readVLong() + */ + public final void writeVLong(long i) throws IOException { + while ((i & ~0x7F) != 0) { + writeByte((byte)((i & 0x7f) | 0x80)); + i >>>= 7; + } + writeByte((byte)i); + } + + /** Writes a string. + * @see DataInput#readString() + */ + public void writeString(String s) throws IOException { + final BytesRef utf8Result = new BytesRef(10); + UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); + writeVInt(utf8Result.length); + writeBytes(utf8Result.bytes, 0, utf8Result.length); + } + + private static int COPY_BUFFER_SIZE = 16384; + private byte[] copyBuffer; + + /** Copy numBytes bytes from input to ourself. 
*/ + public void copyBytes(DataInput input, long numBytes) throws IOException { + assert numBytes >= 0: "numBytes=" + numBytes; + long left = numBytes; + if (copyBuffer == null) + copyBuffer = new byte[COPY_BUFFER_SIZE]; + while(left > 0) { + final int toCopy; + if (left > COPY_BUFFER_SIZE) + toCopy = COPY_BUFFER_SIZE; + else + toCopy = (int) left; + input.readBytes(copyBuffer, 0, toCopy); + writeBytes(copyBuffer, 0, toCopy); + left -= toCopy; + } + } + + /** Writes a sub sequence of characters from s as the old + * format (modified UTF-8 encoded bytes). + * @param s the source of the characters + * @param start the first character in the sequence + * @param length the number of characters in the sequence + * @deprecated -- please pre-convert to utf8 bytes + * instead or use {@link #writeString} + */ + @Deprecated + public void writeChars(String s, int start, int length) + throws IOException { + final int end = start + length; + for (int i = start; i < end; i++) { + final int code = s.charAt(i); + if (code >= 0x01 && code <= 0x7F) + writeByte((byte)code); + else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { + writeByte((byte)(0xC0 | (code >> 6))); + writeByte((byte)(0x80 | (code & 0x3F))); + } else { + writeByte((byte)(0xE0 | (code >>> 12))); + writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); + writeByte((byte)(0x80 | (code & 0x3F))); + } + } + } + + /** Writes a sub sequence of characters from char[] as + * the old format (modified UTF-8 encoded bytes). 
+ * @param s the source of the characters + * @param start the first character in the sequence + * @param length the number of characters in the sequence + * @deprecated -- please pre-convert to utf8 bytes instead or use {@link #writeString} + */ + @Deprecated + public void writeChars(char[] s, int start, int length) + throws IOException { + final int end = start + length; + for (int i = start; i < end; i++) { + final int code = s[i]; + if (code >= 0x01 && code <= 0x7F) + writeByte((byte)code); + else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { + writeByte((byte)(0xC0 | (code >> 6))); + writeByte((byte)(0x80 | (code & 0x3F))); + } else { + writeByte((byte)(0xE0 | (code >>> 12))); + writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); + writeByte((byte)(0x80 | (code & 0x3F))); + } + } + } + + public void writeStringStringMap(Map map) throws IOException { + if (map == null) { + writeInt(0); + } else { + writeInt(map.size()); + for(final Map.Entry entry: map.entrySet()) { + writeString(entry.getKey()); + writeString(entry.getValue()); + } + } + } +} Property changes on: lucene/src/java/org/apache/lucene/store/DataOutput.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/CodecUtil.java =================================================================== --- lucene/src/java/org/apache/lucene/util/CodecUtil.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/CodecUtil.java (revision 0) @@ -0,0 +1,79 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexFormatTooNewException; +import org.apache.lucene.index.IndexFormatTooOldException; + +import java.io.IOException; + +/** + * @lucene.experimental + */ + +public final class CodecUtil { + private CodecUtil() {} // no instance + + private final static int CODEC_MAGIC = 0x3fd76c17; + + public static DataOutput writeHeader(DataOutput out, String codec, int version) + throws IOException { + BytesRef bytes = new BytesRef(codec); + if (bytes.length != codec.length() || bytes.length >= 128) { + throw new IllegalArgumentException("codec must be simple ASCII, less than 128 characters in length [got " + codec + "]"); + } + out.writeInt(CODEC_MAGIC); + out.writeString(codec); + out.writeInt(version); + + return out; + } + + public static int headerLength(String codec) { + return 9+codec.length(); + } + + public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion) + throws IOException { + + // Safety to guard against reading a bogus string: + final int actualHeader = in.readInt(); + if (actualHeader != CODEC_MAGIC) { + throw new CorruptIndexException("codec header mismatch: actual header=" + actualHeader + " vs expected header=" + CODEC_MAGIC); + } + + final String actualCodec = in.readString(); + if (!actualCodec.equals(codec)) { + throw new CorruptIndexException("codec mismatch: actual codec=" + actualCodec + " vs expected codec=" + codec); + } + + 
final int actualVersion = in.readInt(); + if (actualVersion < minVersion) { + throw new IndexFormatTooOldException(null, actualVersion, minVersion, maxVersion); + } + if (actualVersion > maxVersion) { + throw new IndexFormatTooNewException(null, actualVersion, minVersion, maxVersion); + } + + return actualVersion; + } +} Property changes on: lucene/src/java/org/apache/lucene/util/CodecUtil.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/IntsRef.java =================================================================== --- lucene/src/java/org/apache/lucene/util/IntsRef.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/IntsRef.java (revision 0) @@ -0,0 +1,140 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Represents int[], as a slice (offset + length) into an + * existing int[]. 
+ * + * @lucene.internal */ +public final class IntsRef implements Comparable { + + public int[] ints; + public int offset; + public int length; + + public IntsRef() { + } + + public IntsRef(int capacity) { + ints = new int[capacity]; + } + + public IntsRef(int[] ints, int offset, int length) { + this.ints = ints; + this.offset = offset; + this.length = length; + } + + public IntsRef(IntsRef other) { + copy(other); + } + + @Override + public Object clone() { + return new IntsRef(this); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 0; + final int end = offset + length; + for(int i = offset; i < end; i++) { + result = prime * result + ints[i]; + } + return result; + } + + @Override + public boolean equals(Object other) { + return this.intsEquals((IntsRef) other); + } + + public boolean intsEquals(IntsRef other) { + if (length == other.length) { + int otherUpto = other.offset; + final int[] otherInts = other.ints; + final int end = offset + length; + for(int upto=offset;upto bInt) { + return 1; + } else if (aInt < bInt) { + return -1; + } + } + + // One is a prefix of the other, or, they are equal: + return this.length - other.length; + } + + public void copy(IntsRef other) { + if (ints == null) { + ints = new int[other.length]; + } else { + ints = ArrayUtil.grow(ints, other.length); + } + System.arraycopy(other.ints, other.offset, ints, 0, other.length); + length = other.length; + offset = 0; + } + + public void grow(int newLength) { + if (ints.length < newLength) { + ints = ArrayUtil.grow(ints, newLength); + } + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append('['); + final int end = offset + length; + for(int i=offset;i offset) { + sb.append(' '); + } + sb.append(Integer.toHexString(ints[i])); + } + sb.append(']'); + return sb.toString(); + } +} Property changes on: lucene/src/java/org/apache/lucene/util/IntsRef.java 
___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java (revision 0) @@ -0,0 +1,118 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + +/** + * Pairs up two outputs into one. 
+ * + * @lucene.experimental + */ + +public class PairOutputs extends Outputs> { + + private final Pair NO_OUTPUT; + private final Outputs outputs1; + private final Outputs outputs2; + + public static class Pair { + public final A output1; + public final B output2; + + public Pair(A output1, B output2) { + this.output1 = output1; + this.output2 = output2; + } + + @Override @SuppressWarnings("rawtypes") + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof Pair) { + Pair pair = (Pair) other; + return output1.equals(pair.output1) && output2.equals(pair.output2); + } else { + return false; + } + } + + @Override + public int hashCode() { + return output1.hashCode() + output2.hashCode(); + } + }; + + public PairOutputs(Outputs outputs1, Outputs outputs2) { + this.outputs1 = outputs1; + this.outputs2 = outputs2; + NO_OUTPUT = new Pair(outputs1.getNoOutput(), outputs2.getNoOutput()); + } + + public Pair get(A output1, B output2) { + if (output1 == outputs1.getNoOutput() && output2 == outputs2.getNoOutput()) { + return NO_OUTPUT; + } else { + return new Pair(output1, output2); + } + } + + @Override + public Pair common(Pair pair1, Pair pair2) { + return get(outputs1.common(pair1.output1, pair2.output1), + outputs2.common(pair1.output2, pair2.output2)); + } + + @Override + public Pair subtract(Pair output, Pair inc) { + return get(outputs1.subtract(output.output1, inc.output1), + outputs2.subtract(output.output2, inc.output2)); + } + + @Override + public Pair add(Pair prefix, Pair output) { + return get(outputs1.add(prefix.output1, output.output1), + outputs2.add(prefix.output2, output.output2)); + } + + @Override + public void write(Pair output, DataOutput writer) throws IOException { + outputs1.write(output.output1, writer); + outputs2.write(output.output2, writer); + } + + @Override + public Pair read(DataInput in) throws IOException { + A output1 = outputs1.read(in); + B output2 = outputs2.read(in); + return 
get(output1, output2); + } + + @Override + public Pair getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(Pair output) { + return ""; + } +} Property changes on: lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/fst/ByteSequenceOutputs.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/ByteSequenceOutputs.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/fst/ByteSequenceOutputs.java (revision 0) @@ -0,0 +1,138 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.BytesRef; + +/** + * Output is a sequence of bytes, for each input term. 
+ * + * @lucene.experimental + */ + +public final class ByteSequenceOutputs extends Outputs { + + private final static BytesRef NO_OUTPUT = new BytesRef(); + + private ByteSequenceOutputs() { + } + + public static ByteSequenceOutputs getSingleton() { + return new ByteSequenceOutputs(); + } + + @Override + public BytesRef common(BytesRef output1, BytesRef output2) { + assert output1 != null; + assert output2 != null; + + int pos1 = output1.offset; + int pos2 = output2.offset; + int stopAt1 = pos1 + Math.min(output1.length, output2.length); + while(pos1 < stopAt1) { + if (output1.bytes[pos1] != output2.bytes[pos2]) { + break; + } + pos1++; + pos2++; + } + + if (pos1 == output1.offset) { + // no common prefix + return NO_OUTPUT; + } else if (pos1 == output1.offset + output1.length) { + // output1 is a prefix of output2 + return output1; + } else if (pos2 == output2.offset + output2.length) { + // output2 is a prefix of output1 + return output2; + } else { + return new BytesRef(output1.bytes, output1.offset, pos1-output1.offset); + } + } + + @Override + public BytesRef subtract(BytesRef output, BytesRef inc) { + assert output != null; + assert inc != null; + if (inc == NO_OUTPUT) { + // no prefix removed + return output; + } else if (inc.length == output.length) { + // entire output removed + return NO_OUTPUT; + } else { + assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length; + assert inc.length > 0; + return new BytesRef(output.bytes, output.offset + inc.length, output.length-inc.length); + } + } + + @Override + public BytesRef add(BytesRef prefix, BytesRef output) { + assert prefix != null; + assert output != null; + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + assert prefix.length > 0; + assert output.length > 0; + BytesRef result = new BytesRef(prefix.length + output.length); + System.arraycopy(prefix.bytes, prefix.offset, result.bytes, 0, 
prefix.length); + System.arraycopy(output.bytes, output.offset, result.bytes, prefix.length, output.length); + result.length = prefix.length + output.length; + return result; + } + } + + @Override + public void write(BytesRef prefix, DataOutput out) throws IOException { + assert prefix != null; + out.writeVInt(prefix.length); + out.writeBytes(prefix.bytes, prefix.offset, prefix.length); + } + + @Override + public BytesRef read(DataInput in) throws IOException { + final int len = in.readVInt(); + if (len == 0) { + return NO_OUTPUT; + } else { + final BytesRef output = new BytesRef(len); + in.readBytes(output.bytes, 0, len); + output.length = len; + return output; + } + } + + @Override + public BytesRef getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(BytesRef output) { + return output.utf8ToString(); + } +} Property changes on: lucene/src/java/org/apache/lucene/util/fst/ByteSequenceOutputs.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/fst/IntsRefFSTEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/IntsRefFSTEnum.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/fst/IntsRefFSTEnum.java (revision 0) @@ -0,0 +1,108 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.IntsRef; + +import java.io.IOException; + +/** Can next() and advance() through the terms in an FST + * + * @lucene.experimental +*/ + +public final class IntsRefFSTEnum extends FSTEnum { + private final IntsRef current = new IntsRef(10); + private final InputOutput result = new InputOutput(); + private IntsRef target; + + public static class InputOutput { + public IntsRef input; + public T output; + } + + /** doFloor controls the behavior of advance: if it's true + * doFloor is true, advance positions to the biggest + * term before target. */ + public IntsRefFSTEnum(FST fst) { + super(fst); + result.input = current; + current.offset = 1; + } + + public InputOutput current() { + return result; + } + + public InputOutput next() throws IOException { + //System.out.println(" enum.next"); + doNext(); + return setResult(); + } + + /** Seeks to smallest term that's >= target. */ + public InputOutput seekCeil(IntsRef target) throws IOException { + this.target = target; + targetLength = target.length; + super.doSeekCeil(); + return setResult(); + } + + /** Seeks to biggest term that's <= target. 
*/ + public InputOutput seekFloor(IntsRef target) throws IOException { + this.target = target; + targetLength = target.length; + super.doSeekFloor(); + return setResult(); + } + + @Override + protected int getTargetLabel() { + if (upto-1 == target.length) { + return FST.END_LABEL; + } else { + return target.ints[target.offset + upto - 1]; + } + } + + @Override + protected int getCurrentLabel() { + // current.offset fixed at 1 + return current.ints[upto]; + } + + @Override + protected void setCurrentLabel(int label) { + current.ints[upto] = label; + } + + @Override + protected void grow() { + current.grow(upto+1); + } + + private InputOutput setResult() { + if (upto == 0) { + return null; + } else { + current.length = upto-1; + result.output = output[upto]; + return result; + } + } +} Property changes on: lucene/src/java/org/apache/lucene/util/fst/IntsRefFSTEnum.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/fst/Util.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/Util.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/fst/Util.java (revision 0) @@ -0,0 +1,328 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.*; +import java.util.*; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + +/** Static helper methods + * + * @lucene.experimental */ +public final class Util { + private Util() { + } + + /** Looks up the output for this input, or null if the + * input is not accepted. FST must be + * INPUT_TYPE.BYTE4. */ + public static T get(FST fst, IntsRef input) throws IOException { + assert fst.inputType == FST.INPUT_TYPE.BYTE4; + + // TODO: would be nice not to alloc this on every lookup + final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + + // Accumulate output as we go + final T NO_OUTPUT = fst.outputs.getNoOutput(); + T output = NO_OUTPUT; + for(int i=0;i T get(FST fst, char[] input, int offset, int length) throws IOException { + assert fst.inputType == FST.INPUT_TYPE.BYTE4; + + // TODO: would be nice not to alloc this on every lookup + final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + + int charIdx = offset; + final int charLimit = offset + length; + + // Accumulate output as we go + final T NO_OUTPUT = fst.outputs.getNoOutput(); + T output = NO_OUTPUT; + while(charIdx < charLimit) { + final int utf32 = Character.codePointAt(input, charIdx); + charIdx += Character.charCount(utf32); + + if (fst.findTargetArc(utf32, arc, arc) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + output = fst.outputs.add(output, arc.output); + } + } + + if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + return fst.outputs.add(output, 
arc.output); + } else { + return output; + } + } + + + /** Logically casts input to UTF32 ints then looks up the output + * or null if the input is not accepted. FST must be + * INPUT_TYPE.BYTE4. */ + public static T get(FST fst, CharSequence input) throws IOException { + assert fst.inputType == FST.INPUT_TYPE.BYTE4; + + // TODO: would be nice not to alloc this on every lookup + final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + + int charIdx = 0; + final int charLimit = input.length(); + + // Accumulate output as we go + final T NO_OUTPUT = fst.outputs.getNoOutput(); + T output = NO_OUTPUT; + + while(charIdx < charLimit) { + final int utf32 = Character.codePointAt(input, charIdx); + charIdx += Character.charCount(utf32); + + if (fst.findTargetArc(utf32, arc, arc) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + output = fst.outputs.add(output, arc.output); + } + } + + if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + return fst.outputs.add(output, arc.output); + } else { + return output; + } + } + + /** Looks up the output for this input, or null if the + * input is not accepted */ + public static T get(FST fst, BytesRef input) throws IOException { + assert fst.inputType == FST.INPUT_TYPE.BYTE1; + + // TODO: would be nice not to alloc this on every lookup + final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + + // Accumulate output as we go + final T NO_OUTPUT = fst.outputs.getNoOutput(); + T output = NO_OUTPUT; + for(int i=0;idot language description + * for visualization. Example of use: + * + *

+   * PrintWriter pw = new PrintWriter("out.dot");
+   * Util.toDot(fst, pw, true, true);
+   * pw.close();
+   * 
+ * + * and then, from command line: + * + *
+   * dot -Tpng -o out.png out.dot
+   * 
+ * + *

+ * Note: larger FSTs (a few thousand nodes) won't even render, don't bother. + * + * @param sameRank + * If true, the resulting dot file will try + * to order states in layers of breadth-first traversal. This may + * mess up arcs, but makes the output FST's structure a bit clearer. + * + * @param labelStates + * If true states will have labels equal to their offsets in their + * binary format. Expands the graph considerably. + * + * @see "http://www.graphviz.org/" + */ + public static void toDot(FST fst, Writer out, boolean sameRank, boolean labelStates) + throws IOException { + final String expandedNodeColor = "blue"; + + // This is the start arc in the automaton (from the epsilon state to the first state + // with outgoing transitions. + final FST.Arc startArc = fst.getFirstArc(new FST.Arc()); + + // A queue of transitions to consider for the next level. + final List> thisLevelQueue = new ArrayList>(); + + // A queue of transitions to consider when processing the next level. + final List> nextLevelQueue = new ArrayList>(); + nextLevelQueue.add(startArc); + + // A list of states on the same level (for ranking). + final List sameLevelStates = new ArrayList(); + + // A bitset of already seen states (target offset). + final BitSet seen = new BitSet(); + seen.set(startArc.target); + + // Shape for states. + final String stateShape = "circle"; + + // Emit DOT prologue. + out.write("digraph FST {\n"); + out.write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n"); + + if (!labelStates) { + out.write(" node [shape=circle, width=.2, height=.2, style=filled]\n"); + } + + emitDotState(out, "initial", "point", "white", ""); + emitDotState(out, Integer.toString(startArc.target), stateShape, + fst.isExpandedTarget(startArc) ? 
expandedNodeColor : null, + ""); + out.write(" initial -> " + startArc.target + "\n"); + + final T NO_OUTPUT = fst.outputs.getNoOutput(); + int level = 0; + + while (!nextLevelQueue.isEmpty()) { + // we could double buffer here, but it doesn't matter probably. + thisLevelQueue.addAll(nextLevelQueue); + nextLevelQueue.clear(); + + level++; + out.write("\n // Transitions and states at level: " + level + "\n"); + while (!thisLevelQueue.isEmpty()) { + final FST.Arc arc = thisLevelQueue.remove(thisLevelQueue.size() - 1); + + if (fst.targetHasArcs(arc)) { + // scan all arcs + final int node = arc.target; + fst.readFirstTargetArc(arc, arc); + + while (true) { + // Emit the unseen state and add it to the queue for the next level. + if (arc.target >= 0 && !seen.get(arc.target)) { + final boolean isExpanded = fst.isExpandedTarget(arc); + emitDotState(out, Integer.toString(arc.target), stateShape, + isExpanded ? expandedNodeColor : null, + labelStates ? Integer.toString(arc.target) : ""); + seen.set(arc.target); + nextLevelQueue.add(new FST.Arc().copyFrom(arc)); + sameLevelStates.add(arc.target); + } + + String outs; + if (arc.output != NO_OUTPUT) { + outs = "/" + fst.outputs.outputToString(arc.output); + } else { + outs = ""; + } + + final String cl; + if (arc.label == FST.END_LABEL) { + cl = "~"; + } else { + cl = printableLabel(arc.label); + } + + out.write(" " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]\n"); + + // Break the loop if we're on the last arc of this state. + if (arc.isLast()) { + break; + } + fst.readNextArc(arc); + } + } + } + + // Emit state ranking information. + if (sameRank && sameLevelStates.size() > 1) { + out.write(" {rank=same; "); + for (int state : sameLevelStates) { + out.write(state + "; "); + } + out.write(" }\n"); + } + sameLevelStates.clear(); + } + + // Emit terminating state (always there anyway). 
+ out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n"); + out.write(" {rank=sink; -1 }\n"); + + out.write("}\n"); + out.flush(); + } + + /** + * Emit a single state in the dot language. + */ + private static void emitDotState(Writer out, String name, String shape, + String color, String label) throws IOException { + out.write(" " + name + + " [" + + (shape != null ? "shape=" + shape : "") + " " + + (color != null ? "color=" + color : "") + " " + + (label != null ? "label=\"" + label + "\"" : "label=\"\"") + " " + + "]\n"); + } + + /** + * Ensures an arc's label is indeed printable (dot uses US-ASCII). + */ + private static String printableLabel(int label) { + if (label >= 0x20 && label <= 0x7d) { + return Character.toString((char) label); + } else { + return "0x" + Integer.toHexString(label); + } + } +} Property changes on: lucene/src/java/org/apache/lucene/util/fst/Util.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/fst/package.html =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/package.html (revision 0) +++ lucene/src/java/org/apache/lucene/util/fst/package.html (revision 0) @@ -0,0 +1,25 @@ + + + + + + + +Finite state transducers + + Property changes on: lucene/src/java/org/apache/lucene/util/fst/package.html ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/fst/NoOutputs.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/NoOutputs.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/fst/NoOutputs.java (revision 0) @@ -0,0 +1,96 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + +/** + * Use this if you just want to build an FSA. + * + * @lucene.experimental + */ + +public final class NoOutputs extends Outputs { + + final Object NO_OUTPUT = new Object() { + // NodeHash calls hashCode for this output; we fix this + // so we get deterministic hashing. 
+ @Override + public int hashCode() { + return 42; + } + + @Override + public boolean equals(Object other) { + return other == this; + } + }; + + private static final NoOutputs singleton = new NoOutputs(); + + private NoOutputs() { + } + + public static NoOutputs getSingleton() { + return singleton; + } + + @Override + public Object common(Object output1, Object output2) { + assert output1 == NO_OUTPUT; + assert output2 == NO_OUTPUT; + return NO_OUTPUT; + } + + @Override + public Object subtract(Object output, Object inc) { + assert output == NO_OUTPUT; + assert inc == NO_OUTPUT; + return NO_OUTPUT; + } + + @Override + public Object add(Object prefix, Object output) { + assert prefix == NO_OUTPUT: "got " + prefix; + assert output == NO_OUTPUT; + return NO_OUTPUT; + } + + @Override + public void write(Object prefix, DataOutput out) { + //assert false; + } + + @Override + public Object read(DataInput in) { + //assert false; + //return null; + return NO_OUTPUT; + } + + @Override + public Object getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(Object output) { + return ""; + } +} Property changes on: lucene/src/java/org/apache/lucene/util/fst/NoOutputs.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java (revision 0) @@ -0,0 +1,136 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + +/** + * Output is a long, for each input term. NOTE: the + * resulting FST is not guaranteed to be minimal! See + * {@link Builder}. You cannot store 0 output with this + * (that's reserved to mean "no output")! + * + * @lucene.experimental + */ + +public final class PositiveIntOutputs extends Outputs { + + private final static Long NO_OUTPUT = new Long(0); + + private final boolean doShare; + + private final static PositiveIntOutputs singletonShare = new PositiveIntOutputs(true); + private final static PositiveIntOutputs singletonNoShare = new PositiveIntOutputs(false); + + private PositiveIntOutputs(boolean doShare) { + this.doShare = doShare; + } + + public static PositiveIntOutputs getSingleton(boolean doShare) { + return doShare ? 
singletonShare : singletonNoShare; + } + + public Long get(long v) { + if (v == 0) { + return NO_OUTPUT; + } else { + return Long.valueOf(v); + } + } + + @Override + public Long common(Long output1, Long output2) { + assert valid(output1); + assert valid(output2); + if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) { + return NO_OUTPUT; + } else if (doShare) { + assert output1 > 0; + assert output2 > 0; + return Math.min(output1, output2); + } else if (output1.equals(output2)) { + return output1; + } else { + return NO_OUTPUT; + } + } + + @Override + public Long subtract(Long output, Long inc) { + assert valid(output); + assert valid(inc); + assert output >= inc; + + if (inc == NO_OUTPUT) { + return output; + } else if (output.equals(inc)) { + return NO_OUTPUT; + } else { + return output - inc; + } + } + + @Override + public Long add(Long prefix, Long output) { + assert valid(prefix); + assert valid(output); + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + return prefix + output; + } + } + + @Override + public void write(Long output, DataOutput out) throws IOException { + assert valid(output); + out.writeVLong(output); + } + + @Override + public Long read(DataInput in) throws IOException { + long v = in.readVLong(); + if (v == 0) { + return NO_OUTPUT; + } else { + return v; + } + } + + private boolean valid(Long o) { + assert o != null; + assert o instanceof Long; + assert o == NO_OUTPUT || o > 0; + return true; + } + + @Override + public Long getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(Long output) { + return output.toString(); + } +} Property changes on: lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java =================================================================== --- 
lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java (revision 0) @@ -0,0 +1,479 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; + +/** Can next() and advance() through the terms in an FST + * + * @lucene.experimental +*/ + +abstract class FSTEnum { + protected final FST fst; + + @SuppressWarnings("unchecked") protected FST.Arc[] arcs = new FST.Arc[10]; + // outputs are cumulative + @SuppressWarnings("unchecked") protected T[] output = (T[]) new Object[10]; + + protected final T NO_OUTPUT; + protected final FST.Arc scratchArc = new FST.Arc(); + + protected int upto; + protected int targetLength; + + /** Sets up the enum at the root of the given FST. NOTE: + * there is no doFloor flag; callers choose floor vs ceil by + * invoking doSeekFloor or doSeekCeil. 
*/ + protected FSTEnum(FST fst) { + this.fst = fst; + NO_OUTPUT = fst.outputs.getNoOutput(); + fst.getFirstArc(getArc(0)); + output[0] = NO_OUTPUT; + } + + protected abstract int getTargetLabel(); + protected abstract int getCurrentLabel(); + + protected abstract void setCurrentLabel(int label); + protected abstract void grow(); + + /** Rewinds enum state to match the shared prefix between + * current term and target term */ + protected final void rewindPrefix() throws IOException { + if (upto == 0) { + //System.out.println(" init"); + upto = 1; + fst.readFirstTargetArc(getArc(0), getArc(1)); + return; + } + //System.out.println(" rewind upto=" + upto + " vs targetLength=" + targetLength); + + final int currentLimit = upto; + upto = 1; + while (upto < currentLimit && upto <= targetLength+1) { + final int cmp = getCurrentLabel() - getTargetLabel(); + if (cmp < 0) { + // seek forward + break; + } else if (cmp > 0) { + // seek backwards -- reset this arc to the first arc + final FST.Arc arc = getArc(upto); + fst.readFirstTargetArc(getArc(upto-1), arc); + //System.out.println(" seek first arc"); + break; + } + upto++; + } + } + + protected void doNext() throws IOException { + //System.out.println("FE: next upto=" + upto); + if (upto == 0) { + //System.out.println(" init"); + upto = 1; + fst.readFirstTargetArc(getArc(0), getArc(1)); + } else { + // pop + //System.out.println(" check pop curArc target=" + arcs[upto].target + " label=" + arcs[upto].label + " isLast?=" + arcs[upto].isLast()); + while (arcs[upto].isLast()) { + upto--; + if (upto == 0) { + //System.out.println(" eof"); + return; + } + } + fst.readNextArc(arcs[upto]); + } + + pushFirst(); + } + + // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND / + // SEEK_END)? saves the eq check above? + + /** Seeks to smallest term that's >= target. 
*/ + protected void doSeekCeil() throws IOException { + + //System.out.println(" advance len=" + target.length + " curlen=" + current.length); + + // TODO: possibly caller could/should provide common + // prefix length? ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + + //System.out.println("FE.seekCeil upto=" + upto); + + // Save time by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + //System.out.println(" after rewind upto=" + upto); + + FST.Arc arc = getArc(upto); + int targetLabel = getTargetLabel(); + //System.out.println(" init targetLabel=" + targetLabel); + + // Now scan forward, matching the new suffix of the target + while(true) { + + //System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") vs targetLabel=" + targetLabel); + + if (arc.bytesPerArc != 0 && arc.label != -1) { + + // Arcs are fixed array -- use binary search to find + // the target. 
+ + final FST.BytesReader in = fst.getBytesReader(0); + int low = arc.arcIdx; + int high = arc.numArcs-1; + int mid = 0; + //System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel); + boolean found = false; + while (low <= high) { + mid = (low + high) >>> 1; + in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; + final int midLabel = fst.readLabel(in); + final int cmp = midLabel - targetLabel; + //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); + if (cmp < 0) + low = mid + 1; + else if (cmp > 0) + high = mid - 1; + else { + found = true; + break; + } + } + + // NOTE: this code is dup'd w/ the code below (in + // the outer else clause): + if (found) { + // Match + arc.arcIdx = mid-1; + fst.readNextRealArc(arc); + assert arc.arcIdx == mid; + assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid; + output[upto] = fst.outputs.add(output[upto-1], arc.output); + if (targetLabel == FST.END_LABEL) { + return; + } + setCurrentLabel(arc.label); + incr(); + arc = fst.readFirstTargetArc(arc, getArc(upto)); + targetLabel = getTargetLabel(); + continue; + } else if (low == arc.numArcs) { + // Dead end + arc.arcIdx = arc.numArcs-2; + fst.readNextRealArc(arc); + assert arc.isLast(); + // Dead end (target is after the last arc); + // rollback to last fork then push + upto--; + while(true) { + if (upto == 0) { + return; + } + final FST.Arc prevArc = getArc(upto); + //System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + " isLast?=" + prevArc.isLast()); + if (!prevArc.isLast()) { + fst.readNextArc(prevArc); + pushFirst(); + return; + } + upto--; + } + } else { + arc.arcIdx = (low > high ? 
low : high)-1; + fst.readNextRealArc(arc); + assert arc.label > targetLabel; + pushFirst(); + return; + } + } else { + // Arcs are not array'd -- must do linear scan: + if (arc.label == targetLabel) { + // recurse + output[upto] = fst.outputs.add(output[upto-1], arc.output); + if (targetLabel == FST.END_LABEL) { + return; + } + setCurrentLabel(arc.label); + incr(); + arc = fst.readFirstTargetArc(arc, getArc(upto)); + targetLabel = getTargetLabel(); + } else if (arc.label > targetLabel) { + pushFirst(); + return; + } else if (arc.isLast()) { + // Dead end (target is after the last arc); + // rollback to last fork then push + upto--; + while(true) { + if (upto == 0) { + return; + } + final FST.Arc prevArc = getArc(upto); + //System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + " isLast?=" + prevArc.isLast()); + if (!prevArc.isLast()) { + fst.readNextArc(prevArc); + pushFirst(); + return; + } + upto--; + } + } else { + // keep scanning + //System.out.println(" next scan"); + fst.readNextArc(arc); + } + } + } + } + + // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND / + // SEEK_END)? saves the eq check above? + /** Seeks to largest term that's <= target. */ + protected void doSeekFloor() throws IOException { + + // TODO: possibly caller could/should provide common + // prefix length? 
ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + //System.out.println("FE: seek floor upto=" + upto); + + // Save CPU by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + + //System.out.println("FE: after rewind upto=" + upto); + + FST.Arc arc = getArc(upto); + int targetLabel = getTargetLabel(); + + //System.out.println("FE: init targetLabel=" + targetLabel); + + // Now scan forward, matching the new suffix of the target + while(true) { + //System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast()); + + if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) { + // Arcs are fixed array -- use binary search to find + // the target. + + final FST.BytesReader in = fst.getBytesReader(0); + int low = arc.arcIdx; + int high = arc.numArcs-1; + int mid = 0; + //System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel); + boolean found = false; + while (low <= high) { + mid = (low + high) >>> 1; + in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; + final int midLabel = fst.readLabel(in); + final int cmp = midLabel - targetLabel; + //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); + if (cmp < 0) + low = mid + 1; + else if (cmp > 0) + high = mid - 1; + else { + found = true; + break; + } + } + + // NOTE: this code is dup'd w/ the code below (in + // the outer else clause): + if (found) { + // Match -- recurse + //System.out.println(" match! 
arcIdx=" + mid); + arc.arcIdx = mid-1; + fst.readNextRealArc(arc); + assert arc.arcIdx == mid; + assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid; + output[upto] = fst.outputs.add(output[upto-1], arc.output); + if (targetLabel == FST.END_LABEL) { + return; + } + setCurrentLabel(arc.label); + incr(); + arc = fst.readFirstTargetArc(arc, getArc(upto)); + targetLabel = getTargetLabel(); + continue; + } else if (high == -1) { + //System.out.println(" before first"); + // Very first arc is after our target + // TODO: if each arc could somehow read the arc just + // before, we can save this re-scan. The ceil case + // doesn't need this because it reads the next arc + // instead: + while(true) { + // First, walk backwards until we find a first arc + // that's before our target label: + fst.readFirstTargetArc(getArc(upto-1), arc); + if (arc.label < targetLabel) { + // Then, scan forwards to the arc just before + // the targetLabel: + while(!arc.isLast() && fst.readNextArcLabel(arc) < targetLabel) { + fst.readNextArc(arc); + } + pushLast(); + return; + } + upto--; + if (upto == 0) { + return; + } + targetLabel = getTargetLabel(); + arc = getArc(upto); + } + } else { + // There is a floor arc: + arc.arcIdx = (low > high ? high : low)-1; + //System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1)); + fst.readNextRealArc(arc); + assert arc.isLast() || fst.readNextArcLabel(arc) > targetLabel; + assert arc.label < targetLabel; + pushLast(); + return; + } + } else { + + if (arc.label == targetLabel) { + // Match -- recurse + output[upto] = fst.outputs.add(output[upto-1], arc.output); + if (targetLabel == FST.END_LABEL) { + return; + } + setCurrentLabel(arc.label); + incr(); + arc = fst.readFirstTargetArc(arc, getArc(upto)); + targetLabel = getTargetLabel(); + } else if (arc.label > targetLabel) { + // TODO: if each arc could somehow read the arc just + // before, we can save this re-scan. 
The ceil case + // doesn't need this because it reads the next arc + // instead: + while(true) { + // First, walk backwards until we find a first arc + // that's before our target label: + fst.readFirstTargetArc(getArc(upto-1), arc); + if (arc.label < targetLabel) { + // Then, scan forwards to the arc just before + // the targetLabel: + while(!arc.isLast() && fst.readNextArcLabel(arc) < targetLabel) { + fst.readNextArc(arc); + } + pushLast(); + return; + } + upto--; + if (upto == 0) { + return; + } + targetLabel = getTargetLabel(); + arc = getArc(upto); + } + } else if (!arc.isLast()) { + //System.out.println(" check next label=" + fst.readNextArcLabel(arc) + " (" + (char) fst.readNextArcLabel(arc) + ")"); + if (fst.readNextArcLabel(arc) > targetLabel) { + pushLast(); + return; + } else { + // keep scanning + fst.readNextArc(arc); + } + } else { + pushLast(); + return; + } + } + } + } + + private void incr() { + upto++; + grow(); + if (arcs.length <= upto) { + @SuppressWarnings("unchecked") final FST.Arc[] newArcs = + new FST.Arc[ArrayUtil.oversize(1+upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, newArcs, 0, arcs.length); + arcs = newArcs; + } + if (output.length <= upto) { + @SuppressWarnings("unchecked") final T[] newOutput = + (T[]) new Object[ArrayUtil.oversize(1+upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(output, 0, newOutput, 0, output.length); + output = newOutput; + } + } + + // Appends current arc, and then recurses from its target, + // appending first arc all the way to the final node + private void pushFirst() throws IOException { + + FST.Arc arc = arcs[upto]; + assert arc != null; + + while (true) { + output[upto] = fst.outputs.add(output[upto-1], arc.output); + if (arc.label == FST.END_LABEL) { + // Final node + break; + } + //System.out.println(" pushFirst label=" + (char) arc.label + " upto=" + upto + " output=" + fst.outputs.outputToString(output[upto])); + setCurrentLabel(arc.label); + incr(); + 
+ final FST.Arc nextArc = getArc(upto); + fst.readFirstTargetArc(arc, nextArc); + arc = nextArc; + } + } + + // Recurses from current arc, appending last arc all the + // way to the first final node + private void pushLast() throws IOException { + + FST.Arc arc = arcs[upto]; + assert arc != null; + + while (true) { + setCurrentLabel(arc.label); + output[upto] = fst.outputs.add(output[upto-1], arc.output); + if (arc.label == FST.END_LABEL) { + // Final node + break; + } + incr(); + + arc = fst.readLastTargetArc(arc, getArc(upto)); + } + } + + private FST.Arc getArc(int idx) { + if (arcs[idx] == null) { + arcs[idx] = new FST.Arc(); + } + return arcs[idx]; + } +} Property changes on: lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/fst/Outputs.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/Outputs.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/fst/Outputs.java (revision 0) @@ -0,0 +1,62 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + +/** + * Represents the outputs for an FST, providing the basic + * algebra needed for the FST. + * + * @lucene.experimental + */ + +public abstract class Outputs { + + // TODO: maybe change this API to allow for re-use of the + // output instances -- this is an insane amount of garbage + // (new object per byte/char/int) if eg used during + // analysis + + /** Eg common("foo", "foobar") -> "foo" */ + public abstract T common(T output1, T output2); + + /** Eg subtract("foobar", "foo") -> "bar" */ + public abstract T subtract(T output, T inc); + + /** Eg add("foo", "bar") -> "foobar" */ + public abstract T add(T prefix, T output); + + public abstract void write(T output, DataOutput out) throws IOException; + + public abstract T read(DataInput in) throws IOException; + + /** NOTE: this output is compared with == so you must + * ensure that all methods return the single object if + * it's really no output */ + public abstract T getNoOutput(); + + public abstract String outputToString(T output); + + public T merge(T first, T second) { + throw new UnsupportedOperationException(); + } +} Property changes on: lucene/src/java/org/apache/lucene/util/fst/Outputs.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java (revision 0) @@ -0,0 +1,224 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + +/** + * Holds one or two longs for each input term. If it's a + * single output, Long is returned; else, TwoLongs. Order + * is preserved in the TwoLongs case, ie .first is the first + * input/output added to Builder, and .second is the + * second. You cannot store 0 output with this (that's + * reserved to mean "no output")! + * + * NOTE: the resulting FST is not guaranteed to be minimal! + * See {@link Builder}. 
+ * + * @lucene.experimental + */ + +public final class UpToTwoPositiveIntOutputs extends Outputs { + + public final static class TwoLongs { + final long first; + final long second; + + public TwoLongs(long first, long second) { + this.first = first; + this.second = second; + assert first >= 0; + assert second >= 0; + } + + @Override + public String toString() { + return "TwoLongs:" + first + "," + second; + } + + @Override + public boolean equals(Object _other) { + if (_other instanceof TwoLongs) { + final TwoLongs other = (TwoLongs) _other; + return first == other.first && second == other.second; + } else { + return false; + } + } + + @Override + public int hashCode() { + return (int) ((first^(first>>>32)) ^ (second^(second>>32))); + } + } + + private final static Long NO_OUTPUT = new Long(0); + + private final boolean doShare; + + private final static UpToTwoPositiveIntOutputs singletonShare = new UpToTwoPositiveIntOutputs(true); + private final static UpToTwoPositiveIntOutputs singletonNoShare = new UpToTwoPositiveIntOutputs(false); + + private UpToTwoPositiveIntOutputs(boolean doShare) { + this.doShare = doShare; + } + + public static UpToTwoPositiveIntOutputs getSingleton(boolean doShare) { + return doShare ? 
singletonShare : singletonNoShare; + } + + public Long get(long v) { + if (v == 0) { + return NO_OUTPUT; + } else { + return Long.valueOf(v); + } + } + + public TwoLongs get(long first, long second) { + return new TwoLongs(first, second); + } + + @Override + public Long common(Object _output1, Object _output2) { + assert valid(_output1, false); + assert valid(_output2, false); + final Long output1 = (Long) _output1; + final Long output2 = (Long) _output2; + if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) { + return NO_OUTPUT; + } else if (doShare) { + assert output1 > 0; + assert output2 > 0; + return Math.min(output1, output2); + } else if (output1.equals(output2)) { + return output1; + } else { + return NO_OUTPUT; + } + } + + @Override + public Long subtract(Object _output, Object _inc) { + assert valid(_output, false); + assert valid(_inc, false); + final Long output = (Long) _output; + final Long inc = (Long) _inc; + assert output >= inc; + + if (inc == NO_OUTPUT) { + return output; + } else if (output.equals(inc)) { + return NO_OUTPUT; + } else { + return output - inc; + } + } + + @Override + public Object add(Object _prefix, Object _output) { + assert valid(_prefix, false); + assert valid(_output, true); + final Long prefix = (Long) _prefix; + if (_output instanceof Long) { + final Long output = (Long) _output; + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + return prefix + output; + } + } else { + final TwoLongs output = (TwoLongs) _output; + final long v = prefix; + return new TwoLongs(output.first + v, output.second + v); + } + } + + @Override + public void write(Object _output, DataOutput out) throws IOException { + assert valid(_output, true); + if (_output instanceof Long) { + final Long output = (Long) _output; + out.writeVLong(output<<1); + } else { + final TwoLongs output = (TwoLongs) _output; + out.writeVLong((output.first<<1) | 1); + out.writeVLong(output.second); + } + } + + 
@Override + public Object read(DataInput in) throws IOException { + final long code = in.readVLong(); + if ((code & 1) == 0) { + // single long + final long v = code >>> 1; + if (v == 0) { + return NO_OUTPUT; + } else { + return Long.valueOf(v); + } + } else { + // two longs + final long first = code >>> 1; + final long second = in.readVLong(); + return new TwoLongs(first, second); + } + } + + private boolean valid(Long o) { + assert o != null; + assert o instanceof Long; + assert o == NO_OUTPUT || o > 0; + return true; + } + + // Used only by assert + private boolean valid(Object _o, boolean allowDouble) { + if (!allowDouble) { + assert _o instanceof Long; + return valid((Long) _o); + } else if (_o instanceof TwoLongs) { + return true; + } else { + return valid((Long) _o); + } + } + + @Override + public Object getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(Object output) { + return output.toString(); + } + + @Override + public Object merge(Object first, Object second) { + assert valid(first, false); + assert valid(second, false); + return new TwoLongs((Long) first, (Long) second); + } +} Property changes on: lucene/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/fst/TODO =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/TODO (revision 0) +++ lucene/src/java/org/apache/lucene/util/fst/TODO (revision 0) @@ -0,0 +1,39 @@ +is threadlocal.get costly? if so maybe make an FSTReader? would hold this "relative" pos, and each thread'd use it for reading, instead of PosRef + +maybe changed Outputs class to "reuse" stuff? eg this new BytesRef in ByteSequenceOutputs.. + +do i even "need" both non_final_end_state and final_end_state? + +hmm -- can I get weights working here? 
+ +can FST be used to index all internal substrings, mapping to term? + - maybe put back ability to add multiple outputs per input...? + +make this work w/ char...? + - then FSTCharFilter/FSTTokenFilter + - syn filter? + +experiment: try reversing terms before compressing -- how much smaller? + +maybe separate out a 'writable/growing fst' from a read-only one? + +can we somehow [partially] tableize lookups like oal.util.automaton? + +make an FST terms index option for codecs...? + +make an FSTCharsMap? + +need a benchmark testing FST traversal -- just fix the static main to rewind & visit all terms + +thread state + +when writing FST to disk: +- Sequentially writing (would save memory in codec during indexing). We are now using DataOutput, which could also go directly to disk +- problem: size of BytesRef must be known before + +later + - maybe don't require FSTEnum.advance to be forward only? + - should i make a posIntOutputs separate from posLongOutputs? + - mv randomAcceptedWord / run / etc. from test into FST? + - hmm get multi-outputs working again? do we ever need this? + Index: lucene/src/java/org/apache/lucene/util/fst/BytesRefFSTEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/BytesRefFSTEnum.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/fst/BytesRefFSTEnum.java (revision 0) @@ -0,0 +1,108 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.util.BytesRef; + +/** Can next() and advance() through the terms in an FST + * + * @lucene.experimental +*/ + +public final class BytesRefFSTEnum extends FSTEnum { + private final BytesRef current = new BytesRef(10); + private final InputOutput result = new InputOutput(); + private BytesRef target; + + public static class InputOutput { + public BytesRef input; + public T output; + } + + /** Creates an enum over the given FST. There is no + * doFloor flag here: call seekCeil or seekFloor to + * choose how a target is matched. */ + public BytesRefFSTEnum(FST fst) { + super(fst); + result.input = current; + current.offset = 1; + } + + public InputOutput current() { + return result; + } + + public InputOutput next() throws IOException { + //System.out.println(" enum.next"); + doNext(); + return setResult(); + } + + /** Seeks to smallest term that's >= target. */ + public InputOutput seekCeil(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + super.doSeekCeil(); + return setResult(); + } + + /** Seeks to biggest term that's <= target. 
*/ + public InputOutput seekFloor(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + super.doSeekFloor(); + return setResult(); + } + + @Override + protected int getTargetLabel() { + if (upto-1 == target.length) { + return FST.END_LABEL; + } else { + return target.bytes[target.offset + upto - 1] & 0xFF; + } + } + + @Override + protected int getCurrentLabel() { + // current.offset fixed at 1 + return current.bytes[upto] & 0xFF; + } + + @Override + protected void setCurrentLabel(int label) { + current.bytes[upto] = (byte) label; + } + + @Override + protected void grow() { + current.grow(upto+1); + } + + private InputOutput setResult() { + if (upto == 0) { + return null; + } else { + current.length = upto-1; + result.output = output[upto]; + return result; + } + } +} Property changes on: lucene/src/java/org/apache/lucene/util/fst/BytesRefFSTEnum.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/fst/Builder.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/Builder.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/fst/Builder.java (revision 0) @@ -0,0 +1,545 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + +import java.io.IOException; + +/** + * Builds a compact FST (maps an IntsRef term to an arbitrary + * output) from pre-sorted terms with outputs (the FST + * becomes an FSA if you use NoOutputs). The FST is written + * on-the-fly into a compact serialized format byte array, which can + * be saved to / loaded from a Directory or used directly + * for traversal. The FST is always finite (no cycles). + * + *

NOTE: The algorithm is described at + * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698

+ * + * If your outputs are ByteSequenceOutput then the final FST + * will be minimal, but if you use PositiveIntOutput then + * it's only "near minimal". For example, aa/0, aab/1, bbb/2 + * will produce 6 states when a 5 state fst is also + * possible. + * + * The parameterized type T is the output type. See the + * subclasses of {@link Outputs}. + * + * @lucene.experimental + */ + +public class Builder { + private final NodeHash dedupHash; + private final FST fst; + private final T NO_OUTPUT; + + // simplistic pruning: we prune node (and all following + // nodes) if less than this number of terms go through it: + private final int minSuffixCount1; + + // better pruning: we prune node (and all following + // nodes) if the prior node has less than this number of + // terms go through it: + private final int minSuffixCount2; + + private final IntsRef lastInput = new IntsRef(); + + // NOTE: cutting this over to ArrayList instead loses ~6% + // in build performance on 9.8M Wikipedia terms; so we + // left this as an array: + // current "frontier" + private UnCompiledNode[] frontier; + + public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doMinSuffix, Outputs outputs) { + this.minSuffixCount1 = minSuffixCount1; + this.minSuffixCount2 = minSuffixCount2; + fst = new FST(inputType, outputs); + if (doMinSuffix) { + dedupHash = new NodeHash(fst); + } else { + dedupHash = null; + } + NO_OUTPUT = outputs.getNoOutput(); + + @SuppressWarnings("unchecked") final UnCompiledNode[] f = (UnCompiledNode[]) new UnCompiledNode[10]; + frontier = f; + for(int idx=0;idx(this, idx); + } + } + + public int getTotStateCount() { + return fst.nodeCount; + } + + public long getTermCount() { + return frontier[0].inputCount; + } + + public int getMappedStateCount() { + return dedupHash == null ? 
0 : fst.nodeCount; + } + + private CompiledNode compileNode(UnCompiledNode n) throws IOException { + + final int address; + if (dedupHash != null) { + if (n.numArcs == 0) { + address = fst.addNode(n); + } else { + address = dedupHash.add(n); + } + } else { + address = fst.addNode(n); + } + assert address != -2; + + n.clear(); + + final CompiledNode fn = new CompiledNode(); + fn.address = address; + return fn; + } + + private void compilePrevTail(int prefixLenPlus1) throws IOException { + assert prefixLenPlus1 >= 1; + //System.out.println(" compileTail " + prefixLenPlus1); + for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) { + boolean doPrune = false; + boolean doCompile = false; + + final UnCompiledNode node = frontier[idx]; + final UnCompiledNode parent = frontier[idx-1]; + + if (node.inputCount < minSuffixCount1) { + doPrune = true; + doCompile = true; + } else if (idx > prefixLenPlus1) { + // prune if parent's inputCount is less than suffixMinCount2 + if (parent.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && parent.inputCount == 1) { + // my parent, about to be compiled, doesn't make the cut, so + // I'm definitely pruned + + // if pruneCount2 is 1, we keep only up + // until the 'distinguished edge', ie we keep only the + // 'divergent' part of the FST. if my parent, about to be + // compiled, has inputCount 1 then we are already past the + // distinguished edge. NOTE: this only works if + // the FST outputs are not "compressible" (simple + // ords ARE compressible). 
+ doPrune = true; + } else { + // my parent, about to be compiled, does make the cut, so + // I'm definitely not pruned + doPrune = false; + } + doCompile = true; + } else { + // if pruning is disabled (count is 0) we can always + // compile current node + doCompile = minSuffixCount2 == 0; + } + + //System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx=" + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune=" + doPrune); + + if (node.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && node.inputCount == 1) { + // drop all arcs + for(int arcIdx=0;arcIdx target = (UnCompiledNode) node.arcs[arcIdx].target; + target.clear(); + } + node.numArcs = 0; + } + + if (doPrune) { + // this node doesn't make it -- deref it + node.clear(); + parent.deleteLast(lastInput.ints[lastInput.offset+idx-1], node); + } else { + + if (minSuffixCount2 != 0) { + compileAllTargets(node); + } + final T nextFinalOutput = node.output; + + // We "fake" the node as being final if it has no + // outgoing arcs; in theory we could leave it + // as non-final (the FST can represent this), but + // FSTEnum, Util, etc., have trouble w/ non-final + // dead-end states: + final boolean isFinal = node.isFinal || node.numArcs == 0; + + if (doCompile) { + // this node makes it and we now compile it. first, + // compile any targets that were previously + // undecided: + parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], + compileNode(node), + nextFinalOutput, + isFinal); + } else { + // replaceLast just to install + // nextFinalOutput/isFinal onto the arc + parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], + node, + nextFinalOutput, + isFinal); + // this node will stay in play for now, since we are + // undecided on whether to prune it. 
later, it + // will be either compiled or pruned, so we must + // allocate a new node: + frontier[idx] = new UnCompiledNode(this, idx); + } + } + } + } + + private final IntsRef scratchIntsRef = new IntsRef(10); + + public void add(BytesRef input, T output) throws IOException { + assert fst.getInputType() == FST.INPUT_TYPE.BYTE1; + scratchIntsRef.grow(input.length); + for(int i=0;i= pos1Stop || lastInput.ints[pos1] != input.ints[pos2]) { + break; + } + pos1++; + pos2++; + } + final int prefixLenPlus1 = pos1+1; + + if (frontier.length < input.length+1) { + @SuppressWarnings("unchecked") final UnCompiledNode[] next = + new UnCompiledNode[ArrayUtil.oversize(input.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(frontier, 0, next, 0, frontier.length); + for(int idx=frontier.length;idx(this, idx); + } + frontier = next; + } + + // minimize/compile states from previous input's + // orphan'd suffix + compilePrevTail(prefixLenPlus1); + + // init tail states for current input + for(int idx=prefixLenPlus1;idx<=input.length;idx++) { + frontier[idx-1].addArc(input.ints[input.offset + idx - 1], + frontier[idx]); + //System.out.println(" incr tail " + idx); + frontier[idx].inputCount++; + } + + final UnCompiledNode lastNode = frontier[input.length]; + lastNode.isFinal = true; + lastNode.output = NO_OUTPUT; + + // push conflicting outputs forward, only as far as + // needed + for(int idx=1;idx node = frontier[idx]; + final UnCompiledNode parentNode = frontier[idx-1]; + + final T lastOutput = parentNode.getLastOutput(input.ints[input.offset + idx - 1]); + assert validOutput(lastOutput); + + final T commonOutputPrefix; + final T wordSuffix; + + if (lastOutput != NO_OUTPUT) { + commonOutputPrefix = fst.outputs.common(output, lastOutput); + assert validOutput(commonOutputPrefix); + wordSuffix = fst.outputs.subtract(lastOutput, commonOutputPrefix); + assert validOutput(wordSuffix); + parentNode.setLastOutput(input.ints[input.offset + idx - 1], 
commonOutputPrefix); + node.prependOutput(wordSuffix); + } else { + commonOutputPrefix = wordSuffix = NO_OUTPUT; + } + + output = fst.outputs.subtract(output, commonOutputPrefix); + assert validOutput(output); + } + + if (lastInput.length == input.length && prefixLenPlus1 == 1+input.length) { + // same input more than 1 time in a row, mapping to + // multiple outputs + lastNode.output = fst.outputs.merge(lastNode.output, output); + } else { + // this new arc is private to this new input; set its + // arc output to the leftover output: + frontier[prefixLenPlus1-1].setLastOutput(input.ints[input.offset + prefixLenPlus1-1], output); + } + + // save last input + lastInput.copy(input); + + //System.out.println(" count[0]=" + frontier[0].inputCount); + } + + private boolean validOutput(T output) { + return output == NO_OUTPUT || !output.equals(NO_OUTPUT); + } + + /** Returns final FST. NOTE: this will return null if + * nothing is accepted by the FST. */ + public FST finish() throws IOException { + + // minimize nodes in the last word's suffix + compilePrevTail(1); + //System.out.println("finish: inputCount=" + frontier[0].inputCount); + if (frontier[0].inputCount < minSuffixCount1 || frontier[0].inputCount < minSuffixCount2 || frontier[0].numArcs == 0) { + if (fst.emptyOutput == null) { + return null; + } else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) { + // empty string got pruned + return null; + } else { + fst.finish(compileNode(frontier[0]).address); + //System.out.println("compile addr = " + fst.getStartNode()); + return fst; + } + } else { + if (minSuffixCount2 != 0) { + compileAllTargets(frontier[0]); + } + //System.out.println("NOW: " + frontier[0].numArcs); + fst.finish(compileNode(frontier[0]).address); + } + + return fst; + } + + private void compileAllTargets(UnCompiledNode node) throws IOException { + for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; + if (!arc.target.isCompiled()) { + // not yet compiled + @SuppressWarnings("unchecked") final 
UnCompiledNode n = (UnCompiledNode) arc.target; + if (n.numArcs == 0) { + //System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label); + arc.isFinal = n.isFinal = true; + } + arc.target = compileNode(n); + } + } + } + + static class Arc { + public int label; // really an "unsigned" byte + public Node target; + public boolean isFinal; + public T output; + public T nextFinalOutput; + } + + // NOTE: not many instances of Node or CompiledNode are in + // memory while the FST is being built; it's only the + // current "frontier": + + static interface Node { + boolean isCompiled(); + } + + static final class CompiledNode implements Node { + int address; + public boolean isCompiled() { + return true; + } + } + + static final class UnCompiledNode implements Node { + final Builder owner; + int numArcs; + Arc[] arcs; + T output; + boolean isFinal; + long inputCount; + + /** This node's depth, starting from the automaton root. */ + final int depth; + + /** + * @param depth + * The node's depth starting from the automaton root. Needed for + * LUCENE-2934 (node expansion based on conditions other than the + * fanout size). + */ + @SuppressWarnings("unchecked") + public UnCompiledNode(Builder owner, int depth) { + this.owner = owner; + arcs = (Arc[]) new Arc[1]; + arcs[0] = new Arc(); + output = owner.NO_OUTPUT; + this.depth = depth; + } + + public boolean isCompiled() { + return false; + } + + public void clear() { + numArcs = 0; + isFinal = false; + output = owner.NO_OUTPUT; + inputCount = 0; + + // We don't clear the depth here because it never changes + // for nodes on the frontier (even when reused). 
+ } + + public T getLastOutput(int labelToMatch) { + assert numArcs > 0; + assert arcs[numArcs-1].label == labelToMatch; + return arcs[numArcs-1].output; + } + + public void addArc(int label, Node target) { + assert label >= 0; + assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs; + if (numArcs == arcs.length) { + @SuppressWarnings("unchecked") final Arc[] newArcs = + new Arc[ArrayUtil.oversize(numArcs+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, newArcs, 0, arcs.length); + for(int arcIdx=numArcs;arcIdx(); + } + arcs = newArcs; + } + final Arc arc = arcs[numArcs++]; + arc.label = label; + arc.target = target; + arc.output = arc.nextFinalOutput = owner.NO_OUTPUT; + arc.isFinal = false; + } + + public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) { + assert numArcs > 0; + final Arc arc = arcs[numArcs-1]; + assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch; + arc.target = target; + //assert target.address != -2; + arc.nextFinalOutput = nextFinalOutput; + arc.isFinal = isFinal; + } + + public void deleteLast(int label, Node target) { + assert numArcs > 0; + assert label == arcs[numArcs-1].label; + assert target == arcs[numArcs-1].target; + numArcs--; + } + + public void setLastOutput(int labelToMatch, T newOutput) { + assert owner.validOutput(newOutput); + assert numArcs > 0; + final Arc arc = arcs[numArcs-1]; + assert arc.label == labelToMatch; + arc.output = newOutput; + } + + // pushes an output prefix forward onto all arcs + public void prependOutput(T outputPrefix) { + assert owner.validOutput(outputPrefix); + + for(int arcIdx=0;arcIdx { + + private final static IntsRef NO_OUTPUT = new IntsRef(); + + private IntSequenceOutputs() { + } + + public static IntSequenceOutputs getSingleton() { + return new IntSequenceOutputs(); + } + + @Override + public IntsRef common(IntsRef 
output1, IntsRef output2) { + assert output1 != null; + assert output2 != null; + + int pos1 = output1.offset; + int pos2 = output2.offset; + int stopAt1 = pos1 + Math.min(output1.length, output2.length); + while(pos1 < stopAt1) { + if (output1.ints[pos1] != output2.ints[pos2]) { + break; + } + pos1++; + pos2++; + } + + if (pos1 == output1.offset) { + // no common prefix + return NO_OUTPUT; + } else if (pos1 == output1.offset + output1.length) { + // output1 is a prefix of output2 + return output1; + } else if (pos2 == output2.offset + output2.length) { + // output2 is a prefix of output1 + return output2; + } else { + return new IntsRef(output1.ints, output1.offset, pos1-output1.offset); + } + } + + @Override + public IntsRef subtract(IntsRef output, IntsRef inc) { + assert output != null; + assert inc != null; + if (inc == NO_OUTPUT) { + // no prefix removed + return output; + } else if (inc.length == output.length) { + // entire output removed + return NO_OUTPUT; + } else { + assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length; + assert inc.length > 0; + return new IntsRef(output.ints, output.offset + inc.length, output.length-inc.length); + } + } + + @Override + public IntsRef add(IntsRef prefix, IntsRef output) { + assert prefix != null; + assert output != null; + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + assert prefix.length > 0; + assert output.length > 0; + IntsRef result = new IntsRef(prefix.length + output.length); + System.arraycopy(prefix.ints, prefix.offset, result.ints, 0, prefix.length); + System.arraycopy(output.ints, output.offset, result.ints, prefix.length, output.length); + result.length = prefix.length + output.length; + return result; + } + } + + @Override + public void write(IntsRef prefix, DataOutput out) throws IOException { + assert prefix != null; + out.writeVInt(prefix.length); + for(int idx=0;idx { + + private int[] 
table; + private int count; + private int mask; + private final FST fst; + private final FST.Arc scratchArc = new FST.Arc(); + + public NodeHash(FST fst) { + table = new int[16]; + mask = 15; + this.fst = fst; + } + + private boolean nodesEqual(Builder.UnCompiledNode node, int address) throws IOException { + fst.readFirstRealArc(address, scratchArc); + if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) { + return false; + } + for(int arcUpto=0;arcUpto arc = node.arcs[arcUpto]; + if (arc.label != scratchArc.label || + !arc.output.equals(scratchArc.output) || + ((Builder.CompiledNode) arc.target).address != scratchArc.target || + !arc.nextFinalOutput.equals(scratchArc.nextFinalOutput) || + arc.isFinal != scratchArc.isFinal()) { + return false; + } + + if (scratchArc.isLast()) { + if (arcUpto == node.numArcs-1) { + return true; + } else { + return false; + } + } + fst.readNextRealArc(scratchArc); + } + + return false; + } + + // hash code for an unfrozen node. This must be identical + // to the un-frozen case (below)!! + private int hash(Builder.UnCompiledNode node) { + final int PRIME = 31; + //System.out.println("hash unfrozen"); + int h = 0; + // TODO: maybe if number of arcs is high we can safely subsample? 
+ for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; + //System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).address + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal); + h = PRIME * h + arc.label; + h = PRIME * h + ((Builder.CompiledNode) arc.target).address; + h = PRIME * h + arc.output.hashCode(); + h = PRIME * h + arc.nextFinalOutput.hashCode(); + if (arc.isFinal) { + h += 17; + } + } + //System.out.println(" ret " + (h&Integer.MAX_VALUE)); + return h & Integer.MAX_VALUE; + } + + // hash code for a frozen node + private int hash(int node) throws IOException { + final int PRIME = 31; + //System.out.println("hash frozen"); + int h = 0; + fst.readFirstRealArc(node, scratchArc); + while(true) { + //System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal()); + h = PRIME * h + scratchArc.label; + h = PRIME * h + scratchArc.target; + h = PRIME * h + scratchArc.output.hashCode(); + h = PRIME * h + scratchArc.nextFinalOutput.hashCode(); + if (scratchArc.isFinal()) { + h += 17; + } + if (scratchArc.isLast()) { + break; + } + fst.readNextRealArc(scratchArc); + } + //System.out.println(" ret " + (h&Integer.MAX_VALUE)); + return h & Integer.MAX_VALUE; + } + + public int add(Builder.UnCompiledNode node) throws IOException { + // System.out.println("hash: add count=" + count + " vs " + table.length); + final int h = hash(node); + int pos = h & mask; + int c = 0; + while(true) { + final int v = table[pos]; + if (v == 0) { + // freeze & add + final int address = fst.addNode(node); + //System.out.println(" now freeze addr=" + address); + assert hash(address) == h : "frozenHash=" + hash(address) + " vs h=" + h; + count++; + table[pos] = address; + if (table.length < 2*count) { + rehash(); + } + return address; + } else if 
(nodesEqual(node, v)) { + // same node is already here + return v; + } + + // quadratic probe + pos = (pos + (++c)) & mask; + } + } + + // called only by rehash + private void addNew(int address) throws IOException { + int pos = hash(address) & mask; + int c = 0; + while(true) { + if (table[pos] == 0) { + table[pos] = address; + break; + } + + // quadratic probe + pos = (pos + (++c)) & mask; + } + } + + private void rehash() throws IOException { + final int[] oldTable = table; + table = new int[2*table.length]; + mask = table.length-1; + for(int idx=0;idx The format is similar to what's used by Morfologik + * (http://sourceforge.net/projects/morfologik). + * + * @lucene.experimental + */ +public class FST { + public static enum INPUT_TYPE {BYTE1, BYTE2, BYTE4}; + public final INPUT_TYPE inputType; + + private final static int BIT_FINAL_ARC = 1 << 0; + private final static int BIT_LAST_ARC = 1 << 1; + private final static int BIT_TARGET_NEXT = 1 << 2; + private final static int BIT_STOP_NODE = 1 << 3; + private final static int BIT_ARC_HAS_OUTPUT = 1 << 4; + private final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; + + // Arcs are stored as fixed-size (per entry) array, so + // that we can find an arc using binary search. We do + // this when number of arcs is > NUM_ARCS_ARRAY: + private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6; + + /** + * @see #shouldExpand(UnCompiledNode) + */ + final static int FIXED_ARRAY_SHALLOW_DISTANCE = 3; // 0 => only root node. 
+ + /** + * @see #shouldExpand(UnCompiledNode) + */ + final static int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5; + + /** + * @see #shouldExpand(UnCompiledNode) + */ + final static int FIXED_ARRAY_NUM_ARCS_DEEP = 10; + + private int[] bytesPerArc = new int[0]; + + // Increment version to change it + private final static String FILE_FORMAT_NAME = "FST"; + private final static int VERSION_START = 0; + private final static int VERSION_CURRENT = VERSION_START; + + // Never serialized; just used to represent the virtual + // final node w/ no arcs: + private final static int FINAL_END_NODE = -1; + + // Never serialized; just used to represent the virtual + // non-final node w/ no arcs: + private final static int NON_FINAL_END_NODE = 0; + + // if non-null, this FST accepts the empty string and + // produces this output + T emptyOutput; + private byte[] emptyOutputBytes; + + private byte[] bytes; + int byteUpto = 0; + + private int startNode = -1; + + public final Outputs outputs; + + private int lastFrozenNode; + + private final T NO_OUTPUT; + + public int nodeCount; + public int arcCount; + public int arcWithOutputCount; + + // If arc has this label then that arc is final/accepted + public static final int END_LABEL = -1; + + public final static class Arc { + public int label; + public T output; + + int target; + + byte flags; + T nextFinalOutput; + int nextArc; + + // This is non-zero if current arcs are fixed array: + int posArcsStart; + int bytesPerArc; + int arcIdx; + int numArcs; + + /** Returns this */ + public Arc copyFrom(Arc other) { + label = other.label; + target = other.target; + flags = other.flags; + output = other.output; + nextFinalOutput = other.nextFinalOutput; + nextArc = other.nextArc; + if (other.bytesPerArc != 0) { + bytesPerArc = other.bytesPerArc; + posArcsStart = other.posArcsStart; + arcIdx = other.arcIdx; + numArcs = other.numArcs; + } else { + bytesPerArc = 0; + } + return this; + } + + boolean flag(int flag) { + return FST.flag(flags, flag); + } + + 
public boolean isLast() { + return flag(BIT_LAST_ARC); + } + + boolean isFinal() { + return flag(BIT_FINAL_ARC); + } + }; + + static boolean flag(int flags, int bit) { + return (flags & bit) != 0; + } + + private final BytesWriter writer; + + // make a new empty FST, for building + public FST(INPUT_TYPE inputType, Outputs outputs) { + this.inputType = inputType; + this.outputs = outputs; + bytes = new byte[128]; + NO_OUTPUT = outputs.getNoOutput(); + + writer = new BytesWriter(); + + emptyOutput = null; + } + + // create an existing FST + public FST(DataInput in, Outputs outputs) throws IOException { + this.outputs = outputs; + writer = null; + CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_START); + if (in.readByte() == 1) { + // accepts empty string + int numBytes = in.readVInt(); + // messy + bytes = new byte[numBytes]; + in.readBytes(bytes, 0, numBytes); + emptyOutput = outputs.read(getBytesReader(numBytes-1)); + } else { + emptyOutput = null; + } + final byte t = in.readByte(); + switch(t) { + case 0: + inputType = INPUT_TYPE.BYTE1; + break; + case 1: + inputType = INPUT_TYPE.BYTE2; + break; + case 2: + inputType = INPUT_TYPE.BYTE4; + break; + default: + throw new IllegalStateException("invalid input type " + t); + } + startNode = in.readVInt(); + nodeCount = in.readVInt(); + arcCount = in.readVInt(); + arcWithOutputCount = in.readVInt(); + + bytes = new byte[in.readVInt()]; + in.readBytes(bytes, 0, bytes.length); + NO_OUTPUT = outputs.getNoOutput(); + } + + public INPUT_TYPE getInputType() { + return inputType; + } + + /** Returns bytes used to represent the FST */ + public int sizeInBytes() { + return bytes.length; + } + + void finish(int startNode) { + if (startNode == FINAL_END_NODE && emptyOutput != null) { + startNode = 0; + } + if (this.startNode != -1) { + throw new IllegalStateException("already finished"); + } + byte[] finalBytes = new byte[writer.posWrite]; + System.arraycopy(bytes, 0, finalBytes, 0, writer.posWrite); + bytes = 
finalBytes; + this.startNode = startNode; + } + + void setEmptyOutput(T v) throws IOException { + if (emptyOutput != null) { + emptyOutput = outputs.merge(emptyOutput, v); + } else { + emptyOutput = v; + } + + // TODO: this is messy -- replace with sillyBytesWriter; maybe make + // bytes private + final int posSave = writer.posWrite; + outputs.write(emptyOutput, writer); + emptyOutputBytes = new byte[writer.posWrite-posSave]; + + // reverse + final int stopAt = (writer.posWrite - posSave)/2; + int upto = 0; + while(upto < stopAt) { + final byte b = bytes[posSave + upto]; + bytes[posSave+upto] = bytes[writer.posWrite-upto-1]; + bytes[writer.posWrite-upto-1] = b; + upto++; + } + System.arraycopy(bytes, posSave, emptyOutputBytes, 0, writer.posWrite-posSave); + writer.posWrite = posSave; + } + + public void save(DataOutput out) throws IOException { + if (startNode == -1) { + throw new IllegalStateException("call finish first"); + } + CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT); + // TODO: really we should encode this as an arc, arriving + // to the root node, instead of special casing here: + if (emptyOutput != null) { + out.writeByte((byte) 1); + out.writeVInt(emptyOutputBytes.length); + out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length); + } else { + out.writeByte((byte) 0); + } + final byte t; + if (inputType == INPUT_TYPE.BYTE1) { + t = 0; + } else if (inputType == INPUT_TYPE.BYTE2) { + t = 1; + } else { + t = 2; + } + out.writeByte(t); + out.writeVInt(startNode); + out.writeVInt(nodeCount); + out.writeVInt(arcCount); + out.writeVInt(arcWithOutputCount); + out.writeVInt(bytes.length); + out.writeBytes(bytes, 0, bytes.length); + } + + private void writeLabel(int v) throws IOException { + assert v >= 0: "v=" + v; + if (inputType == INPUT_TYPE.BYTE1) { + assert v <= 255: "v=" + v; + writer.writeByte((byte) v); + } else if (inputType == INPUT_TYPE.BYTE2) { + assert v <= 65535: "v=" + v; + writer.writeVInt(v); + } else { + //writeInt(v); + 
writer.writeVInt(v); + } + } + + int readLabel(DataInput in) throws IOException { + final int v; + if (inputType == INPUT_TYPE.BYTE1) { + v = in.readByte()&0xFF; + } else { + v = in.readVInt(); + } + return v; + } + + // returns true if the node at this address has any + // outgoing arcs + public boolean targetHasArcs(Arc arc) { + return arc.target > 0; + } + + // serializes new node by appending its bytes to the end + // of the current byte[] + int addNode(Builder.UnCompiledNode node) throws IOException { + //System.out.println("FST.addNode pos=" + posWrite + " numArcs=" + node.numArcs); + if (node.numArcs == 0) { + if (node.isFinal) { + return FINAL_END_NODE; + } else { + return NON_FINAL_END_NODE; + } + } + + int startAddress = writer.posWrite; + //System.out.println(" startAddr=" + startAddress); + + final boolean doFixedArray = shouldExpand(node); + final int fixedArrayStart; + if (doFixedArray) { + if (bytesPerArc.length < node.numArcs) { + bytesPerArc = new int[ArrayUtil.oversize(node.numArcs, 1)]; + } + // write a "false" first arc: + writer.writeByte((byte) BIT_ARCS_AS_FIXED_ARRAY); + writer.writeVInt(node.numArcs); + // placeholder -- we'll come back and write the number + // of bytes per arc here: + writer.writeByte((byte) 0); + fixedArrayStart = writer.posWrite; + //System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart); + } else { + fixedArrayStart = 0; + } + + nodeCount++; + arcCount += node.numArcs; + + final int lastArc = node.numArcs-1; + + int lastArcStart = writer.posWrite; + int maxBytesPerArc = 0; + for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; + final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; + int flags = 0; + + if (arcIdx == lastArc) { + flags += BIT_LAST_ARC; + } + + if (lastFrozenNode == target.address && !doFixedArray) { + flags += BIT_TARGET_NEXT; + } + + if (arc.isFinal) { + flags += BIT_FINAL_ARC; + if (arc.nextFinalOutput != NO_OUTPUT) { + flags += BIT_ARC_HAS_FINAL_OUTPUT; + } + } else { + 
assert arc.nextFinalOutput == NO_OUTPUT; + } + + boolean targetHasArcs = target.address > 0; + + if (!targetHasArcs) { + flags += BIT_STOP_NODE; + } + + if (arc.output != NO_OUTPUT) { + flags += BIT_ARC_HAS_OUTPUT; + } + + writer.writeByte((byte) flags); + writeLabel(arc.label); + + //System.out.println(" write arc: label=" + arc.label + " flags=" + flags); + + if (arc.output != NO_OUTPUT) { + outputs.write(arc.output, writer); + arcWithOutputCount++; + } + if (arc.nextFinalOutput != NO_OUTPUT) { + outputs.write(arc.nextFinalOutput, writer); + } + + if (targetHasArcs && (doFixedArray || lastFrozenNode != target.address)) { + assert target.address > 0; + writer.writeInt(target.address); + } + + // just write the arcs "like normal" on first pass, + // but record how many bytes each one took, and max + // byte size: + if (doFixedArray) { + bytesPerArc[arcIdx] = writer.posWrite - lastArcStart; + lastArcStart = writer.posWrite; + maxBytesPerArc = Math.max(maxBytesPerArc, bytesPerArc[arcIdx]); + //System.out.println(" bytes=" + bytesPerArc[arcIdx]); + } + } + + if (doFixedArray) { + assert maxBytesPerArc > 0; + // 2nd pass just "expands" all arcs to take up a fixed + // byte size + final int sizeNeeded = fixedArrayStart + node.numArcs * maxBytesPerArc; + bytes = ArrayUtil.grow(bytes, sizeNeeded); + if (maxBytesPerArc > 255) { + throw new IllegalStateException("max arc size is too large (" + maxBytesPerArc + ")"); + } + bytes[fixedArrayStart-1] = (byte) maxBytesPerArc; + + // expand the arcs in place, backwards + int srcPos = writer.posWrite; + int destPos = fixedArrayStart + node.numArcs*maxBytesPerArc; + writer.posWrite = destPos; + for(int arcIdx=node.numArcs-1;arcIdx>=0;arcIdx--) { + //System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos); + destPos -= maxBytesPerArc; + srcPos -= bytesPerArc[arcIdx]; + if (srcPos != destPos) { + assert destPos > srcPos; + System.arraycopy(bytes, srcPos, bytes, destPos, bytesPerArc[arcIdx]); + } + 
} + } + + // reverse bytes in-place; we do this so that the + // "BIT_TARGET_NEXT" opto can work, ie, it reads the + // node just before the current one + final int endAddress = lastFrozenNode = writer.posWrite - 1; + + int left = startAddress; + int right = endAddress; + while (left < right) { + final byte b = bytes[left]; + bytes[left++] = bytes[right]; + bytes[right--] = b; + } + + return endAddress; + } + + /** Fills virtual 'start' arc, ie, an empty incoming arc to + * the FST's start node */ + public Arc getFirstArc(Arc arc) { + if (emptyOutput != null) { + arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; + arc.nextFinalOutput = emptyOutput; + } else { + arc.flags = BIT_LAST_ARC; + arc.nextFinalOutput = NO_OUTPUT; + } + arc.output = NO_OUTPUT; + + // If there are no nodes, ie, the FST only accepts the + // empty string, then startNode is 0, and then readFirstTargetArc + arc.target = startNode; + return arc; + } + + /** Follows the follow arc and reads the last + * arc of its target; this changes the provided + * arc (2nd arg) in-place and returns it. + * + * @return Returns the second argument + * (arc). 
*/ + public Arc readLastTargetArc(Arc follow, Arc arc) throws IOException { + //System.out.println("readLast"); + if (!targetHasArcs(follow)) { + //System.out.println(" end node"); + assert follow.isFinal(); + arc.label = -1; + arc.output = follow.nextFinalOutput; + arc.flags = BIT_LAST_ARC; + return arc; + } else { + final BytesReader in = getBytesReader(follow.target); + arc.flags = in.readByte(); + if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) { + // array: jump straight to end + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readByte() & 0xFF; + //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); + arc.posArcsStart = in.pos; + arc.arcIdx = arc.numArcs - 2; + } else { + // non-array: linear scan + arc.bytesPerArc = 0; + //System.out.println(" scan"); + while(!arc.isLast()) { + // skip this arc: + readLabel(in); + if (arc.flag(BIT_ARC_HAS_OUTPUT)) { + outputs.read(in); + } + if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.read(in); + } + if (arc.flag(BIT_STOP_NODE)) { + } else if (arc.flag(BIT_TARGET_NEXT)) { + } else { + in.pos -= 4; + } + arc.flags = in.readByte(); + } + arc.nextArc = in.pos+1; + } + readNextRealArc(arc); + assert arc.isLast(); + return arc; + } + } + + /** + * Follow the follow arc and read the first arc of its target; + * this changes the provided arc (2nd arg) in-place and returns + * it. + * + * @return Returns the second argument (arc). 
+ */ + public Arc readFirstTargetArc(Arc follow, Arc arc) throws IOException { + //int pos = address; + //System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + follow.isFinal()); + if (follow.isFinal()) { + // Insert "fake" final first arc: + arc.label = -1; + arc.output = follow.nextFinalOutput; + if (follow.target <= 0) { + arc.flags = BIT_LAST_ARC; + } else { + arc.flags = 0; + arc.nextArc = follow.target; + } + //System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + arc.isLast() + " output=" + outputs.outputToString(arc.output)); + return arc; + } else { + return readFirstRealArc(follow.target, arc); + } + } + + // Not private because NodeHash needs access: + Arc readFirstRealArc(int address, Arc arc) throws IOException { + + final BytesReader in = getBytesReader(address); + + arc.flags = in.readByte(); + + if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) { + //System.out.println(" fixedArray"); + // this is first arc in a fixed-array + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readByte() & 0xFF; + arc.arcIdx = -1; + arc.nextArc = arc.posArcsStart = in.pos; + //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos); + } else { + arc.nextArc = address; + arc.bytesPerArc = 0; + } + return readNextRealArc(arc); + } + + /** + * Checks if arc's target state is in expanded (or vector) format. + * + * @return Returns true if arc points to a state in an + * expanded array format. + */ + boolean isExpandedTarget(Arc follow) throws IOException { + if (!targetHasArcs(follow)) { + return false; + } else { + final BytesReader in = getBytesReader(follow.target); + final byte b = in.readByte(); + return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0; + } + } + + /** In-place read; returns the arc. 
*/ + public Arc readNextArc(Arc arc) throws IOException { + if (arc.label == -1) { + // This was a fake inserted "final" arc + if (arc.nextArc <= 0) { + // This arc went to virtual final node, ie has no outgoing arcs + return null; + } + return readFirstRealArc(arc.nextArc, arc); + } else { + return readNextRealArc(arc); + } + } + + /** Peeks at next arc's label; does not alter arc. Do + * not call this if arc.isLast()! */ + public int readNextArcLabel(Arc arc) throws IOException { + assert !arc.isLast(); + + final BytesReader in; + if (arc.label == END_LABEL) { + //System.out.println(" nextArc fake " + arc.nextArc); + in = getBytesReader(arc.nextArc); + byte flags = bytes[in.pos]; + if (flag(flags, BIT_ARCS_AS_FIXED_ARRAY)) { + //System.out.println(" nextArc fake array"); + in.pos--; + in.readVInt(); + in.readByte(); + } + } else { + if (arc.bytesPerArc != 0) { + //System.out.println(" nextArc real array"); + // arcs are at fixed entries + in = getBytesReader(arc.posArcsStart - (1+arc.arcIdx)*arc.bytesPerArc); + } else { + // arcs are packed + //System.out.println(" nextArc real packed"); + in = getBytesReader(arc.nextArc); + } + } + // skip flags + in.readByte(); + return readLabel(in); + } + + Arc readNextRealArc(Arc arc) throws IOException { + // this is a continuing arc in a fixed array + final BytesReader in; + if (arc.bytesPerArc != 0) { + // arcs are at fixed entries + arc.arcIdx++; + assert arc.arcIdx < arc.numArcs; + in = getBytesReader(arc.posArcsStart - arc.arcIdx*arc.bytesPerArc); + } else { + // arcs are packed + in = getBytesReader(arc.nextArc); + } + arc.flags = in.readByte(); + arc.label = readLabel(in); + + if (arc.flag(BIT_ARC_HAS_OUTPUT)) { + arc.output = outputs.read(in); + } else { + arc.output = outputs.getNoOutput(); + } + + if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + arc.nextFinalOutput = outputs.read(in); + } else { + arc.nextFinalOutput = outputs.getNoOutput(); + } + + if (arc.flag(BIT_STOP_NODE)) { + if (arc.flag(BIT_FINAL_ARC)) { + 
arc.target = FINAL_END_NODE; + } else { + arc.target = NON_FINAL_END_NODE; + } + arc.nextArc = in.pos; + } else if (arc.flag(BIT_TARGET_NEXT)) { + arc.nextArc = in.pos; + if (!arc.flag(BIT_LAST_ARC)) { + if (arc.bytesPerArc == 0) { + // must scan + seekToNextNode(in); + } else { + in.pos = arc.posArcsStart - arc.bytesPerArc * arc.numArcs; + } + } + arc.target = in.pos; + } else { + arc.target = in.readInt(); + arc.nextArc = in.pos; + } + + return arc; + } + + /** Finds an arc leaving the incoming arc, replacing the arc in place. + * This returns null if the arc was not found, else the incoming arc. */ + public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc) throws IOException { + + if (labelToMatch == END_LABEL) { + if (follow.isFinal()) { + arc.output = follow.nextFinalOutput; + arc.label = END_LABEL; + return arc; + } else { + return null; + } + } + + if (!targetHasArcs(follow)) { + return null; + } + + // TODO: maybe make an explicit thread state that holds + // reusable stuff eg BytesReader: + final BytesReader in = getBytesReader(follow.target); + + if ((in.readByte() & BIT_ARCS_AS_FIXED_ARRAY) != 0) { + // Arcs are full array; do binary search: + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readByte() & 0xFF; + arc.posArcsStart = in.pos; + int low = 0; + int high = arc.numArcs-1; + while (low <= high) { + int mid = (low + high) >>> 1; + in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; + int midLabel = readLabel(in); + final int cmp = midLabel - labelToMatch; + if (cmp < 0) + low = mid + 1; + else if (cmp > 0) + high = mid - 1; + else { + arc.arcIdx = mid-1; + return readNextRealArc(arc); + } + } + + return null; + } + + // Linear scan + readFirstTargetArc(follow, arc); + while(true) { + if (arc.label == labelToMatch) { + return arc; + } else if (arc.label > labelToMatch) { + return null; + } else if (arc.isLast()) { + return null; + } else { + readNextArc(arc); + } + } + } + + private void seekToNextNode(BytesReader in) throws IOException 
{ + + while(true) { + + final int flags = in.readByte(); + readLabel(in); + + if (flag(flags, BIT_ARC_HAS_OUTPUT)) { + outputs.read(in); + } + + if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.read(in); + } + + if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) { + in.readInt(); + } + + if (flag(flags, BIT_LAST_ARC)) { + return; + } + } + } + + public int getNodeCount() { + // 1+ in order to count the -1 implicit final node + return 1+nodeCount; + } + + public int getArcCount() { + return arcCount; + } + + public int getArcWithOutputCount() { + return arcWithOutputCount; + } + + /** + * Nodes will be expanded if their depth (distance from the root node) is + * <= this value and their number of arcs is >= + * {@link #FIXED_ARRAY_NUM_ARCS_SHALLOW}. + * + *

+ * Fixed array consumes more RAM but enables binary search on the arcs + * (instead of a linear scan) on lookup by arc label. + * + * @return true if node should be stored in an + * expanded (array) form. + * + * @see #FIXED_ARRAY_NUM_ARCS_DEEP + * @see Builder.UnCompiledNode#depth + */ + private boolean shouldExpand(UnCompiledNode node) { + return (node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.numArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW) || + node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP; + } + + // Non-static: writes to FST's byte[] + class BytesWriter extends DataOutput { + int posWrite; + + public BytesWriter() { + // pad: ensure no node gets address 0 which is reserved to mean + // the stop state w/ no arcs + posWrite = 1; + } + + @Override + public void writeByte(byte b) { + if (bytes.length == posWrite) { + bytes = ArrayUtil.grow(bytes); + } + assert posWrite < bytes.length: "posWrite=" + posWrite + " bytes.length=" + bytes.length; + bytes[posWrite++] = b; + } + + @Override + public void writeBytes(byte[] b, int offset, int length) { + final int size = posWrite + length; + bytes = ArrayUtil.grow(bytes, size); + System.arraycopy(b, offset, bytes, posWrite, length); + posWrite += length; + } + } + + final BytesReader getBytesReader(int pos) { + // TODO: maybe re-use via ThreadLocal? + return new BytesReader(pos); + } + + // Non-static: reads byte[] from FST + class BytesReader extends DataInput { + int pos; + + public BytesReader(int pos) { + this.pos = pos; + } + + @Override + public byte readByte() { + return bytes[pos--]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + for(int i=0;i { + + static final int HASH_PRIME = 31; + public static final byte[] EMPTY_BYTES = new byte[0]; + + /** The contents of the BytesRef. Should never be {@code null}. */ + public byte[] bytes; + + /** Offset of first valid byte. */ + public int offset; + + /** Length of used bytes. 
*/ + public int length; + + public BytesRef() { + bytes = EMPTY_BYTES; + } + + /** This instance will directly reference bytes w/o making a copy. + * bytes should not be null. + */ + public BytesRef(byte[] bytes, int offset, int length) { + assert bytes != null; + this.bytes = bytes; + this.offset = offset; + this.length = length; + } + + /** This instance will directly reference bytes w/o making a copy. + * bytes should not be null */ + public BytesRef(byte[] bytes) { + assert bytes != null; + this.bytes = bytes; + this.offset = 0; + this.length = bytes.length; + } + + public BytesRef(int capacity) { + this.bytes = new byte[capacity]; + } + + /** + * @param text Initialize the byte[] from the UTF8 bytes + * for the provided Sring. This must be well-formed + * unicode text, with no unpaired surrogates or U+FFFF. + */ + public BytesRef(CharSequence text) { + this(); + copy(text); + } + + /** + * @param text Initialize the byte[] from the UTF8 bytes + * for the provided array. This must be well-formed + * unicode text, with no unpaired surrogates or U+FFFF. + */ + public BytesRef(char text[], int offset, int length) { + this(length * 4); + copy(text, offset, length); + } + + public BytesRef(BytesRef other) { + this(); + copy(other); + } + + /* // maybe? + public BytesRef(BytesRef other, boolean shallow) { + this(); + if (shallow) { + offset = other.offset; + length = other.length; + bytes = other.bytes; + } else { + copy(other); + } + } + */ + + /** + * Copies the UTF8 bytes for this string. + * + * @param text Must be well-formed unicode text, with no + * unpaired surrogates or invalid UTF16 code units. + */ + public void copy(CharSequence text) { + UnicodeUtil.UTF16toUTF8(text, 0, text.length(), this); + } + + /** + * Copies the UTF8 bytes for this string. + * + * @param text Must be well-formed unicode text, with no + * unpaired surrogates or invalid UTF16 code units. 
+ */ + public void copy(char text[], int offset, int length) { + UnicodeUtil.UTF16toUTF8(text, offset, length, this); + } + + public boolean bytesEquals(BytesRef other) { + if (length == other.length) { + int otherUpto = other.offset; + final byte[] otherBytes = other.bytes; + final int end = offset + length; + for(int upto=offset;uptoIt is defined as: + *

+   *  int hash = 0;
+   *  for (int i = offset; i < offset + length; i++) {
+   *    hash = 31*hash + bytes[i];
+   *  }
+   * 
+ */ + @Override + public int hashCode() { + int result = 0; + final int end = offset + length; + for(int i=offset;i offset) { + sb.append(' '); + } + sb.append(Integer.toHexString(bytes[i]&0xff)); + } + sb.append(']'); + return sb.toString(); + } + + public void copy(BytesRef other) { + if (bytes.length < other.length) { + bytes = new byte[other.length]; + } + System.arraycopy(other.bytes, other.offset, bytes, 0, other.length); + length = other.length; + offset = 0; + } + + public void append(BytesRef other) { + int newLen = length + other.length; + if (bytes.length < newLen) { + byte[] newBytes = new byte[newLen]; + System.arraycopy(bytes, offset, newBytes, 0, length); + offset = 0; + bytes = newBytes; + } + System.arraycopy(other.bytes, other.offset, bytes, length+offset, other.length); + length = newLen; + } + + public void grow(int newLength) { + bytes = ArrayUtil.grow(bytes, newLength); + } + + /** Unsigned byte order comparison */ + public int compareTo(BytesRef other) { + if (this == other) return 0; + + final byte[] aBytes = this.bytes; + int aUpto = this.offset; + final byte[] bBytes = other.bytes; + int bUpto = other.offset; + + final int aStop = aUpto + Math.min(this.length, other.length); + + while(aUpto < aStop) { + int aByte = aBytes[aUpto++] & 0xff; + int bByte = bBytes[bUpto++] & 0xff; + int diff = aByte - bByte; + if (diff != 0) return diff; + } + + // One is a prefix of the other, or, they are equal: + return this.length - other.length; + } + + private final static Comparator utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator(); + + public static Comparator getUTF8SortedAsUnicodeComparator() { + return utf8SortedAsUnicodeSortOrder; + } + + private static class UTF8SortedAsUnicodeComparator implements Comparator { + // Only singleton + private UTF8SortedAsUnicodeComparator() {}; + + public int compare(BytesRef a, BytesRef b) { + final byte[] aBytes = a.bytes; + int aUpto = a.offset; + final byte[] bBytes = b.bytes; + int bUpto = 
b.offset; + + final int aStop; + if (a.length < b.length) { + aStop = aUpto + a.length; + } else { + aStop = aUpto + b.length; + } + + while(aUpto < aStop) { + int aByte = aBytes[aUpto++] & 0xff; + int bByte = bBytes[bUpto++] & 0xff; + + int diff = aByte - bByte; + if (diff != 0) { + return diff; + } + } + + // One is a prefix of the other, or, they are equal: + return a.length - b.length; + } + } + + private final static Comparator utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator(); + + public static Comparator getUTF8SortedAsUTF16Comparator() { + return utf8SortedAsUTF16SortOrder; + } + + private static class UTF8SortedAsUTF16Comparator implements Comparator { + // Only singleton + private UTF8SortedAsUTF16Comparator() {}; + + public int compare(BytesRef a, BytesRef b) { + + final byte[] aBytes = a.bytes; + int aUpto = a.offset; + final byte[] bBytes = b.bytes; + int bUpto = b.offset; + + final int aStop; + if (a.length < b.length) { + aStop = aUpto + a.length; + } else { + aStop = aUpto + b.length; + } + + while(aUpto < aStop) { + int aByte = aBytes[aUpto++] & 0xff; + int bByte = bBytes[bUpto++] & 0xff; + + if (aByte != bByte) { + + // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order + + // We know the terms are not equal, but, we may + // have to carefully fixup the bytes at the + // difference to match UTF16's sort order: + + // NOTE: instead of moving supplementary code points (0xee and 0xef) to the unused 0xfe and 0xff, + // we move them to the unused 0xfc and 0xfd [reserved for future 6-byte character sequences] + // this reserves 0xff for preflex's term reordering (surrogate dance), and if unicode grows such + // that 6-byte sequences are needed we have much bigger problems anyway. 
+ if (aByte >= 0xee && bByte >= 0xee) { + if ((aByte & 0xfe) == 0xee) { + aByte += 0xe; + } + if ((bByte&0xfe) == 0xee) { + bByte += 0xe; + } + } + return aByte - bByte; + } + } + + // One is a prefix of the other, or, they are equal: + return a.length - b.length; + } + } +} Property changes on: lucene/src/java/org/apache/lucene/util/BytesRef.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/UnicodeUtil.java =================================================================== --- lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (revision 1127319) +++ lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (working copy) @@ -109,6 +109,10 @@ private static final long HALF_SHIFT = 10; private static final long HALF_MASK = 0x3FFL; + private static final int SURROGATE_OFFSET = + Character.MIN_SUPPLEMENTARY_CODE_POINT - + (UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START; + /** * @lucene.internal */ @@ -304,6 +308,114 @@ result.length = upto; } + /** Encode characters from this String, starting at offset + * for length characters. After encoding, result.offset will always be 0. 
+ */ + public static void UTF16toUTF8(final CharSequence s, final int offset, final int length, BytesRef result) { + final int end = offset + length; + + byte[] out = result.bytes; + result.offset = 0; + // Pre-allocate for worst case 4-for-1 + final int maxLen = length * 4; + if (out.length < maxLen) + out = result.bytes = new byte[maxLen]; + + int upto = 0; + for(int i=offset;i> 6)); + out[upto++] = (byte)(0x80 | (code & 0x3F)); + } else if (code < 0xD800 || code > 0xDFFF) { + out[upto++] = (byte)(0xE0 | (code >> 12)); + out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)); + out[upto++] = (byte)(0x80 | (code & 0x3F)); + } else { + // surrogate pair + // confirm valid high surrogate + if (code < 0xDC00 && (i < end-1)) { + int utf32 = (int) s.charAt(i+1); + // confirm valid low surrogate and write pair + if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { + utf32 = (code << 10) + utf32 + SURROGATE_OFFSET; + i++; + out[upto++] = (byte)(0xF0 | (utf32 >> 18)); + out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)); + out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)); + out[upto++] = (byte)(0x80 | (utf32 & 0x3F)); + continue; + } + } + // replace unpaired surrogate or out-of-order low surrogate + // with substitution character + out[upto++] = (byte) 0xEF; + out[upto++] = (byte) 0xBF; + out[upto++] = (byte) 0xBD; + } + } + //assert matches(s, offset, length, out, upto); + result.length = upto; + } + + /** Encode characters from a char[] source, starting at + * offset for length chars. After encoding, result.offset will always be 0. 
+ */ + public static void UTF16toUTF8(final char[] source, final int offset, final int length, BytesRef result) { + + int upto = 0; + int i = offset; + final int end = offset + length; + byte[] out = result.bytes; + // Pre-allocate for worst case 4-for-1 + final int maxLen = length * 4; + if (out.length < maxLen) + out = result.bytes = new byte[maxLen]; + result.offset = 0; + + while(i < end) { + + final int code = (int) source[i++]; + + if (code < 0x80) + out[upto++] = (byte) code; + else if (code < 0x800) { + out[upto++] = (byte) (0xC0 | (code >> 6)); + out[upto++] = (byte)(0x80 | (code & 0x3F)); + } else if (code < 0xD800 || code > 0xDFFF) { + out[upto++] = (byte)(0xE0 | (code >> 12)); + out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)); + out[upto++] = (byte)(0x80 | (code & 0x3F)); + } else { + // surrogate pair + // confirm valid high surrogate + if (code < 0xDC00 && i < end) { + int utf32 = (int) source[i]; + // confirm valid low surrogate and write pair + if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { + utf32 = (code << 10) + utf32 + SURROGATE_OFFSET; + i++; + out[upto++] = (byte)(0xF0 | (utf32 >> 18)); + out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)); + out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)); + out[upto++] = (byte)(0x80 | (utf32 & 0x3F)); + continue; + } + } + // replace unpaired surrogate or out-of-order low surrogate + // with substitution character + out[upto++] = (byte) 0xEF; + out[upto++] = (byte) 0xBF; + out[upto++] = (byte) 0xBD; + } + } + //assert matches(source, offset, length, out, upto); + result.length = upto; + } + /** Convert UTF8 bytes into UTF16 characters. 
If offset * is non-zero, conversion starts at that starting point * in utf8, re-using the results from the previous call Index: lucene/src/test-framework/org/apache/lucene/store/MockIndexOutputWrapper.java =================================================================== --- lucene/src/test-framework/org/apache/lucene/store/MockIndexOutputWrapper.java (revision 1127319) +++ lucene/src/test-framework/org/apache/lucene/store/MockIndexOutputWrapper.java (working copy) @@ -151,7 +151,7 @@ } @Override - public void copyBytes(IndexInput input, long numBytes) throws IOException { + public void copyBytes(DataInput input, long numBytes) throws IOException { delegate.copyBytes(input, numBytes); // TODO: we may need to check disk full here as well dir.maybeThrowDeterministicException(); Index: lucene/src/test-framework/org/apache/lucene/util/ThrottledIndexOutput.java =================================================================== --- lucene/src/test-framework/org/apache/lucene/util/ThrottledIndexOutput.java (revision 1127319) +++ lucene/src/test-framework/org/apache/lucene/util/ThrottledIndexOutput.java (working copy) @@ -18,7 +18,7 @@ */ import java.io.IOException; -import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexOutput; public class ThrottledIndexOutput extends IndexOutput { @@ -141,7 +141,7 @@ } @Override - public void copyBytes(IndexInput input, long numBytes) throws IOException { + public void copyBytes(DataInput input, long numBytes) throws IOException { delegate.copyBytes(input, numBytes); } }