diff -r 2b956f1d334f lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java
--- a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java	Fri Apr 29 17:25:46 2011 -0400
+++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java	Sat Apr 30 16:57:26 2011 -0400
@@ -261,9 +261,12 @@
     add(scratchIntsRef, output);
   }
 
+  /** It's OK to add the same input twice in a row with
+   *  different outputs, as long as outputs implements the merge
+   *  method. */
   public void add(IntsRef input, T output) throws IOException {
     //System.out.println("\nFST ADD: input=" + input + " output=" + fst.outputs.outputToString(output));
-    assert lastInput.length == 0 || input.compareTo(lastInput) > 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input;
+    assert lastInput.length == 0 || input.compareTo(lastInput) >= 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input;
     assert validOutput(output);
 
     //System.out.println("\nadd: " + input);
@@ -347,8 +350,15 @@
       assert validOutput(output);
     }
 
-    // push remaining output:
-    frontier[prefixLenPlus1-1].setLastOutput(input.ints[input.offset + prefixLenPlus1-1], output);
+    if (lastInput.length == input.length && prefixLenPlus1 == 1+input.length) {
+      // same input more than 1 time in a row, mapping to
+      // multiple outputs
+      lastNode.output = fst.outputs.merge(lastNode.output, output);
+    } else {
+      // this new arc is private to this new input; set its
+      // arc output to the leftover output:
+      frontier[prefixLenPlus1-1].setLastOutput(input.ints[input.offset + prefixLenPlus1-1], output);
+    }
 
     // save last input
     lastInput.copy(input);
diff -r 2b956f1d334f lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java
--- a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java	Fri Apr 29 17:25:46 2011 -0400
+++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java	Sat Apr 30 16:57:26 2011 -0400
@@ -231,10 +231,13 @@
   }
 
   void setEmptyOutput(T v) throws IOException {
-    if (emptyOutput != null && !emptyOutput.equals(v)) {
-      throw new IllegalStateException("empty output is already set: " + outputs.outputToString(emptyOutput) + " vs " + outputs.outputToString(v));
+    if (emptyOutput != null) {
+      // the empty input was added more than once: always merge,
+      // just as Builder.add does for a repeated non-empty input
+      emptyOutput = outputs.merge(emptyOutput, v);
+    } else {
+      emptyOutput = v;
     }
-    emptyOutput = v;
 
     // TODO: this is messy -- replace with sillyBytesWriter; maybe make
     // bytes private
diff -r 2b956f1d334f lucene/src/java/org/apache/lucene/util/automaton/fst/Outputs.java
--- a/lucene/src/java/org/apache/lucene/util/automaton/fst/Outputs.java	Fri Apr 29 17:25:46 2011 -0400
+++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Outputs.java	Sat Apr 30 16:57:26 2011 -0400
@@ -54,4 +54,12 @@
   public abstract T getNoOutput();
 
   public abstract String outputToString(T output);
+
+  /** Merges two outputs that were added for the same
+   *  input.  This default implementation always throws
+   *  {@link UnsupportedOperationException}; an Outputs impl
+   *  that accepts duplicate inputs must override it. */
+  public T merge(T first, T second) {
+    throw new UnsupportedOperationException();
+  }
 }
diff -r 2b956f1d334f lucene/src/java/org/apache/lucene/util/automaton/fst/PositiveIntOutputs.java
--- a/lucene/src/java/org/apache/lucene/util/automaton/fst/PositiveIntOutputs.java	Fri Apr 29 17:25:46 2011 -0400
+++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/PositiveIntOutputs.java	Sat Apr 30 16:57:26 2011 -0400
@@ -22,14 +22,11 @@
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 
-// TODO: make a sharing and non-sharing variant; eg if you
-// output docFreq per term the FST will be smaller if you
-// don't share since they are not "well shared"
-
 /**
  * Output is a long, for each input term.  NOTE: the
  * resulting FST is not guaranteed to be minimal!  See
- * {@link Builder}.
+ * {@link Builder}.  You cannot store 0 output with this
+ * (that's reserved to mean "no output")!
  * @lucene.experimental
  */
 
diff -r 2b956f1d334f lucene/src/java/org/apache/lucene/util/automaton/fst/UpToTwoPositiveIntOutputs.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/UpToTwoPositiveIntOutputs.java	Sat Apr 30 16:57:26 2011 -0400
@@ -0,0 +1,224 @@
+package org.apache.lucene.util.automaton.fst;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+
+/**
+ * Holds one or two longs for each input term.  If it's a
+ * single output, Long is returned; else, TwoLongs.  Order
+ * is preserved in the TwoLongs case, ie .first is the first
+ * input/output added to Builder, and .second is the
+ * second.  You cannot store 0 output with this (that's
+ * reserved to mean "no output")!
+ *
+ * NOTE: the resulting FST is not guaranteed to be minimal!
+ * See {@link Builder}.
+ *
+ * @lucene.experimental
+ */
+
+public final class UpToTwoPositiveIntOutputs extends Outputs<Object> {
+
+  public final static class TwoLongs {
+    final long first;
+    final long second;
+
+    public TwoLongs(long first, long second) {
+      this.first = first;
+      this.second = second;
+      assert first >= 0;
+      assert second >= 0;
+    }
+
+    @Override
+    public String toString() {
+      return "TwoLongs:" + first + "," + second;
+    }
+
+    @Override
+    public boolean equals(Object _other) {
+      if (_other instanceof TwoLongs) {
+        final TwoLongs other = (TwoLongs) _other;
+        return first == other.first && second == other.second;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public int hashCode() {
+      return (int) ((first^(first>>>32)) ^ (second^(second>>>32)));
+    }
+  }
+
+  private final static Long NO_OUTPUT = new Long(0);
+
+  private final boolean doShare;
+
+  private final static UpToTwoPositiveIntOutputs singletonShare = new UpToTwoPositiveIntOutputs(true);
+  private final static UpToTwoPositiveIntOutputs singletonNoShare = new UpToTwoPositiveIntOutputs(false);
+
+  private UpToTwoPositiveIntOutputs(boolean doShare) {
+    this.doShare = doShare;
+  }
+
+  public static UpToTwoPositiveIntOutputs getSingleton(boolean doShare) {
+    return doShare ? singletonShare : singletonNoShare;
+  }
+
+  public Long get(long v) {
+    if (v == 0) {
+      return NO_OUTPUT;
+    } else {
+      return Long.valueOf(v);
+    }
+  }
+
+  public TwoLongs get(long first, long second) {
+    return new TwoLongs(first, second);
+  }
+
+  @Override
+  public Long common(Object _output1, Object _output2) {
+    assert valid(_output1, false);
+    assert valid(_output2, false);
+    final Long output1 = (Long) _output1;
+    final Long output2 = (Long) _output2;
+    if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) {
+      return NO_OUTPUT;
+    } else if (doShare) {
+      assert output1 > 0;
+      assert output2 > 0;
+      return Math.min(output1, output2);
+    } else if (output1.equals(output2)) {
+      return output1;
+    } else {
+      return NO_OUTPUT;
+    }
+  }
+
+  @Override
+  public Long subtract(Object _output, Object _inc) {
+    assert valid(_output, false);
+    assert valid(_inc, false);
+    final Long output = (Long) _output;
+    final Long inc = (Long) _inc;
+    assert output >= inc;
+
+    if (inc == NO_OUTPUT) {
+      return output;
+    } else if (output.equals(inc)) {
+      return NO_OUTPUT;
+    } else {
+      return output - inc;
+    }
+  }
+
+  @Override
+  public Object add(Object _prefix, Object _output) {
+    assert valid(_prefix, false);
+    assert valid(_output, true);
+    final Long prefix = (Long) _prefix;
+    if (_output instanceof Long) {
+      final Long output = (Long) _output;
+      if (prefix == NO_OUTPUT) {
+        return output;
+      } else if (output == NO_OUTPUT) {
+        return prefix;
+      } else {
+        return prefix + output;
+      }
+    } else {
+      final TwoLongs output = (TwoLongs) _output;
+      final long v = prefix;
+      return new TwoLongs(output.first + v, output.second + v);
+    }
+  }
+
+  @Override
+  public void write(Object _output, DataOutput out) throws IOException {
+    assert valid(_output, true);
+    if (_output instanceof Long) {
+      final Long output = (Long) _output;
+      out.writeVLong(output<<1);
+    } else {
+      final TwoLongs output = (TwoLongs) _output;
+      out.writeVLong((output.first<<1) | 1);
+      out.writeVLong(output.second);
+    }
+  }
+
+  @Override
+  public Object read(DataInput in) throws IOException {
+    final long code = in.readVLong();
+    if ((code & 1) == 0) {
+      // single long
+      final long v = code >>> 1;
+      if (v == 0) {
+        return NO_OUTPUT;
+      } else {
+        return Long.valueOf(v);
+      }
+    } else {
+      // two longs
+      final long first = code >>> 1;
+      final long second = in.readVLong();
+      return new TwoLongs(first, second);
+    }
+  }
+
+  private boolean valid(Long o) {
+    assert o != null;
+    assert o instanceof Long;
+    assert o == NO_OUTPUT || o > 0;
+    return true;
+  }
+
+  // Used only by assert
+  private boolean valid(Object _o, boolean allowDouble) {
+    if (!allowDouble) {
+      assert _o instanceof Long;
+      return valid((Long) _o);
+    } else if (_o instanceof TwoLongs) {
+      return true;
+    } else {
+      return valid((Long) _o);
+    }
+  }
+
+  @Override
+  public Object getNoOutput() {
+    return NO_OUTPUT;
+  }
+
+  @Override
+  public String outputToString(Object output) {
+    return output.toString();
+  }
+
+  @Override
+  public Object merge(Object first, Object second) {
+    assert valid(first, false);
+    assert valid(second, false);
+    return new TwoLongs((Long) first, (Long) second);
+  }
+}
diff -r 2b956f1d334f lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java
--- a/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java	Fri Apr 29 17:25:46 2011 -0400
+++ b/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java	Sat Apr 30 16:57:26 2011 -0400
@@ -288,6 +288,36 @@
       }
       new FSTTester(random, dir, inputMode, pairs, outputs).doTest();
     }
+
+    // Up to two positive ints, shared, generally but not
+    // monotonically increasing
+    {
+      if (VERBOSE) {
+        System.out.println("TEST: now test UpToTwoPositiveIntOutputs");
+      }
+      final UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true);
+      final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length);
+      long lastOutput = 0;
+      for(int idx=0;idx<terms.length;idx++) {
+        // Sometimes go backwards
+        long value = lastOutput + _TestUtil.nextInt(random, -100, 1000);
+        while(value < 0) {
+          value = lastOutput + _TestUtil.nextInt(random, -100, 1000);
+        }
+        final Object output;
+        if (random.nextInt(5) == 3) {
+          long value2 = lastOutput + _TestUtil.nextInt(random, -100, 1000);
+          while(value2 < 0) {
+            value2 = lastOutput + _TestUtil.nextInt(random, -100, 1000);
+          }
+          output = outputs.get(value, value2);
+        } else {
+          output = outputs.get(value);
+        }
+        pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
+      }
+      new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
+    }
   }
 
   private static class FSTTester<T> {
@@ -328,11 +358,13 @@
       // no pruning
       doTest(0, 0);
 
-      // simple pruning
-      doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0);
-
-      // leafy pruning
-      doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()));
+      if (!(outputs instanceof UpToTwoPositiveIntOutputs)) {
+        // simple pruning
+        doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0);
+
+        // leafy pruning
+        doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()));
+      }
     }
 
     // runs the term, returning the output, or null if term
@@ -421,7 +453,14 @@
                                                 prune1==0 && prune2==0, outputs);
 
       for(InputOutput<T> pair : pairs) {
-        builder.add(pair.input, pair.output);
+        if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
+          final UpToTwoPositiveIntOutputs _outputs = (UpToTwoPositiveIntOutputs) outputs;
+          final UpToTwoPositiveIntOutputs.TwoLongs twoLongs = (UpToTwoPositiveIntOutputs.TwoLongs) pair.output;
+          ((Builder<Object>) builder).add(pair.input, (Object) _outputs.get(twoLongs.first));
+          ((Builder<Object>) builder).add(pair.input, (Object) _outputs.get(twoLongs.second));
+        } else {
+          builder.add(pair.input, pair.output);
+        }
       }
       FST<T> fst = builder.finish();
 