Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java	(revision 1405787)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java	(working copy)
@@ -80,50 +80,6 @@
     assertEquals(sorted, actual);
   }
-  
-  public void testRaw() throws Exception {
-    int num = atLeast(10000);
-    
-    Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUnicodeComparator();
-    BytesRefHash sorted = new BytesRefHash();
-    TermFreq[] unsorted = new TermFreq[num];
-    byte[] buffer = new byte[0];
-    ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
-    
-    final Random random = new Random(random().nextLong());
-    for (int i = 0; i < num; i++) {
-      BytesRef spare;
-      long weight;
-      do {
-        spare = new BytesRef(_TestUtil.randomUnicodeString(random));
-        if (spare.length + 8 >= buffer.length) {
-          buffer = ArrayUtil.grow(buffer, spare.length + 8);
-        }
-        output.reset(buffer);
-        output.writeBytes(spare.bytes, spare.offset, spare.length);
-        weight = random.nextLong();
-        output.writeLong(weight);
-        
-      } while (sorted.add(new BytesRef(buffer, 0, output.getPosition())) < 0);
-      unsorted[i] = new TermFreq(spare, weight);
-    }
-    
-    // test the sorted iterator wrapper
-    TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator, true);
-    int[] sort = sorted.sort(comparator);
-    int size = sorted.size();
-    BytesRef spare = new BytesRef();
-    for (int i = 0; i < size; i++) {
-      sorted.get(sort[i], spare);
-      spare.length -= 8; // sub the long value
-      assertEquals(spare, wrapper.next());
-      spare.offset = spare.offset + spare.length;
-      spare.length = 8;
-      assertEquals(asLong(spare), wrapper.weight());
-    }
-    assertNull(wrapper.next());
-  }
-  
   public static long asLong(BytesRef b) {
     return (((long) asIntInternal(b, b.offset) << 32) | asIntInternal(b, b.offset + 4) & 0xFFFFFFFFL);
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java	(revision 1405787)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java	(working copy)
@@ -22,12 +22,13 @@
 import org.apache.lucene.search.suggest.Lookup.LookupResult;
 import org.apache.lucene.search.suggest.TermFreq;
 import org.apache.lucene.search.suggest.TermFreqArrayIterator;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
 
 public class WFSTCompletionTest extends LuceneTestCase {
   
-  public void test() throws Exception {
+  public void testBasic() throws Exception {
     TermFreq keys[] = new TermFreq[] {
       new TermFreq("foo", 50),
       new TermFreq("bar", 10),
@@ -194,4 +195,18 @@
       }
     }
   }
+
+  public void test0ByteKeys() throws Exception {
+    BytesRef key1 = new BytesRef(4);
+    key1.length = 4;
+    BytesRef key2 = new BytesRef(3);
+    key2.length = 3;
+
+    WFSTCompletionLookup suggester = new WFSTCompletionLookup(false);
+
+    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+          new TermFreq(key1, 50),
+          new TermFreq(key2, 50),
+        }));
+  }
 }
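
Note on the test0ByteKeys setup above: new BytesRef(int) only allocates capacity and leaves length at zero, which is why the test assigns the length explicitly to get keys made entirely of 0x0 bytes. A minimal standalone sketch of the same construction (the class name is illustrative, not part of the patch):

    import org.apache.lucene.util.BytesRef;

    // Sketch: all-zero lookup keys built the same way as in test0ByteKeys.
    public class ZeroByteKeyDemo {
      public static void main(String[] args) {
        // new BytesRef(4) allocates a 4-byte buffer (all zeros) but leaves length == 0,
        // so length must be assigned to get a 4-byte key of 0x0 bytes:
        BytesRef key1 = new BytesRef(4);
        key1.length = 4;

        BytesRef key2 = new BytesRef(3);
        key2.length = 3;

        System.out.println(key1.length + " " + key2.length); // prints: 4 3
      }
    }
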
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java	(revision 1405787)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java	(working copy)
@@ -19,10 +19,10 @@
 
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
 import java.io.InputStream;
-import java.io.FileOutputStream;
 import java.io.OutputStream;
-import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
@@ -39,6 +39,7 @@
 import org.apache.lucene.analysis.CannedBinaryTokenStream;
 import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockBytesAttributeFactory;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
@@ -503,6 +504,8 @@
     private int numStopChars;
     private boolean preserveHoles;
 
+    private final MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
+
     public MockTokenEatingAnalyzer(int numStopChars, boolean preserveHoles) {
       this.preserveHoles = preserveHoles;
       this.numStopChars = numStopChars;
@@ -510,7 +513,7 @@
 
     @Override
     public TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
+      MockTokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
       tokenizer.setEnableChecks(true);
       TokenStream next;
       if (numStopChars != 0) {
@@ -983,4 +986,49 @@
     assertEquals("b", results.get(1).key);
     assertEquals(5, results.get(1).value);
   }
+
+  public void test0ByteKeys() throws Exception {
+    final Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+
+          return new TokenStreamComponents(tokenizer) {
+            int tokenStreamCounter = 0;
+            final TokenStream[] tokenStreams = new TokenStream[] {
+              new CannedBinaryTokenStream(new BinaryToken[] {
+                  token(new BytesRef(new byte[] {0x0, 0x0, 0x0})),
+                }),
+              new CannedBinaryTokenStream(new BinaryToken[] {
+                  token(new BytesRef(new byte[] {0x0, 0x0})),
+                }),
+              new CannedBinaryTokenStream(new BinaryToken[] {
+                  token(new BytesRef(new byte[] {0x0, 0x0, 0x0})),
+                }),
+              new CannedBinaryTokenStream(new BinaryToken[] {
+                  token(new BytesRef(new byte[] {0x0, 0x0})),
+                }),
+            };
+
+            @Override
+            public TokenStream getTokenStream() {
+              TokenStream result = tokenStreams[tokenStreamCounter];
+              tokenStreamCounter++;
+              return result;
+            }
+
+            @Override
+            protected void setReader(final Reader reader) throws IOException {
+            }
+          };
+        }
+      };
+
+    AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1);
+
+    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+          new TermFreq("a a", 50),
+          new TermFreq("a b", 50),
+        }));
+  }
 }
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java	(revision 1405787)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java	(working copy)
@@ -94,8 +94,7 @@
   @Override
   public void build(TermFreqIterator iterator) throws IOException {
     BytesRef scratch = new BytesRef();
-    TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator,
-        BytesRef.getUTF8SortedAsUnicodeComparator());
+    TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator);
     IntsRef scratchInts = new IntsRef();
     BytesRef previous = null;
     PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
@@ -247,28 +246,26 @@
   private final class WFSTTermFreqIteratorWrapper extends SortedTermFreqIteratorWrapper {
 
-    WFSTTermFreqIteratorWrapper(TermFreqIterator source,
-        Comparator<BytesRef> comparator) throws IOException {
-      super(source, comparator, true);
+    WFSTTermFreqIteratorWrapper(TermFreqIterator source) throws IOException {
+      super(source);
     }
     
     @Override
     protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
-      if (spare.length + 5 >= buffer.length) {
-        buffer = ArrayUtil.grow(buffer, spare.length + 5);
+      if (spare.length + 4 >= buffer.length) {
+        buffer = ArrayUtil.grow(buffer, spare.length + 4);
       }
       output.reset(buffer);
       output.writeBytes(spare.bytes, spare.offset, spare.length);
-      output.writeByte((byte)0); // separator: not used, just for sort order
       output.writeInt(encodeWeight(weight));
       writer.write(buffer, 0, output.getPosition());
     }
     
    @Override
    protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
-      tmpInput.reset(scratch.bytes);
-      tmpInput.skipBytes(scratch.length - 4); // suggestion + separator
-      scratch.length -= 5; // sep + long
+      scratch.length -= 4; // int
+      // skip suggestion:
+      tmpInput.reset(scratch.bytes, scratch.offset+scratch.length, 4);
      return tmpInput.readInt();
    }
  }
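
The encode/decode change above drops the unused 0x00 separator byte: an entry is now just the suggestion bytes followed by a 4-byte big-endian cost, and decode() trims those 4 bytes off the BytesRef before reading them. A rough round-trip sketch of that layout (illustrative only: a plain int stands in for the lookup's private encodeWeight, and the class name is made up):

    import org.apache.lucene.store.ByteArrayDataInput;
    import org.apache.lucene.store.ByteArrayDataOutput;
    import org.apache.lucene.util.ArrayUtil;
    import org.apache.lucene.util.BytesRef;

    // Sketch of the entry layout written by WFSTTermFreqIteratorWrapper:
    // [suggestion bytes][4-byte cost], with no separator byte in between.
    public class WfstEntryLayoutDemo {
      public static void main(String[] args) {
        BytesRef suggestion = new BytesRef("nuke");
        int cost = 42; // the real code writes encodeWeight(weight) here

        byte[] buffer = new byte[0];
        if (suggestion.length + 4 >= buffer.length) {
          buffer = ArrayUtil.grow(buffer, suggestion.length + 4);
        }
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        output.writeBytes(suggestion.bytes, suggestion.offset, suggestion.length);
        output.writeInt(cost);

        // Decode: shrink the scratch to just the suggestion, then read the
        // trailing 4 bytes as the cost, mirroring the patched decode().
        BytesRef scratch = new BytesRef(buffer, 0, output.getPosition());
        scratch.length -= 4;
        ByteArrayDataInput input = new ByteArrayDataInput();
        input.reset(scratch.bytes, scratch.offset + scratch.length, 4);
        System.out.println(scratch.utf8ToString() + " -> " + input.readInt()); // nuke -> 42
      }
    }
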
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java	(revision 1405787)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java	(working copy)
@@ -320,6 +320,56 @@
       return new TokenStreamToAutomaton();
     }
   }
+
+  private Comparator<BytesRef> sortComparator = new Comparator<BytesRef>() {
+    private final ByteArrayDataInput readerA = new ByteArrayDataInput();
+    private final ByteArrayDataInput readerB = new ByteArrayDataInput();
+    private final BytesRef scratchA = new BytesRef();
+    private final BytesRef scratchB = new BytesRef();
+
+    @Override
+    public int compare(BytesRef a, BytesRef b) {
+
+      // First by analyzed form:
+      readerA.reset(a.bytes, a.offset, a.length);
+      scratchA.length = readerA.readShort();
+      scratchA.bytes = a.bytes;
+      scratchA.offset = readerA.getPosition();
+
+      readerB.reset(b.bytes, b.offset, b.length);
+      scratchB.bytes = b.bytes;
+      scratchB.length = readerB.readShort();
+      scratchB.offset = readerB.getPosition();
+
+      int cmp = scratchA.compareTo(scratchB);
+      if (cmp != 0) {
+        return cmp;
+      }
+
+      // Next by cost:
+      long aCost = readerA.readInt();
+      long bCost = readerB.readInt();
+
+      if (aCost < bCost) {
+        return -1;
+      } else if (aCost > bCost) {
+        return 1;
+      }
+
+      // Finally by surface form:
+      scratchA.offset = readerA.getPosition();
+      scratchA.length = a.length - scratchA.offset;
+      scratchB.offset = readerB.getPosition();
+      scratchB.length = b.length - scratchB.offset;
+
+      cmp = scratchA.compareTo(scratchB);
+      if (cmp != 0) {
+        return cmp;
+      }
+
+      return 0;
+    }
+  };
 
   @Override
   public void build(TermFreqIterator iterator) throws IOException {
@@ -350,42 +400,43 @@
         Util.toBytesRef(path, scratch);
 
         // length of the analyzed text (FST input)
+        if (scratch.length > Short.MAX_VALUE-2) {
+          throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length + ")");
+        }
         short analyzedLength = (short) scratch.length;
+
         // compute the required length:
-        // analyzed sequence + 12 (separator) + weight (4) + surface + analyzedLength (short)
-        int requiredLength = analyzedLength + 2 + 4 + surfaceForm.length + 2;
+        // analyzed sequence + weight (4) + surface + analyzedLength (short)
+        int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
         buffer = ArrayUtil.grow(buffer, requiredLength);
 
         output.reset(buffer);
+
+        output.writeShort(analyzedLength);
+
         output.writeBytes(scratch.bytes, scratch.offset, scratch.length);
-        output.writeByte((byte)0); // separator: not used, just for sort order
-        output.writeByte((byte)0); // separator: not used, just for sort order
-
-        // NOTE: important that writeInt is big-endian,
-        // because this means we sort secondarily by
-        // cost ascending (= weight descending) so that
-        // when we discard too many surface forms for a
-        // single analyzed form we are discarding the
-        // least weight ones:
         output.writeInt(encodeWeight(iterator.weight()));
+
         output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
-        output.writeShort(analyzedLength);
+
+        assert output.getPosition() == requiredLength: output.getPosition() + " vs " + requiredLength;
+
         writer.write(buffer, 0, output.getPosition());
       }
     }
     writer.close();
 
     // Sort all input/output pairs (required by FST.Builder):
-    new Sort().sort(tempInput, tempSorted);
+    new Sort(sortComparator).sort(tempInput, tempSorted);
     reader = new Sort.ByteSequencesReader(tempSorted);
 
     PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton());
     Builder<Pair<Long,BytesRef>> builder = new Builder<Pair<Long,BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
 
     // Build FST:
-    BytesRef previous = null;
+    BytesRef previousAnalyzed = null;
     BytesRef analyzed = new BytesRef();
     BytesRef surface = new BytesRef();
     IntsRef scratchInts = new IntsRef();
@@ -394,24 +445,21 @@
     int dedup = 0;
     while (reader.read(scratch)) {
       input.reset(scratch.bytes, scratch.offset, scratch.length);
-      input.setPosition(input.length()-2);
       short analyzedLength = input.readShort();
+      analyzed.grow(analyzedLength+2);
+      input.readBytes(analyzed.bytes, 0, analyzedLength);
+      analyzed.length = analyzedLength;
 
-      analyzed.bytes = scratch.bytes;
-      analyzed.offset = scratch.offset;
-      analyzed.length = analyzedLength;
-      
-      input.setPosition(analyzedLength + 2); // analyzed sequence + separator
       long cost = input.readInt();
-      
+
       surface.bytes = scratch.bytes;
       surface.offset = input.getPosition();
-      surface.length = input.length() - input.getPosition() - 2;
-      
-      if (previous == null) {
-        previous = new BytesRef();
-        previous.copyBytes(analyzed);
-      } else if (analyzed.equals(previous)) {
+      surface.length = scratch.length - surface.offset;
+
+      if (previousAnalyzed == null) {
+        previousAnalyzed = new BytesRef();
+        previousAnalyzed.copyBytes(analyzed);
+      } else if (analyzed.equals(previousAnalyzed)) {
         dedup++;
         if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
           // More than maxSurfaceFormsPerAnalyzedForm
@@ -420,11 +468,9 @@
         }
       } else {
         dedup = 0;
-        previous.copyBytes(analyzed);
+        previousAnalyzed.copyBytes(analyzed);
       }
 
-      analyzed.grow(analyzed.length+2);
-
       // TODO: I think we can avoid the extra 2 bytes when
       // there is no dup (dedup==0), but we'd have to fix
       // the exactFirst logic ... which would be sort of
@@ -433,8 +479,8 @@
 
       // NOTE: must be byte 0 so we sort before whatever
       // is next
-      analyzed.bytes[analyzed.length] = 0;
-      analyzed.bytes[analyzed.length+1] = (byte) dedup;
+      analyzed.bytes[analyzed.offset+analyzed.length] = 0;
+      analyzed.bytes[analyzed.offset+analyzed.length+1] = (byte) dedup;
       analyzed.length += 2;
 
       Util.toIntsRef(analyzed, scratchInts);
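
To restate the new sort-record layout used by build() above: each entry written to the offline sorter is now (short analyzedLength | analyzed bytes | int cost | surface bytes), and sortComparator orders entries by analyzed form, then cost, then surface form. A hedged sketch of writing and re-reading one such record outside the suggester (names and values are illustrative; the real code encodes the cost via encodeWeight and feeds the records through Sort):

    import org.apache.lucene.store.ByteArrayDataInput;
    import org.apache.lucene.store.ByteArrayDataOutput;
    import org.apache.lucene.util.ArrayUtil;
    import org.apache.lucene.util.BytesRef;

    // Sketch of the sort record layout in AnalyzingSuggester.build():
    // short analyzedLength | analyzed bytes | int cost | surface bytes
    public class AnalyzedRecordLayoutDemo {
      public static void main(String[] args) {
        BytesRef analyzed = new BytesRef("foo bar");  // analyzed form (FST input)
        BytesRef surface = new BytesRef("Foo Bar!");  // original surface form
        int cost = 7;                                 // real code: encodeWeight(weight)

        int requiredLength = analyzed.length + 4 + surface.length + 2;
        byte[] buffer = ArrayUtil.grow(new byte[0], requiredLength);
        ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
        out.writeShort((short) analyzed.length);
        out.writeBytes(analyzed.bytes, analyzed.offset, analyzed.length);
        out.writeInt(cost);
        out.writeBytes(surface.bytes, surface.offset, surface.length);

        // Reading it back mirrors the reader loop in build():
        ByteArrayDataInput in = new ByteArrayDataInput();
        in.reset(buffer, 0, out.getPosition());
        short analyzedLength = in.readShort();
        BytesRef analyzedCopy = new BytesRef(analyzedLength);
        in.readBytes(analyzedCopy.bytes, 0, analyzedLength);
        analyzedCopy.length = analyzedLength;
        long readCost = in.readInt();
        BytesRef surfaceCopy = new BytesRef(buffer, in.getPosition(), out.getPosition() - in.getPosition());

        System.out.println(analyzedCopy.utf8ToString() + " / " + readCost + " / " + surfaceCopy.utf8ToString());
      }
    }

Putting the analyzed length up front lets sortComparator find the field boundaries without a 0x00 separator, which is what allows analyzed forms that themselves contain 0x0 bytes.
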
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java	(revision 1405787)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java	(working copy)
@@ -41,31 +41,36 @@
   private File tempInput;
   private File tempSorted;
   private final ByteSequencesReader reader;
+  private final Comparator<BytesRef> comparator;
   private boolean done = false;
   
   private long weight;
   private final BytesRef scratch = new BytesRef();
-  private final Comparator<BytesRef> comparator;
   
-  /**
-   * Calls {@link #SortedTermFreqIteratorWrapper(TermFreqIterator, Comparator, boolean)
-   * SortedTermFreqIteratorWrapper(source, comparator, false)}
-   */
-  public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator) throws IOException {
-    this(source, comparator, false);
+  /**
+   * Creates a new sorted wrapper, using {@link
+   * BytesRef#getUTF8SortedAsUnicodeComparator} for
+   * sorting. */
+  public SortedTermFreqIteratorWrapper(TermFreqIterator source) throws IOException {
+    this(source, BytesRef.getUTF8SortedAsUnicodeComparator());
  }
-  
+
  /**
-   * Creates a new sorted wrapper. if compareRawBytes is true, then
-   * only the bytes (not the weight) will be used for comparison.
+   * Creates a new sorted wrapper, sorting by BytesRef
+   * (ascending) then cost (ascending).
   */
-  public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator, boolean compareRawBytes) throws IOException {
+  public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator) throws IOException {
    this.source = source;
    this.comparator = comparator;
-    this.reader = sort(compareRawBytes ? comparator : new BytesOnlyComparator(this.comparator));
+    this.reader = sort();
  }
  
  @Override
+  public Comparator<BytesRef> getComparator() {
+    return comparator;
+  }
+  
+  @Override
  public BytesRef next() throws IOException {
    boolean success = false;
    if (done) {
@@ -90,16 +95,43 @@
  }
  
  @Override
-  public Comparator<BytesRef> getComparator() {
-    return comparator;
-  }
-  
-  @Override
  public long weight() {
    return weight;
  }
+
+  /** Sorts by BytesRef (ascending) then cost (ascending). */
+  private final Comparator<BytesRef> tieBreakByCostComparator = new Comparator<BytesRef>() {
+
+    private final BytesRef leftScratch = new BytesRef();
+    private final BytesRef rightScratch = new BytesRef();
+    private final ByteArrayDataInput input = new ByteArrayDataInput();
+    
+    @Override
+    public int compare(BytesRef left, BytesRef right) {
+      // Make shallow copy in case decode changes the BytesRef:
+      leftScratch.bytes = left.bytes;
+      leftScratch.offset = left.offset;
+      leftScratch.length = left.length;
+      rightScratch.bytes = right.bytes;
+      rightScratch.offset = right.offset;
+      rightScratch.length = right.length;
+      long leftCost = decode(leftScratch, input);
+      long rightCost = decode(rightScratch, input);
+      int cmp = comparator.compare(leftScratch, rightScratch);
+      if (cmp != 0) {
+        return cmp;
+      }
+      if (leftCost < rightCost) {
+        return -1;
+      } else if (rightCost < leftCost) {
+        return 1;
+      } else {
+        return 0;
+      }
+    }
+  };
 
-  private Sort.ByteSequencesReader sort(Comparator<BytesRef> comparator) throws IOException {
+  private Sort.ByteSequencesReader sort() throws IOException {
     String prefix = getClass().getSimpleName();
     File directory = Sort.defaultTempDir();
     tempInput = File.createTempFile(prefix, ".input", directory);
@@ -116,7 +148,7 @@
         encode(writer, output, buffer, spare, source.weight());
       }
       writer.close();
-      new Sort(comparator).sort(tempInput, tempSorted);
+      new Sort(tieBreakByCostComparator).sort(tempInput, tempSorted);
       ByteSequencesReader reader = new Sort.ByteSequencesReader(tempSorted);
       success = true;
       return reader;
@@ -131,7 +163,6 @@
         close();
       }
     }
-    
   }
   
 }
@@ -145,31 +176,6 @@
     }
   }
   
-  private final static class BytesOnlyComparator implements Comparator<BytesRef> {
-  
-    final Comparator<BytesRef> other;
-    private final BytesRef leftScratch = new BytesRef();
-    private final BytesRef rightScratch = new BytesRef();
-    
-    public BytesOnlyComparator(Comparator<BytesRef> other) {
-      this.other = other;
-    }
-
-    @Override
-    public int compare(BytesRef left, BytesRef right) {
-      wrap(leftScratch, left);
-      wrap(rightScratch, right);
-      return other.compare(leftScratch, rightScratch);
-    }
-    
-    private void wrap(BytesRef wrapper, BytesRef source) {
-      wrapper.bytes = source.bytes;
-      wrapper.offset = source.offset;
-      wrapper.length = source.length - 8;
-      
-    }
-  }
-  
   /** encodes an entry (bytes+weight) to the provided writer */
   protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
     if (spare.length + 8 >= buffer.length) {
@@ -184,9 +190,8 @@
   /** decodes the weight at the current position */
   protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
     tmpInput.reset(scratch.bytes);
-    tmpInput.skipBytes(scratch.length - 8); // suggestion + separator
-    scratch.length -= 8; // sep + long
+    tmpInput.skipBytes(scratch.length - 8); // suggestion
+    scratch.length -= 8; // long
     return tmpInput.readLong();
   }
-  
 }
Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt	(revision 1405787)
+++ lucene/CHANGES.txt	(working copy)
@@ -122,6 +122,9 @@
 * LUCENE-4513: Fixed that deleted nested docs are scored into the parent doc
   when using ToParentBlockJoinQuery. (Martijn van Groningen)
 
+* LUCENE-4534: Fixed WFSTCompletionLookup and Analyzing/FuzzySuggester
+  to allow 0 byte values in the lookup keys. (Mike McCandless)
+
 Optimizations
 
 * LUCENE-4512: Additional memory savings for CompressingStoredFieldsFormat.
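
For reference, the net effect of the SortedTermFreqIteratorWrapper change on callers: the wrapper now defaults to BytesRef.getUTF8SortedAsUnicodeComparator() and always breaks ties by cost, so the old compareRawBytes flag is gone. A hedged usage sketch against the patched API (the demo class and printed ordering are illustrative; TermFreq and TermFreqArrayIterator are the same helpers used in the tests above):

    import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
    import org.apache.lucene.search.suggest.TermFreq;
    import org.apache.lucene.search.suggest.TermFreqArrayIterator;
    import org.apache.lucene.search.suggest.TermFreqIterator;
    import org.apache.lucene.util.BytesRef;

    // Sketch: iterate unsorted (key, weight) pairs in sorted order using the
    // patched single-argument constructor.
    public class SortedWrapperDemo {
      public static void main(String[] args) throws Exception {
        TermFreq[] unsorted = new TermFreq[] {
          new TermFreq("foo", 50),
          new TermFreq("bar", 10),
          new TermFreq("barbar", 12),
        };

        // Sorts by BytesRef (UTF-8 as unicode order), breaking ties by cost:
        TermFreqIterator sorted =
            new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted));

        BytesRef key;
        while ((key = sorted.next()) != null) {
          System.out.println(key.utf8ToString() + " " + sorted.weight()); // bar 10, barbar 12, foo 50
        }
      }
    }
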