diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/values/Bytes.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/values/Bytes.java index 9616e6e..2c73da8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/values/Bytes.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/values/Bytes.java @@ -141,7 +141,9 @@ public final class Bytes { } else if (mode == Mode.DEREF) { return new FixedDerefBytesImpl.Writer(dir, id, bytesUsed, context); } else if (mode == Mode.SORTED) { - return new FixedSortedBytesImpl.Writer(dir, id, sortComparator, bytesUsed, context, acceptableOverheadRatio); + + return new FSTFixedSortedBytesImpl.Writer(dir, id, sortComparator, bytesUsed, context, acceptableOverheadRatio, true); + //return new FixedSortedBytesImpl.Writer(dir, id, sortComparator, bytesUsed, context, acceptableOverheadRatio); } } else { if (mode == Mode.STRAIGHT) { @@ -149,7 +151,8 @@ public final class Bytes { } else if (mode == Mode.DEREF) { return new VarDerefBytesImpl.Writer(dir, id, bytesUsed, context); } else if (mode == Mode.SORTED) { - return new VarSortedBytesImpl.Writer(dir, id, sortComparator, bytesUsed, context, acceptableOverheadRatio); + return new FSTFixedSortedBytesImpl.Writer(dir, id, sortComparator, bytesUsed, context, acceptableOverheadRatio, false); +// return new VarSortedBytesImpl.Writer(dir, id, sortComparator, bytesUsed, context, acceptableOverheadRatio); } } @@ -192,7 +195,9 @@ public final class Bytes { } else if (mode == Mode.DEREF) { return new FixedDerefBytesImpl.FixedDerefReader(dir, id, maxDoc, context); } else if (mode == Mode.SORTED) { - return new FixedSortedBytesImpl.Reader(dir, id, maxDoc, context, Type.BYTES_FIXED_SORTED, sortComparator); + return new FSTFixedSortedBytesImpl.Reader(dir, id, maxDoc, context, Type.BYTES_FIXED_SORTED, sortComparator); + //return new FixedSortedBytesImpl.Reader(dir, id, maxDoc, context, Type.BYTES_FIXED_SORTED, sortComparator); + } } else { if (mode == 
package org.apache.lucene.codecs.lucene40.values;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.codecs.lucene40.values.Bytes.BytesReaderBase;
import org.apache.lucene.codecs.lucene40.values.Bytes.DerefBytesWriterBase;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValues.SortedSource;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SortedBytesMergeUtils;
import org.apache.lucene.index.SortedBytesMergeUtils.MergeContext;
import org.apache.lucene.index.SortedBytesMergeUtils.SortedSourceSlice;
import org.apache.lucene.index.SortedBytesMergeUtils.ToFSTBytesRefConsumer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Sorted byte[] DocValues (fixed- or variable-length) that stores the sorted
 * value dictionary as an FST mapping each value to its ord, plus a
 * {@link PackedInts} docID-to-ord index. Handles both
 * {@link Type#BYTES_FIXED_SORTED} and {@link Type#BYTES_VAR_SORTED}, selected
 * by the {@code fixed} flag at write time and the {@code type} at read time.
 *
 * @lucene.experimental
 */
class FSTFixedSortedBytesImpl {

  static final String CODEC_NAME_IDX = "FSTFixedSortedBytesIdx";
  static final String CODEC_NAME_DAT = "FSTFixedSortedBytesDat";
  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;

  static final class Writer extends DerefBytesWriterBase {
    private final Comparator<BytesRef> comp;
    // Actual type being written; BYTES_VAR_SORTED disables per-value size checks.
    private final Type type;

    /**
     * @param fixed true for {@link Type#BYTES_FIXED_SORTED} (every value must
     *              have the same length), false for {@link Type#BYTES_VAR_SORTED}
     */
    public Writer(Directory dir, String id, Comparator<BytesRef> comp,
        Counter bytesUsed, IOContext context, float acceptableOverheadRatio,
        boolean fixed) throws IOException {
      // NOTE(review): super is always told BYTES_FIXED_SORTED even when
      // fixed == false — confirm DerefBytesWriterBase uses this only for
      // bookkeeping and not for on-disk format decisions.
      super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_CURRENT, bytesUsed,
          context, acceptableOverheadRatio, Type.BYTES_FIXED_SORTED);
      this.comp = comp;
      if (fixed) {
        type = Type.BYTES_FIXED_SORTED;
      } else {
        size = 0; // variable-length: no fixed per-value size is enforced
        type = Type.BYTES_VAR_SORTED;
      }
    }

    /**
     * Merges already-sorted per-segment values: streams merged values into an
     * FST (data file), then writes max ord plus the packed docID-to-ord map
     * (index file).
     */
    @Override
    public void merge(MergeState mergeState, DocValues[] docValues)
        throws IOException {
      boolean success = false;
      try {
        final MergeContext ctx = SortedBytesMergeUtils.init(type, docValues,
            comp, mergeState.segmentInfo.getDocCount());
        final List<SortedSourceSlice> slices = SortedBytesMergeUtils.buildSlices(
            mergeState.docBase, mergeState.docMaps, docValues, ctx);
        final IndexOutput datOut = getOrCreateDataOut();
        datOut.writeInt(ctx.sizePerValues);
        final int maxOrd = SortedBytesMergeUtils.mergeRecords(ctx,
            new ToFSTBytesRefConsumer(datOut, acceptableOverheadRatio), slices);

        final IndexOutput idxOut = getOrCreateIndexOut();
        idxOut.writeInt(maxOrd);
        final PackedInts.Writer ordsWriter = PackedInts.getWriter(idxOut,
            ctx.docToEntry.length, PackedInts.bitsRequired(maxOrd),
            PackedInts.DEFAULT);
        for (SortedSourceSlice slice : slices) {
          slice.writeOrds(ordsWriter);
        }
        ordsWriter.finish();
        success = true;
      } finally {
        releaseResources();
        if (success) {
          IOUtils.close(getIndexOut(), getDataOut());
        } else {
          IOUtils.closeWhileHandlingException(getIndexOut(), getDataOut());
        }
      }
    }

    @Override
    protected void checkSize(BytesRef bytes) {
      // Only fixed-length values have a size invariant to enforce.
      if (type == Type.BYTES_FIXED_SORTED) {
        super.checkSize(bytes);
      }
    }

    /**
     * Writes the final files. Important that we get docCount, in case there
     * were some last docs that we didn't see.
     */
    @Override
    public void finishInternal(int docCount) throws IOException {
      fillDefault(docCount);
      final IndexOutput datOut = getOrCreateDataOut();
      final int count = hash.size();
      final int[] address = new int[count];
      // -1 in the data file marks variable-length values.
      datOut.writeInt(type == Type.BYTES_VAR_SORTED ? -1 : size);
      final ToFSTBytesRefConsumer consumer =
          new ToFSTBytesRefConsumer(datOut, acceptableOverheadRatio);
      final int[] sortedEntries = hash.sort(comp);
      // Feed values in sorted order into the FST, recording each entry's ord.
      final BytesRef spare = new BytesRef(size);
      for (int i = 0; i < count; i++) {
        final int e = sortedEntries[i];
        final BytesRef bytes = hash.get(e, spare);
        assert type == Type.BYTES_VAR_SORTED || bytes.length == size
            : bytes.length + " " + size + " " + type;
        consumer.consume(bytes, i, -1);
        address[e] = i;
      }
      consumer.flush();
      final IndexOutput idxOut = getOrCreateIndexOut();
      idxOut.writeInt(count);
      writeIndex(idxOut, docCount, count, address, docToEntry);
    }
  }

  static final class Reader extends BytesReaderBase {
    private final int size;       // fixed per-value size, or -1 for var-length
    private final int valueCount; // number of distinct sorted values
    private final Comparator<BytesRef> comparator;
    private final Type type;      // BYTES_FIXED_SORTED or BYTES_VAR_SORTED

    public Reader(Directory dir, String id, int maxDoc, IOContext context,
        Type type, Comparator<BytesRef> comparator) throws IOException {
      super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_START, true,
          context, type);
      size = datIn.readInt();
      valueCount = idxIn.readInt();
      this.comparator = comparator;
      this.type = type;
    }

    @Override
    public Source load() throws IOException {
      // Pass the reader's actual type through so a BYTES_VAR_SORTED reader
      // does not report itself as BYTES_FIXED_SORTED.
      return new FixedSortedSource(cloneData(), cloneIndex(), type, valueCount,
          comparator);
    }

    @Override
    public Source getDirectSource() throws IOException {
      // TODO (was nocommit): no true direct source yet; falls back to the
      // fully-loaded source.
      return this.getSource();
    }

    @Override
    public int getValueSize() {
      return size;
    }
  }

  /**
   * In-memory {@link SortedSource} backed by an FST (ord lookup via
   * {@link Util#getByOutput}) and a packed docID-to-ord index. Despite the
   * name it serves both fixed- and variable-length values.
   */
  static final class FixedSortedSource extends SortedSource {
    private final int valueCount;
    private final PackedInts.Reader docToOrdIndex;
    private final FST<Long> fst;

    FixedSortedSource(IndexInput datIn, IndexInput idxIn, Type type,
        int numValues, Comparator<BytesRef> comp) throws IOException {
      super(type, comp);
      docToOrdIndex = PackedInts.getReader(idxIn);
      fst = new FST<Long>(datIn, PositiveIntOutputs.getSingleton(true));
      this.valueCount = numValues;
      // Everything is loaded into RAM; the inputs can be closed now.
      IOUtils.close(datIn, idxIn);
    }

    @Override
    public int getValueCount() {
      return valueCount;
    }

    @Override
    public boolean hasPackedDocToOrd() {
      return true;
    }

    @Override
    public PackedInts.Reader getDocToOrd() {
      return docToOrdIndex;
    }

    @Override
    public int ord(int docID) {
      assert docToOrdIndex.get(docID) < getValueCount();
      return (int) docToOrdIndex.get(docID);
    }

    @Override
    public BytesRef getByOrd(int ord, BytesRef bytesRef) {
      try {
        // Reverse lookup: walk the FST for the path whose output equals ord.
        final IntsRef ic = Util.getByOutput(fst, ord);
        assert ic != null : "ord=" + ord;
        assert bytesRef != null;
        return Util.toBytesRef(ic, bytesRef);
      } catch (IOException e) {
        // FST is in RAM; an IOException here indicates corruption, not I/O.
        throw new RuntimeException(e);
      }
    }
  }

}
org.apache.lucene.util.fst.FST; /** * @lucene.internal @@ -164,6 +169,7 @@ public final class SortedBytesMergeUtils { consumer.consume(currentMergedBytes, merger.currentOrd, offset); merger.pushTop(); } + consumer.flush(); ctx.offsets = offsets; assert offsets == null || offsets[merger.currentOrd - 1] == offset; return merger.currentOrd; @@ -171,6 +177,7 @@ public final class SortedBytesMergeUtils { public static interface BytesRefConsumer { public void consume(BytesRef ref, int ord, long offset) throws IOException; + public void flush() throws IOException; } public static final class IndexOutputBytesRefConsumer implements BytesRefConsumer { @@ -185,7 +192,40 @@ public final class SortedBytesMergeUtils { datOut.writeBytes(currentMergedBytes.bytes, currentMergedBytes.offset, currentMergedBytes.length); } + + @Override + public void flush() {} } + + public static final class ToFSTBytesRefConsumer implements BytesRefConsumer { + private final IndexOutput datOut; + private final PositiveIntOutputs fstOutputs; + private final Builder builder; + private final float acceptableOverheadRatio; + public ToFSTBytesRefConsumer(IndexOutput datOut, float acceptableOverheadRatio) { + this.datOut = datOut; + this.fstOutputs = PositiveIntOutputs.getSingleton(true); + builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, fstOutputs, null, true); + this.acceptableOverheadRatio = acceptableOverheadRatio; + } + + @Override + public void consume(BytesRef currentMergedBytes, int ord, long offset) throws IOException { + final IntsRef scratchIntsRef = new IntsRef(); + builder.add(Util.toIntsRef(currentMergedBytes, scratchIntsRef), (long) ord); + + } + + @Override + public void flush() throws IOException { + FST fst = builder.finish(); + //nocommit - are those values ok? + FST packed = fst.pack(3, 1000000, acceptableOverheadRatio); + packed.save(datOut); + + } + } + private static final class RecordMerger { private final MergeQueue queue;