Index: src/test/org/apache/lucene/index/values/TestIndexValues.java
===================================================================
--- src/test/org/apache/lucene/index/values/TestIndexValues.java	(revision 0)
+++ src/test/org/apache/lucene/index/values/TestIndexValues.java	(revision 0)
@@ -0,0 +1,209 @@
+package org.apache.lucene.index.values;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Random;
+
+import org.apache.lucene.util.*;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.store.*;
+import org.apache.lucene.index.*;
+import org.apache.lucene.document.*;
+
+public class TestIndexValues extends LuceneTestCase {
+
+  // nocommit -- for sorted test, do our own Sort of the
+  // values and verify it's identical
+  public void testBytes() throws IOException {
+    final Random rand = newRandom();
+    final boolean SIMPLE_ASCII = true;
+
+    for(int t=0;t<6;t++) {
+      // System.out.println("\nTEST t=" + t);
+      final BytesValues.Mode mode;
+      if (t%3 == 0) {
+        mode = BytesValues.Mode.STRAIGHT;
+      } else if (t%3 == 1) {
+        mode = BytesValues.Mode.DEREF;
+      } else {
+        mode = BytesValues.Mode.SORTED;
+      }
+
+      final boolean fixedSize;
+      if (t/3 == 0) {
+        fixedSize = true;
+      } else {
+        fixedSize = false;
+      }
+
+      final BytesRef.Comparator comp = mode == BytesValues.Mode.SORTED ?
+        BytesRef.getUTF8SortedAsUTF16Comparator() : null;
+
+      Directory dir = new MockRAMDirectory();
+      BytesValues.Writer w = BytesValues.Writer.create(dir, "test",
+                                                       mode,
+                                                       comp,
+                                                       fixedSize);
+      Field f = new Field("abc", "", Field.Store.NO, Field.Index.ANALYZED);
+
+      final String[] values = new String[220];
+      final int lenMin, lenMax;
+      if (fixedSize) {
+        lenMin = lenMax = 6;
+      } else {
+        lenMin = 1;
+        lenMax = 20;
+      }
+      for(int i=0;i<100;i++) {
+        final String s;
+        if (i > 0 && rand.nextInt(5) <= 2) {
+          // use a prior value
+          s = values[2*rand.nextInt(i)];
+        } else {
+          s = new String(_TestUtil.getRandomText(rand, lenMin, lenMax, true, SIMPLE_ASCII || fixedSize));
+        }
+        f.setValue(s);
+        values[2*i] = s;
+        w.add(2*i, f);
+      }
+      w.finish(220);
+
+      BytesValues.Reader r = BytesValues.Reader.create(dir, "test", mode, comp, fixedSize, 220);
+
+      // Verify we can load the source twice:
+      for(int iter=0;iter<2;iter++) {
+        Values.BytesSource s = r.load();
+        for(int i=0;i<100;i++) {
+          final int idx = 2*i;
+          assertNotNull("doc " + idx + "; value=" + values[idx] + " t=" + t, s.get(idx));
+          assertEquals("doc " + idx, values[idx], s.get(idx).utf8ToString());
+          if (mode == BytesValues.Mode.SORTED) {
+            Values.SortedBytesSource ss = (Values.SortedBytesSource) s;
+            assertEquals("doc " + idx, values[idx], ss.getByOrd(ss.ord(idx)).utf8ToString());
+
+            Values.SortedBytesSource.LookupResult result = ss.getByValue(new BytesRef(values[idx]));
+            assertTrue(result.found);
+            assertEquals(ss.ord(idx), result.ord);
+          }
+        }
+
+        // Look up random strings:
+        if (mode == BytesValues.Mode.SORTED) {
+          Values.SortedBytesSource ss = (Values.SortedBytesSource) s;
+          BytesRef b2 = ss.getByOrd(1);
+          final String first = new String(b2.bytes, b2.offset, b2.length, "UTF-8");
+          final int numValues = ss.getValueCount();
+          b2 = ss.getByOrd(numValues);
+          final String last = new String(b2.bytes, b2.offset, b2.length, "UTF-8");
+          for(int i=0;i<100;i++) {
+            final String s2 = new String(_TestUtil.getRandomText(rand, lenMin, lenMax, true, SIMPLE_ASCII || fixedSize));
+            BytesRef b = new BytesRef(s2);
+            Values.SortedBytesSource.LookupResult result = ss.getByValue(b);
+            if (result.found) {
+              assert result.ord > 0;
+              assertTrue(b.bytesEquals(ss.getByOrd(result.ord)));
+              int count = 0;
+              for(int k=0;k<100;k++) {
+                if (s2.equals(values[2*k])) {
+                  assertEquals(ss.ord(2*k), result.ord);
+                  count++;
+                }
+              }
+              assertTrue(count > 0);
+            } else {
+              assert result.ord >= 0;
+              if (result.ord == 0) {
+                // random string sorts before our first value
+                assertTrue(first.compareTo(s2) > 0);
+              } else if (result.ord == numValues) {
+                // random string sorts after our last value
+                assertTrue(last.compareTo(s2) < 0);
+              } else {
+                // random string fell between two of our
+                // values
+                b2 = ss.getByOrd(result.ord);
+                final String s3 = new String(b2.bytes, b2.offset, b2.length, "UTF-8");
+                b2 = ss.getByOrd(result.ord+1);
+                final String s4 = new String(b2.bytes, b2.offset, b2.length, "UTF-8");
+                assertTrue(s3.compareTo(s2) < 0);
+                assertTrue(s2.compareTo(s4) < 0);
+              }
+            }
+          }
+        }
+      }
+
+      r.close();
+      dir.close();
+    }
+  }
+
+  public void testInts() throws IOException {
+    Directory dir = new MockRAMDirectory();
+    IntValues.Writer w = new IntValues.Writer(dir, "test");
+    NumericField f = new NumericField("abc");
+    final int NUM_VALUES = 1000;
+    final long[] values = new long[NUM_VALUES];
+    final Random rand = newRandom();
+    for(int i=0;i<NUM_VALUES;i++) {
[... loop body and the remainder of testInts / TestIndexValues.java not recovered in this excerpt ...]
Index: src/java/org/apache/lucene/index/values/Values.java
===================================================================
--- src/java/org/apache/lucene/index/values/Values.java	(revision 0)
+++ src/java/org/apache/lucene/index/values/Values.java	(revision 0)
[... license header, imports, and the opening of class Values (including the start of the abstract Writer) not recovered in this excerpt ...]
+    public abstract void files(Collection<String> files) throws IOException;
+    public abstract void finish(int docCount) throws IOException;
+    public abstract long ramBytesUsed() throws IOException;
+  }
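[Editor's sketch -- not part of the patch. The Writer contract above is easy to misread: add() must be called with increasing docIDs, and finish(docCount) must be passed the segment's true doc count so that docs never added still get default values. A minimal driver against this API; the class name and field values are illustrative, and it assumes the same package as the patch (IntValues.Writer's constructor is protected):]

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Collection;
    import org.apache.lucene.document.NumericField;
    import org.apache.lucene.store.Directory;

    class WriterLifecycleSketch {
      // Adds values for docs 0 and 2 of a 5-doc segment; docs 1, 3
      // and 4 are never added, so finish() fills them with defaults.
      static void write(Directory dir) throws IOException {
        IntValues.Writer w = new IntValues.Writer(dir, "demo");
        NumericField f = new NumericField("price");
        f.setLongValue(42L);
        w.add(0, f);
        f.setLongValue(7L);
        w.add(2, f);   // docID 1 intentionally skipped
        w.finish(5);   // pass the segment's docCount, not the add() count
        Collection<String> files = new ArrayList<String>();
        w.files(files);  // e.g. "demo.dat" -- register with the segment
      }
    }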
+
+  /** Base class for reading values from the index.  The
+   *  Reader will hold open any necessary files, but will
+   *  not load anything until you call load().  You can
+   *  call load() more than once, eg if you discarded the
+   *  previously loaded Source. */
+  public abstract static class Reader implements Closeable {
+    public abstract void close() throws IOException;
+
+    /** Loads the actual values.  You may call this more
+     *  than once, eg if you already previously loaded but
+     *  then discarded the Source. */
+    public abstract Source load() throws IOException;
+
+    // nocommit -- add DocsEnum iterator() to this API with flex
+  }
+
+  /** Base class for a source of values. */
+  public abstract static class Source {
+    // nocommit: close?
+    public abstract long ramBytesUsed() throws IOException;
+  }
+
+  /** Source of byte[], per document.  This can only be used
+   *  for retrieving the byte[] for each document; if you
+   *  need to sort, or efficiently compare byte[] according
+   *  to a {@link Comparator}, you should use
+   *  SortedBytesSource instead.
+   *
+   *

+   * <p>Create a writer via {@link BytesValues.Writer#create} and a
+   * reader via {@link BytesValues.Reader#create}; calling load() on
+   * the reader then returns the BytesSource. */
+  public static abstract class BytesSource extends Source {
+
+    /** Returns the value for the specified docID.  The returned
+     *  BytesRef may be reused across calls.  If this docID
+     *  was never added to the Writer, the result is null if
+     *  the source is a {@link SortedBytesSource}, else the
+     *  result is undefined. */
+    public abstract BytesRef get(int docID) throws IOException;
+  }
+
+  public static abstract class SortedBytesSource extends BytesSource {
+
+    @Override
+    public BytesRef get(int docID) throws IOException {
+      return getByOrd(ord(docID));
+    }
+
+    /** Returns the ord for the specified docID.  If this docID
+     *  was never added to the Writer, the ord is 0.  Ords are
+     *  dense: real values get ords starting at 1, incrementing
+     *  by 1 in the order defined by the {@link Comparator}. */
+    public abstract int ord(int docID) throws IOException;
+
+    /** Returns the value for the specified ord. */
+    public abstract BytesRef getByOrd(int ord) throws IOException;
+
+    public static class LookupResult {
+      public boolean found;
+      public int ord;
+    }
+
+    /** Finds the largest ord whose value is <= the
+     *  requested value.  If {@link LookupResult#found} is
+     *  true, then ord is an exact match.  The returned
+     *  {@link LookupResult} may be reused across calls. */
+    public abstract LookupResult getByValue(BytesRef value) throws IOException;
+
+    // nocommit -- not great that this method is here,
+    // because the deref impls can also return the unique
+    // value count (but, are not sorted)
+    /** Returns the number of unique values. */
+    public abstract int getValueCount() throws IOException;
+  }
+
+  /** Source of integers (returned as java long), per
+   *  document.  The underlying implementation may use
+   *  different numbers of bits per value. */
+  public static abstract class IntSource extends Source {
+    public abstract long get(int docID) throws IOException;
+  }
+
+  /** Source of floating point values (returned as java double),
+   *  per document.  The underlying implementation may use 4
+   *  or 8 bytes per value. */
+  public static abstract class FloatSource extends Source {
+    public abstract double get(int docID) throws IOException;
+  }
+}
Index: src/java/org/apache/lucene/index/values/VarSortedBytesValues.java
===================================================================
--- src/java/org/apache/lucene/index/values/VarSortedBytesValues.java	(revision 0)
+++ src/java/org/apache/lucene/index/values/VarSortedBytesValues.java	(revision 0)
@@ -0,0 +1,230 @@
+package org.apache.lucene.index.values;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesHash;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PackedInts;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.CodecUtil;
+import java.io.IOException;
+
+import java.util.Collection;
+
+// Stores variable-length byte[] by deref (when two docs
+// have the same value, they store only 1 byte[] and both
+// docs reference that single copy), keeping the unique
+// values in sorted order so ords can be used for lookups
+
+class VarSortedBytesValues extends BytesValues {
+
+  static final String CODEC_NAME = "VarSortedBytesValues";
+  static final int VERSION_START = 0;
+  static final int VERSION_CURRENT = VERSION_START;
+
+  static class Writer extends BytesValues.Writer {
+    private Entry[] docToEntry;
+    private final BytesRef.Comparator comp;
+
+    final class Entry extends BytesHash.Entry {
+      int index;
+      long offset;
+    }
+
+    private final BytesHash<Entry> hash = new BytesHash<Entry>(Entry.class) {
+      @Override
+      protected VarSortedBytesValues.Writer.Entry newEntry() {
+        return new VarSortedBytesValues.Writer.Entry();
+      }
+      @Override
+      public long bytesPerEntry() {
+        return super.bytesPerEntry() + RamUsageEstimator.NUM_BYTES_INT + RamUsageEstimator.NUM_BYTES_LONG;
+      }
+    };
+
+    public Writer(Directory dir, String id, BytesRef.Comparator comp) throws IOException {
+      super(dir, id, CODEC_NAME, VERSION_CURRENT, true);
+      this.comp = comp;
+      docToEntry = new Entry[1];
+    }
+
+    @Override
+    public void add(int docID, BytesRef bytes) throws IOException {
+      if (docID >= docToEntry.length) {
+        Entry[] newArray = new Entry[ArrayUtil.getNextSize(1+docID)];
+        System.arraycopy(docToEntry, 0, newArray, 0, docToEntry.length);
+        docToEntry = newArray;
+      }
+      docToEntry[docID] = hash.add(bytes);
+    }
+
+    @Override
+    public long ramBytesUsed() {
+      return RamUsageEstimator.NUM_BYTES_OBJ_REF * docToEntry.length + hash.ramBytesUsed();
+    }
+
+    // Important that we get docCount, in case there were
+    // some last docs that we didn't see
+    @Override
+    public void finish(int docCount) throws IOException {
+
+      Entry[] sortedEntries = hash.sort(comp);
+      final int count = hash.size();
+
+      // first dump bytes data, recording index & offset as
+      // we go
+      long offset = 0;
+      long lastOffset = 0;
+      for(int i=0;i<count;i++) {
+        final Entry e = sortedEntries[i];
+        e.index = i;
+        e.offset = offset;
+        lastOffset = offset;
+        final BytesRef bytes = hash.getBytes(e);
+        datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length);
+        offset += bytes.length;
+      }
+
+      // write index -- docID -> 1+ord
+      PackedInts.WriteOnceWriter w = new PackedInts.WriteOnceWriter(idxOut, count, docCount);
+      final int limit;
+      if (docCount > docToEntry.length) {
+        limit = docToEntry.length;
+      } else {
+        limit = docCount;
+      }
+      for(int i=0;i<limit;i++) {
+        final Entry e = docToEntry[i];
+        if (e == null) {
+          w.add(0);
+        } else {
+          w.add(1+e.index);
+        }
+      }
+      for(int i=limit;i<docCount;i++) {
+        w.add(0);
+      }
+      w.finish();
+
+      // write ord -> offset
+      w = new PackedInts.WriteOnceWriter(idxOut, lastOffset, count);
+      for(int i=0;i<count;i++) {
+        w.add(sortedEntries[i].offset);
+      }
+      w.finish();
+      super.finish(docCount);
+    }
+  }
+
[... VarSortedBytesValues.Reader, load() and the ord/getByOrd/deref plumbing of its sorted Source not recovered in this excerpt; getByValue survives from its binary search onward: ...]
+      public LookupResult getByValue(BytesRef b) throws IOException {
+        int low = 0;
+        int high = getValueCount()-1;
+        while (low <= high) {
+          int mid = (low + high) >>> 1;
+          deref(mid);
+          int cmp = comp.compare(bytesRef, b);
+          if (cmp < 0) {
+            low = mid + 1;
+          } else if (cmp > 0) {
+            high = mid - 1;
+          } else {
+            lookupResult.ord = mid+1;
+            lookupResult.found = true;
+            return lookupResult;
+          }
+        }
+        assert comp.compare(bytesRef, b) != 0;
+        lookupResult.ord = low;
+        lookupResult.found = false;
+        return lookupResult;
+      }
+    }
+  }
+}
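[Editor's sketch -- not part of the patch. Pinning down the LookupResult contract implemented above: ords are 1-based, 0 is reserved for "never added", and a miss reports the largest ord whose value sorts below the probe. Names are illustrative; assumes the patch's package is on the classpath:]

    import java.io.IOException;
    import org.apache.lucene.index.values.Values;
    import org.apache.lucene.util.BytesRef;

    class SortedLookupSketch {
      // Returns the 1-based ord of key, or -(insertion ord) when absent.
      static int lookup(Values.SortedBytesSource ss, BytesRef key)
          throws IOException {
        Values.SortedBytesSource.LookupResult r = ss.getByValue(key);
        if (r.found) {
          return r.ord;            // exact match: 1 .. getValueCount()
        }
        // r.ord == 0: key sorts before the first value;
        // r.ord == getValueCount(): key sorts after the last;
        // otherwise getByOrd(r.ord) < key < getByOrd(r.ord+1).
        return -r.ord;
      }
    }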
Index: src/java/org/apache/lucene/index/values/VarStraightBytesValues.java
===================================================================
--- src/java/org/apache/lucene/index/values/VarStraightBytesValues.java	(revision 0)
+++ src/java/org/apache/lucene/index/values/VarStraightBytesValues.java	(revision 0)
@@ -0,0 +1,125 @@
+package org.apache.lucene.index.values;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PackedInts;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.CodecUtil;
+import java.io.IOException;
+
+import java.util.Collection;
+
+// Variable-length byte[] per document, no sharing
+
+class VarStraightBytesValues extends BytesValues {
+
+  static final String CODEC_NAME = "VarStraightBytesValues";
+  static final int VERSION_START = 0;
+  static final int VERSION_CURRENT = VERSION_START;
+
+  static class Writer extends BytesValues.Writer {
+    private int address;
+    private int lastDocID;
+    private int[] docToAddress;
+
+    public Writer(Directory dir, String id) throws IOException {
+      super(dir, id, CODEC_NAME, VERSION_CURRENT, true);
+      docToAddress = new int[1];
+    }
+
+    // Fills up to but not including this docID
+    private void fill(final int docID) {
+      if (docID >= docToAddress.length) {
+        docToAddress = ArrayUtil.grow(docToAddress, 1+docID);
+      }
+      for(int i=lastDocID+1;i<docID;i++) {
+        docToAddress[i] = address;
+      }
+      lastDocID = docID;
+    }
+
[... add(), finish() and the Reader of VarStraightBytesValues, plus the header and opening of VarDerefBytesValues.java (its CODEC_NAME, Entry class with an address field, and the Writer's address/docToAddress fields), not recovered in this excerpt ...]
+    private final BytesHash<Entry> hash = new BytesHash<Entry>(Entry.class) {
+      @Override
+      protected VarDerefBytesValues.Writer.Entry newEntry() {
+        return new VarDerefBytesValues.Writer.Entry();
+      }
+      @Override
+      public long bytesPerEntry() {
+        return super.bytesPerEntry() + RamUsageEstimator.NUM_BYTES_INT;
+      }
+    };
+
+    public Writer(Directory dir, String id) throws IOException {
+      super(dir, id, CODEC_NAME, VERSION_CURRENT, true);
+      docToAddress = new int[1];
+    }
+
+    @Override
+    public void add(int docID, BytesRef bytes) throws IOException {
+      Entry e = hash.add(bytes);
+
+      if (docID >= docToAddress.length) {
+        docToAddress = ArrayUtil.grow(docToAddress, 1+docID);
+      }
+      if (e.address == 0) {
+        // New value: append its length-prefixed bytes
+        e.address = address;
+        if (bytes.length < 128) {
+          // 1 byte to store length
+          datOut.writeByte((byte) bytes.length);
+          address++;
+        } else {
+          // 2 bytes to store length
+          datOut.writeByte((byte) (0x80 | (bytes.length & 0x7f)));
+          datOut.writeByte((byte) ((bytes.length>>7) & 0xff));
+          address += 2;
+        }
+        datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length);
+        address += bytes.length;
+      }
+
+      docToAddress[docID] = e.address;
+    }
+
+    @Override
+    public long ramBytesUsed() {
+      return RamUsageEstimator.NUM_BYTES_INT * docToAddress.length + hash.ramBytesUsed();
+    }
+
+    // Important that we get docCount, in case there were
+    // some last docs that we didn't see
+    @Override
+    public void finish(int docCount) throws IOException {
+
+      idxOut.writeInt(address-1);
+
+      // write index
+      PackedInts.WriteOnceWriter w = new PackedInts.WriteOnceWriter(idxOut, address-1, docCount);
+      final int limit;
+      if (docCount > docToAddress.length) {
+        limit = docToAddress.length;
+      } else {
+        limit = docCount;
+      }
+      for(int i=0;i<limit;i++) {
+        w.add(docToAddress[i]);
+      }
+      for(int i=limit;i<docCount;i++) {
+        w.add(0);
+      }
+      w.finish();
+      super.finish(docCount);
+    }
+  }
+
[... VarDerefBytesValues.Reader, plus the header and opening of the floats writer (presumably FloatValues.java; its Writer holds a double[] docToValue), not recovered in this excerpt ...]
+    public void add(int docID, Fieldable field) throws IOException {
+      if (field instanceof NumericField) {
+        Number n = ((NumericField) field).getNumericValue();
+        if (docID >= docToValue.length) {
+          double[] newArray = new double[ArrayUtil.getNextSize(1+docID)];
+          System.arraycopy(docToValue, 0, newArray, 0, docToValue.length);
+          docToValue = newArray;
+        }
+        docToValue[docID] = n.doubleValue();
+      } else {
+        throw new IllegalArgumentException("field must be NumericField");
+      }
+    }
+
+    public void files(Collection<String> files) {
+      files.add(id + DATA_EXTENSION);
+    }
+
+    public void finish(int docCount) throws IOException {
+      final int limit;
+      if (docCount > docToValue.length) {
+        limit = docToValue.length;
+      } else {
+        limit = docCount;
+      }
+      for(int i=0;i<limit;i++) {
+        datOut.writeLong(Double.doubleToLongBits(docToValue[i]));
+      }
[... padding for unseen docs, closing the data file, and the floats Reader not recovered in this excerpt ...]
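[Editor's sketch -- not part of the patch. The 1-or-2-byte length prefix written by the deref writers above (and decoded again in BytesHash.deref below) is effectively a two-byte-max vInt. A self-contained round trip, for reference:]

    class LengthPrefixSketch {
      // Encodes len (< 32768) into buf at pos; returns bytes written (1 or 2).
      static int writeLength(byte[] buf, int pos, int len) {
        if (len < 128) {
          buf[pos] = (byte) len;
          return 1;
        }
        buf[pos] = (byte) (0x80 | (len & 0x7f));    // low 7 bits + continuation
        buf[pos+1] = (byte) ((len >> 7) & 0xff);    // remaining high bits
        return 2;
      }

      // Mirrors the decode in BytesHash.deref.
      static int readLength(byte[] buf, int pos) {
        if ((buf[pos] & 0x80) == 0) {
          return buf[pos];
        }
        return (buf[pos] & 0x7f) + ((buf[pos+1] & 0xff) << 7);
      }

      public static void main(String[] args) {
        byte[] buf = new byte[2];
        for (int len : new int[] {0, 127, 128, 300, 32767}) {
          int n = writeLength(buf, 0, len);
          assert readLength(buf, 0) == len : len + " via " + n + " bytes";
        }
      }
    }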

Index: src/java/org/apache/lucene/index/values/BytesValues.java
===================================================================
--- src/java/org/apache/lucene/index/values/BytesValues.java	(revision 0)
+++ src/java/org/apache/lucene/index/values/BytesValues.java	(revision 0)
[... license header, imports, and the start of the class javadoc not recovered in this excerpt ...]
+ * <p><b>NOTE</b>: The total amount of byte[] data stored (across a
+ * single segment) cannot exceed 2GB.
+ *
+ * <p><b>NOTE</b>: Each byte[] must be <= 32768 bytes in length

+ */
+
+public class BytesValues {
+
+  public static enum Mode {STRAIGHT, DEREF, SORTED};
+
+  public static abstract class Writer extends Values.Writer {
+
+    protected final Directory dir;
+    protected final IndexOutput idxOut;
+    protected final IndexOutput datOut;
+    protected final String id;
+    private final BytesRef bytesRef = new BytesRef();
+    private boolean didIndex;
+
+    protected Writer(Directory dir, String id, String codecName, int version, boolean doIndex) throws IOException {
+      this.id = id;
+      this.dir = dir;
+      datOut = dir.createOutput(id + Values.DATA_EXTENSION);
+      CodecUtil.writeHeader(datOut, codecName, version);
+      assert datOut.getFilePointer() == CodecUtil.headerLength(codecName);
+      if (doIndex) {
+        idxOut = dir.createOutput(id + Values.INDEX_EXTENSION);
+        CodecUtil.writeHeader(idxOut, codecName, version);
+      } else {
+        idxOut = null;
+      }
+    }
+
+    private final UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();
+
+    public void add(int docID, Fieldable field) throws IOException {
+      // nocommit -- should we insist on upfront binding, ie,
+      // "this field will be binary" or "this field will be
+      // String", and enforce here?  as is, one could flip
+      // back and forth per document
+      if (field.isBinary()) {
+        bytesRef.bytes = field.getBinaryValue();
+        bytesRef.length = field.getBinaryLength();
+        bytesRef.offset = field.getBinaryOffset();
+      } else {
+        final String v = field.stringValue();
+        if (v == null) {
+          throw new IllegalArgumentException("field must be binary or String");
+        }
+        UnicodeUtil.UTF16toUTF8(v, 0, v.length(), utf8Result);
+        bytesRef.bytes = utf8Result.result;
+        bytesRef.length = utf8Result.length;
+        bytesRef.offset = 0;
+      }
+
+      add(docID, bytesRef);
+    }
+
+    /** Must be called only with increasing docIDs.  It's OK
+     *  for some docIDs to be skipped; they will be filled
+     *  with 0 bytes. */
+    public abstract void add(int docID, BytesRef bytes) throws IOException;
+
+    public void files(Collection<String> files) {
+      files.add(id + Values.DATA_EXTENSION);
+      if (idxOut != null) {
+        files.add(id + Values.INDEX_EXTENSION);
+      }
+    }
+
+    public static BytesValues.Writer create(Directory dir, String id, Mode mode, BytesRef.Comparator comp, boolean fixedSize)
+      throws IOException {
+      if (mode != Mode.SORTED && comp != null) {
+        throw new IllegalArgumentException("comparator must be null (got " + comp + ") when not using Mode.SORTED");
+      }
+
+      if (fixedSize) {
+        if (mode == Mode.STRAIGHT) {
+          return new FixedStraightBytesValues.Writer(dir, id);
+        } else if (mode == Mode.DEREF) {
+          return new FixedDerefBytesValues.Writer(dir, id);
+        } else if (mode == Mode.SORTED) {
+          return new FixedSortedBytesValues.Writer(dir, id, comp);
+        }
+      } else {
+        if (mode == Mode.STRAIGHT) {
+          return new VarStraightBytesValues.Writer(dir, id);
+        } else if (mode == Mode.DEREF) {
+          return new VarDerefBytesValues.Writer(dir, id);
+        } else if (mode == Mode.SORTED) {
+          return new VarSortedBytesValues.Writer(dir, id, comp);
+        }
+      }
+
+      throw new IllegalArgumentException("unknown mode=" + mode);
+    }
+
+    public void finish(int docCount) throws IOException {
+      datOut.close();
+      if (idxOut != null) {
+        idxOut.close();
+      }
+    }
+  }
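[Editor's note -- the create() factory above fans out to six implementations in this patch: (STRAIGHT, fixed) -> FixedStraightBytesValues, (DEREF, fixed) -> FixedDerefBytesValues, (SORTED, fixed) -> FixedSortedBytesValues, and the corresponding Var* classes when fixedSize is false. A usage sketch, not part of the patch; the field name "title" is illustrative and the comparator is required only (and exactly) for SORTED mode:]

    import java.io.IOException;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.util.BytesRef;

    class CreateSketch {
      static void roundTrip(Directory dir, int maxDoc) throws IOException {
        BytesRef.Comparator comp = BytesRef.getUTF8SortedAsUTF16Comparator();
        BytesValues.Writer w = BytesValues.Writer.create(
            dir, "title", BytesValues.Mode.SORTED, comp, false);
        // ... w.add(docID, field) in increasing docID order ...
        w.finish(maxDoc);
        BytesValues.Reader r = BytesValues.Reader.create(
            dir, "title", BytesValues.Mode.SORTED, comp, false, maxDoc);
        Values.BytesSource source = r.load();  // a SortedBytesSource here
        r.close();
      }
    }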
+
+  /** Opens all necessary files, but does not read any data
+   *  in until you call {@link #load}. */
+  public static abstract class Reader extends Values.Reader {
+
+    protected final IndexInput idxIn;
+    protected final IndexInput datIn;
+    protected final int version;
+    protected final String id;
+
+    protected Reader(Directory dir, String id, String codecName, int maxVersion, boolean doIndex) throws IOException {
+      this.id = id;
+      datIn = dir.openInput(id + Values.DATA_EXTENSION);
+      version = CodecUtil.checkHeader(datIn, codecName, maxVersion);
+
+      if (doIndex) {
+        idxIn = dir.openInput(id + Values.INDEX_EXTENSION);
+        final int version2 = CodecUtil.checkHeader(idxIn, codecName, maxVersion);
+        assert version == version2;
+      } else {
+        idxIn = null;
+      }
+    }
+
+    public static BytesValues.Reader create(Directory dir, String id, Mode mode, BytesRef.Comparator comp, boolean fixedSize, int maxDoc)
+      throws IOException {
+      if (mode != Mode.SORTED && comp != null) {
+        throw new IllegalArgumentException("comparator must be null (got " + comp + ") when not using Mode.SORTED");
+      }
+      if (fixedSize) {
+        if (mode == Mode.STRAIGHT) {
+          return new FixedStraightBytesValues.Reader(dir, id, maxDoc);
+        } else if (mode == Mode.DEREF) {
+          return new FixedDerefBytesValues.Reader(dir, id, maxDoc);
+        } else if (mode == Mode.SORTED) {
+          return new FixedSortedBytesValues.Reader(dir, id, maxDoc, comp);
+        }
+      } else {
+        if (mode == Mode.STRAIGHT) {
+          return new VarStraightBytesValues.Reader(dir, id, maxDoc);
+        } else if (mode == Mode.DEREF) {
+          return new VarDerefBytesValues.Reader(dir, id, maxDoc);
+        } else if (mode == Mode.SORTED) {
+          return new VarSortedBytesValues.Reader(dir, id, maxDoc, comp);
+        }
+      }
+
+      throw new IllegalArgumentException("unknown mode=" + mode);
+    }
+
+    public abstract Values.BytesSource load() throws IOException;
+
+    public void close() throws IOException {
+      if (datIn != null) {
+        datIn.close();
+      }
+      if (idxIn != null) {
+        idxIn.close();
+      }
+    }
+  }
+}
\ No newline at end of file
Index: src/java/org/apache/lucene/index/values/FixedSortedBytesValues.java
===================================================================
--- src/java/org/apache/lucene/index/values/FixedSortedBytesValues.java	(revision 0)
+++ src/java/org/apache/lucene/index/values/FixedSortedBytesValues.java	(revision 0)
@@ -0,0 +1,212 @@
+package org.apache.lucene.index.values;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesHash;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PackedInts;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.CodecUtil;
+import java.io.IOException;
+
+import java.util.Collection;
+
+// Stores fixed-length byte[] by deref (two docs with the
+// same value store only 1 byte[]), keeping the unique
+// values in sorted order
+
+class FixedSortedBytesValues extends BytesValues {
+
+  static final String CODEC_NAME = "FixedSortedBytesValues";
+  static final int VERSION_START = 0;
+  static final int VERSION_CURRENT = VERSION_START;
+
+  static class Writer extends BytesValues.Writer {
+    private int size = -1;
+    private Entry[] docToEntry;
+    private final BytesRef.Comparator comp;
+
+    final static class Entry extends BytesHash.Entry {
+      int address;
+    }
+
+    private final BytesHash<Entry> hash = new BytesHash<Entry>(Entry.class) {
+      @Override
+      protected FixedSortedBytesValues.Writer.Entry newEntry() {
+        return new FixedSortedBytesValues.Writer.Entry();
+      }
+      @Override
+      public long bytesPerEntry() {
+        return super.bytesPerEntry() + RamUsageEstimator.NUM_BYTES_INT;
+      }
+    };
+
+    public Writer(Directory dir, String id, BytesRef.Comparator comp) throws IOException {
+      super(dir, id, CODEC_NAME, VERSION_CURRENT, true);
+      docToEntry = new Entry[1];
+      this.comp = comp;
+    }
+
+    @Override
+    public void add(int docID, BytesRef bytes) throws IOException {
+      if (size == -1) {
+        size = bytes.length;
+        datOut.writeInt(size);
+      } else if (bytes.length != size) {
+        throw new IllegalArgumentException("expected bytes size=" + size + " but got " + bytes.length);
+      }
+      if (docID >= docToEntry.length) {
+        Entry[] newArray = new Entry[ArrayUtil.getNextSize(1+docID)];
+        System.arraycopy(docToEntry, 0, newArray, 0, docToEntry.length);
+        docToEntry = newArray;
+      }
+      docToEntry[docID] = hash.add(bytes);
+    }
+
+    @Override
+    public long ramBytesUsed() {
+      return RamUsageEstimator.NUM_BYTES_OBJ_REF * docToEntry.length + hash.ramBytesUsed();
+    }
+
+    // Important that we get docCount, in case there were
+    // some last docs that we didn't see
+    @Override
+    public void finish(int docCount) throws IOException {
+
+      Entry[] sortedEntries = hash.sort(comp);
+      final int count = hash.size();
+
+      // first dump bytes data, recording address as we go
+      for(int i=0;i<count;i++) {
+        final Entry e = sortedEntries[i];
+        e.address = 1+i;
+        final BytesRef bytes = hash.getBytes(e);
+        datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length);
+      }
+
+      // write index -- docID -> address (1+ord)
+      PackedInts.WriteOnceWriter w = new PackedInts.WriteOnceWriter(idxOut, count, docCount);
+      final int limit;
+      if (docCount > docToEntry.length) {
+        limit = docToEntry.length;
+      } else {
+        limit = docCount;
+      }
+      for(int i=0;i<limit;i++) {
+        final Entry e = docToEntry[i];
+        if (e == null) {
+          w.add(0);
+        } else {
+          w.add(e.address);
+        }
+      }
+      for(int i=limit;i<docCount;i++) {
+        w.add(0);
+      }
+      w.finish();
+      super.finish(docCount);
+    }
+  }
+
[... FixedSortedBytesValues.Reader, load() and the start of its sorted Source not recovered in this excerpt; getByValue survives from its binary search onward: ...]
+      public LookupResult getByValue(BytesRef b) throws IOException {
+        int low = 0;
+        int high = getValueCount()-1;
+        while (low <= high) {
+          int mid = (low + high) >>> 1;
+          bytesRef.offset = mid * size;
+          int cmp = comp.compare(bytesRef, b);
+          if (cmp < 0) {
+            low = mid + 1;
+          } else if (cmp > 0) {
+            high = mid - 1;
+          } else {
+            lookupResult.ord = mid+1;
+            lookupResult.found = true;
+            return lookupResult;
+          }
+        }
+        lookupResult.ord = low;
+        lookupResult.found = false;
+        return lookupResult;
+      }
+    }
+  }
+}
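[Editor's sketch -- not part of the patch. Because every value is exactly size bytes, the fixed sorted reader needs no offset index: the binary search above addresses the blob with mid * size, where mid is the 0-based slot and the reported ord is mid+1. The arithmetic, made explicit:]

    class FixedAddressingSketch {
      // With values packed back to back, the 0-based slot for a
      // 1-based ord is (ord-1), so its bytes start at (ord-1)*size.
      static int byteOffset(int ord, int size) {
        return (ord - 1) * size;
      }

      public static void main(String[] args) {
        int size = 6;                      // the fixed case in TestIndexValues
        assert byteOffset(1, size) == 0;   // first value
        assert byteOffset(3, size) == 12;  // third value
      }
    }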
Index: src/java/org/apache/lucene/index/values/IntValues.java
===================================================================
--- src/java/org/apache/lucene/index/values/IntValues.java	(revision 0)
+++ src/java/org/apache/lucene/index/values/IntValues.java	(revision 0)
@@ -0,0 +1,147 @@
+package org.apache.lucene.index.values;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.document.NumericField;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PackedInts;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+
+import java.io.IOException;
+import java.util.Collection;
+
+/** Exposes writer & reader for integer values.  The writer
+ *  uses {@link PackedInts} for simplistic compression when
+ *  the range of values is limited. */
+public class IntValues {
+  private static final String CODEC_NAME = "IntValues";
+  static final int VERSION_START = 0;
+  static final int VERSION_CURRENT = VERSION_START;
+
+  private static final String DATA_EXTENSION = ".dat";
+
+  public static class Writer extends Values.Writer {
+
+    protected final Directory dir;
+    protected final IndexOutput datOut;
+    protected final String id;
+    private final BytesRef bytesRef = new BytesRef();
+    private boolean didIndex;
+    private long[] docToValue;
+    private long minValue;
+    private long maxValue;
+    private boolean started;
+
+    protected Writer(Directory dir, String id) throws IOException {
+      this.id = id;
+      this.dir = dir;
+      datOut = dir.createOutput(id + DATA_EXTENSION);
+      CodecUtil.writeHeader(datOut, CODEC_NAME, VERSION_CURRENT);
+      assert datOut.getFilePointer() == CodecUtil.headerLength(CODEC_NAME);
+      docToValue = new long[1];
+    }
+
+    public void add(int docID, Fieldable field) throws IOException {
+
+      if (field instanceof NumericField) {
+        Number n = ((NumericField) field).getNumericValue();
+        long v = n.longValue();
+        if (!started) {
+          minValue = maxValue = v;
+          started = true;
+        } else {
+          if (v < minValue) {
+            minValue = v;
+          } else if (v > maxValue) {
+            maxValue = v;
+          }
+        }
+        if (docID >= docToValue.length) {
+          docToValue = ArrayUtil.grow(docToValue, 1+docID);
+        }
+        docToValue[docID] = v;
+      } else {
+        throw new IllegalArgumentException("field must be NumericField");
+      }
+    }
+
+    public void files(Collection<String> files) {
+      files.add(id + DATA_EXTENSION);
+    }
+
+    public void finish(int docCount) throws IOException {
+      datOut.writeLong(minValue);
+
+      PackedInts.WriteOnceWriter w = new PackedInts.WriteOnceWriter(datOut, maxValue-minValue, docCount);
+      final int limit;
+      if (docCount > docToValue.length) {
+        limit = docToValue.length;
+      } else {
+        limit = docCount;
+      }
+      for(int i=0;i<limit;i++) {
+        w.add(docToValue[i] - minValue);
+      }
[... padding for unseen docs, Writer.ramBytesUsed, the IntValues Reader, and the header/opening of FixedDerefBytesValues.java (its Writer fields: size, idUpto, docToID, Entry with an id field) not recovered in this excerpt ...]
+    private final BytesHash<Entry> hash = new BytesHash<Entry>(Entry.class) {
+      @Override
+      protected FixedDerefBytesValues.Writer.Entry newEntry() {
+        return new FixedDerefBytesValues.Writer.Entry();
+      }
+      @Override
+      public long bytesPerEntry() {
+        return super.bytesPerEntry() + RamUsageEstimator.NUM_BYTES_INT;
+      }
+    };
+
+    public Writer(Directory dir, String id) throws IOException {
+      super(dir, id, CODEC_NAME, VERSION_CURRENT, true);
+      docToID = new int[1];
+    }
+
+    @Override
+    public void add(int docID, BytesRef bytes) throws IOException {
+      if (size == -1) {
+        size = bytes.length;
+        datOut.writeInt(size);
+      } else if (bytes.length != size) {
+        throw new IllegalArgumentException("expected bytes size=" + size + " but got " + bytes.length);
+      }
+      final int idUptoStart = idUpto;
+      Entry e = hash.add(bytes);
+
+      if (e.id == idUptoStart) {
+        // New
+        datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length);
+      }
+
+      if (docID >= docToID.length) {
+        docToID = ArrayUtil.grow(docToID, 1+docID);
+      }
+      docToID[docID] = e.id;
+    }
+
+    @Override
+    public long ramBytesUsed() {
+      return RamUsageEstimator.NUM_BYTES_INT * docToID.length + hash.ramBytesUsed();
+    }
+
+    // Important that we get docCount, in case there were
+    // some last docs that we didn't see
+    @Override
+    public void finish(int docCount) throws IOException {
+
+      idxOut.writeInt(idUpto-1);
+
+      // write index
+      PackedInts.WriteOnceWriter w = new PackedInts.WriteOnceWriter(idxOut, idUpto-1, docCount);
+      final int limit;
+      if (docCount > docToID.length) {
+        limit = docToID.length;
+      } else {
+        limit = docCount;
+      }
+      for(int i=0;i<limit;i++) {
+        w.add(docToID[i]);
+      }
[... the rest of FixedDerefBytesValues (finish tail, Reader), and all of FixedStraightBytesValues.java (both referenced from BytesValues.create), not recovered in this excerpt ...]
Index: src/java/org/apache/lucene/util/CodecUtil.java
===================================================================
--- src/java/org/apache/lucene/util/CodecUtil.java	(revision 0)
+++ src/java/org/apache/lucene/util/CodecUtil.java	(revision 0)
[... package declaration, license header, and the start of the class javadoc not recovered in this excerpt ...]
+ * <p><b>NOTE</b>: this class is meant only to be used internally
+ * by Lucene; it's only public so it can be shared across
+ * packages.  This means the API is freely subject to
+ * change, and, the class could be removed entirely, in any
+ * Lucene release.  Use directly at your own risk!
+ */
+
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.index.CorruptIndexException;
+
+import java.io.IOException;
+
+public final class CodecUtil {
+  private final static int CODEC_MAGIC = 0x3fd76c17;
+
+  public static void writeHeader(IndexOutput out, String codec, int version)
+    throws IOException {
+    final long start = out.getFilePointer();
+    out.writeInt(CODEC_MAGIC);
+    out.writeString(codec);
+    out.writeInt(version);
+
+    // We require this so we can easily pre-compute the header length
+    if (out.getFilePointer()-start != codec.length()+9) {
+      throw new IllegalArgumentException("codec must be simple ASCII, less than 128 characters in length [got " + codec + "]");
+    }
+  }
+
+  public static int headerLength(String codec) {
+    return 9+codec.length();
+  }
+
+  public static int checkHeader(IndexInput in, String codec, int maxVersion)
+    throws IOException {
+    final int actualHeader = in.readInt();
+    if (actualHeader != CODEC_MAGIC) {
+      throw new CorruptIndexException("codec header mismatch: actual header=" + actualHeader + " vs expected header=" + CODEC_MAGIC);
+    }
+    final String actualCodec = in.readString();
+    if (!actualCodec.equals(codec)) {
+      throw new CorruptIndexException("codec mismatch: actual codec=" + actualCodec + " vs expected codec=" + codec);
+    }
+    final int actualVersion = in.readInt();
+    if (actualVersion > maxVersion) {
+      throw new CorruptIndexException("version " + actualVersion + " is too new (expected <= version " + maxVersion + ")");
+    }
+
+    return actualVersion;
+  }
+}
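[Editor's sketch -- not part of the patch. Why headerLength(codec) == 9 + codec.length(): 4 bytes of magic int, 1 byte for writeString's vInt length (single byte since the codec name is required to be short ASCII), codec.length() bytes of name, and 4 bytes of version int. A round trip; the file name is illustrative:]

    import java.io.IOException;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.IndexInput;
    import org.apache.lucene.store.IndexOutput;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.CodecUtil;

    class CodecHeaderSketch {
      public static void main(String[] args) throws IOException {
        Directory dir = new RAMDirectory();
        IndexOutput out = dir.createOutput("demo.hdr");
        CodecUtil.writeHeader(out, "DemoCodec", 0);
        // 4 (magic) + 1 (vInt len) + 9 (name) + 4 (version) = 18
        assert out.getFilePointer() == CodecUtil.headerLength("DemoCodec");
        out.close();

        IndexInput in = dir.openInput("demo.hdr");
        int version = CodecUtil.checkHeader(in, "DemoCodec", 0);  // throws on mismatch
        assert version == 0;
        in.close();
      }
    }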

Index: src/java/org/apache/lucene/util/PackedInts.java
===================================================================
--- src/java/org/apache/lucene/util/PackedInts.java	(revision 0)
+++ src/java/org/apache/lucene/util/PackedInts.java	(revision 0)
@@ -0,0 +1,222 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.IndexInput;
+
+import java.io.IOException;
+
+/**
+ * Simplistic compression for an array of long values, where
+ * each value is >= 0 and <= a specified maximum value.  The
+ * values are stored as packed ints, with each value
+ * consuming a fixed number of bits.
+ *
+ * <p><b>NOTE</b>: this class is meant only to be used internally
+ * by Lucene; it's only public so it can be shared across
+ * packages.  This means the API is freely subject to
+ * change, and, the class could be removed entirely, in any
+ * Lucene release.  Use directly at your own risk!
+ */
+
+// nocommit
+//   - do we need int/long variants (for perf)?  or long
+//     only suffices?
+//   - what native type is best perf?  long/int/short/byte?
+
+public class PackedInts {
+
+  private final static String CODEC_NAME = "PackedInts";
+  private final static int VERSION_START = 0;
+  private final static int VERSION_CURRENT = 0;
+
+  private static int size(int bitsPerValue, int valueCount) {
+    final long totBitCount = (long) valueCount * bitsPerValue;
+    return (int) (totBitCount/64 + ((totBitCount % 64 == 0) ? 0 : 1));
+  }
+
+  // nocommit: bulk add(int[]) api?
+  public static class WriteOnceWriter {
+    private final IndexOutput out;
+    private final int bitsPerValue;
+    private final long maxValue;
+    private long pending;
+    private int pendingBitsLeft;
+    private final int valueCount;
+    private final long fpStart;
+
+    // masks[n-1] masks for bottom n bits
+    private final long[] masks;
+
+    // nocommit -- allow minValue too?  ie not just minValue==0
+    public WriteOnceWriter(IndexOutput out, long maxValue, int valueCount)
+      throws IOException {
+      this.out = out;
+      this.maxValue = maxValue;
+      this.valueCount = valueCount;
+
+      long size = 2;
+      int bits = 1;
+      while(size <= maxValue && bits < 63) {
+        size *= 2;
+        bits++;
+      }
+
+      bitsPerValue = bits;
+      assert bitsPerValue <= 63;
+
+      CodecUtil.writeHeader(out, CODEC_NAME, VERSION_START);
+      out.writeVInt(bitsPerValue);
+      out.writeVInt(valueCount);
+      out.writeVLong(maxValue);
+      fpStart = out.getFilePointer();
+
+      pendingBitsLeft = 64;
+      masks = new long[bitsPerValue-1];
+
+      int v = 1;
+      for(int i=0;i<masks.length;i++) {
+        v *= 2;
+        masks[i] = v-1;
+      }
+    }
+
+    public void add(long v) throws IOException {
+      assert v <= maxValue;
+      if (pendingBitsLeft > bitsPerValue) {
+        // Fits entirely within the current long
+        pending |= v << (64-pendingBitsLeft);
+        pendingBitsLeft -= bitsPerValue;
+      } else {
+        // Fill the current long and spill the remaining
+        // high bits into the next one
+        out.writeLong(pending | (v << (64-pendingBitsLeft)));
+        pending = v >> pendingBitsLeft;
+
+        pendingBitsLeft = 64-(bitsPerValue-pendingBitsLeft);
+      }
+    }
+
+    public void finish() throws IOException {
+      if (pendingBitsLeft != 64) {
+        out.writeLong(pending);
+      }
+
+      assert out.getFilePointer() == fpStart + 8*size(bitsPerValue, valueCount):
+        "fp=" + out.getFilePointer() + " fpStart=" + fpStart + " bitsPerValue=" + bitsPerValue + " valueCount=" + valueCount + " size=" + size(bitsPerValue, valueCount);
+    }
+  }
+
+  public static abstract class Reader {
+    private final long maxValue;
+
+    protected Reader(long maxValue) {
+      this.maxValue = maxValue;
+    }
+
+    public long getMaxValue() {
+      return maxValue;
+    }
+    abstract public long get(int index) throws IOException;
+    abstract public long ramBytesUsed();
+  }
+
+  /** Loads the full array into RAM on startup, then performs
+   *  lookups via the RAM array. */
+  static class SimpleRAMReader extends Reader {
+    private final int bitsPerValue;
+    private final long[] data;
+    private final long[] masks;
+    private final long mask;
+
+    SimpleRAMReader(IndexInput in, int bitsPerValue, long maxValue, int valueCount)
+      throws IOException {
+
+      super(maxValue);
+
+      this.bitsPerValue = bitsPerValue;
+      final int size = size(bitsPerValue, valueCount);
+      data = new long[size];
+      long v = 1;
+      masks = new long[bitsPerValue];
+      for(int i=0;i<bitsPerValue;i++) {
+        v *= 2;
+        masks[i] = v-1;
+      }
+      mask = masks[bitsPerValue-1];
+      for(int i=0;i<size;i++) {
+        data[i] = in.readLong();
+      }
+    }
+
+    public long ramBytesUsed() {
+      return RamUsageEstimator.NUM_BYTES_LONG * data.length;
+    }
+
+    public long get(int index) throws IOException {
+      final long bitStart = (long) index * bitsPerValue;
+      final int loc = (int) (bitStart >> 6);
+      final int bitPos = (int) (bitStart & 63);
+      final int left = 64 - bitPos;
+      if (left >= bitsPerValue) {
+        // No wrapping
+        return (data[loc] >> bitPos) & mask;
+      } else {
+        // Wrap: top left bits of data[loc], remainder from data[1+loc]
+        return (data[loc] >> bitPos) & masks[left-1] |
+          (data[1+loc] & masks[bitsPerValue-left-1]) << left;
+      }
+    }
+  }
+
+  public static Reader getReader(IndexInput in) throws IOException {
+    CodecUtil.checkHeader(in, CODEC_NAME, VERSION_START);
+    final int bitsPerValue = in.readVInt();
+    final int valueCount = in.readVInt();
+    final long maxValue = in.readVLong();
+    // TODO: an mmap reader as well?
+    // TODO: specialize anon impls for performance
+    return new SimpleRAMReader(in, bitsPerValue, maxValue, valueCount);
+  }
+}
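[Editor's sketch -- not part of the patch. A round trip through WriteOnceWriter/getReader, useful for seeing the on-disk contract: codec header, then bitsPerValue/valueCount as vInts, maxValue as a vLong, then ceil(valueCount*bitsPerValue/64) longs. File name is illustrative:]

    import java.io.IOException;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.IndexInput;
    import org.apache.lucene.store.IndexOutput;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.PackedInts;

    class PackedIntsSketch {
      public static void main(String[] args) throws IOException {
        long[] values = {0, 1, 17, 5, 17};     // maxValue 17 -> 5 bits/value
        Directory dir = new RAMDirectory();
        IndexOutput out = dir.createOutput("demo.pck");
        PackedInts.WriteOnceWriter w =
            new PackedInts.WriteOnceWriter(out, 17, values.length);
        for (long v : values) {
          w.add(v);
        }
        w.finish();                            // flushes the partial last long
        out.close();

        IndexInput in = dir.openInput("demo.pck");
        PackedInts.Reader r = PackedInts.getReader(in);
        for (int i = 0; i < values.length; i++) {
          assert r.get(i) == values[i];
        }
        in.close();
        dir.close();
      }
    }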

Index: src/java/org/apache/lucene/util/BytesHash.java
===================================================================
--- src/java/org/apache/lucene/util/BytesHash.java	(revision 0)
+++ src/java/org/apache/lucene/util/BytesHash.java	(revision 0)
@@ -0,0 +1,374 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// nocommit -- move to util?
+import org.apache.lucene.index.ByteBlockPool;
+import java.lang.reflect.Array;
+import java.util.Arrays;
+
+/**
+ * Hashes BytesRefs.  A BytesRef must be no longer than
+ * BYTES_BLOCK_SIZE-2 (= 32766) bytes.
+ *
+ * <p><b>NOTE</b>: this class is meant only to be used internally
+ * by Lucene; it's only public so it can be shared across
+ * packages.  This means the API is freely subject to
+ * change, and, the class could be removed entirely, in any
+ * Lucene release.  Use directly at your own risk!
+ */
+
+// nocommit -- reuse Entry instances?
+public abstract class BytesHash<T extends BytesHash.Entry> {
+
+  // nocommit -- factor properly so the byte pool uses this
+  // NOT DW's
+  public final static int BYTES_BLOCK_SHIFT = 15;
+  public final static int BYTES_BLOCK_SIZE = 1 << BYTES_BLOCK_SHIFT;
+  public final static int BYTES_BLOCK_MASK = BYTES_BLOCK_SIZE - 1;
+
+  // nocommit -- reuse?
+  private static class ByteBlockAllocator extends ByteBlockPool.Allocator {
+    int blockUsedCount;
+
+    @Override
+    public byte[] getByteBlock(boolean trackAllocations) {
+      blockUsedCount++;
+      return new byte[BYTES_BLOCK_SIZE];
+    }
+
+    @Override
+    public void recycleByteBlocks(byte[][] blocks, int start, int end) {
+      blockUsedCount -= end-start;
+    }
+
+    @Override
+    public long ramBytesUsed() {
+      return blockUsedCount * BYTES_BLOCK_SIZE;
+    }
+  }
+
+  public static class Entry {
+    public int bytesStart;
+  }
+
+  private final Class<T> cl;
+  public final ByteBlockPool pool;
+  private int hashSize = 4;
+  private int hashHalfSize = hashSize/2;
+  private int hashMask = hashSize-1;
+  private int count;
+  private int lastCount = -1;
+  private final ByteBlockAllocator allocator;
+  private T[] hash;
+
+  @SuppressWarnings("unchecked")
+  public BytesHash(Class<T> cl) {
+    this.cl = cl;
+    allocator = new ByteBlockAllocator();
+    pool = new ByteBlockPool(allocator, false);
+    hash = (T[]) Array.newInstance(cl, hashSize);
+  }
+
+  private void compactPostings() {
+  }
+
+  public int size() {
+    return count;
+  }
+
+  public BytesRef getBytes(T e) {
+    return deref(e.bytesStart, scratch1);
+  }
+
+  /** Destructive operation -- returns all Entry instances,
+   *  in arbitrary order */
+  public T[] compact() {
+    int upto = 0;
+    for(int i=0;i<hashSize;i++) {
+      if (hash[i] != null) {
+        if (upto < i) {
+          hash[upto] = hash[i];
+          hash[i] = null;
+        }
+        upto++;
+      }
+    }
+    assert upto == count;
+    return hash;
+  }
+
+  /** Destructive operation -- returns all Entry instances,
+   *  sorted by the comparator */
+  public T[] sort(BytesRef.Comparator comp) {
+    final T[] entries = compact();
+    quickSort(comp, entries, 0, count-1);
+    return entries;
+  }
+
+  private void quickSort(BytesRef.Comparator comp, T[] entries, int lo, int hi) {
+    if (lo >= hi)
+      return;
+    else if (hi == 1+lo) {
+      if (compare(comp, entries[lo], entries[hi]) > 0) {
+        final T tmp = entries[lo];
+        entries[lo] = entries[hi];
+        entries[hi] = tmp;
+      }
+      return;
+    }
+
+    int mid = (lo + hi) >>> 1;
+
+    if (compare(comp, entries[lo], entries[mid]) > 0) {
+      T tmp = entries[lo];
+      entries[lo] = entries[mid];
+      entries[mid] = tmp;
+    }
+
+    if (compare(comp, entries[mid], entries[hi]) > 0) {
+      T tmp = entries[mid];
+      entries[mid] = entries[hi];
+      entries[hi] = tmp;
+
+      if (compare(comp, entries[lo], entries[mid]) > 0) {
+        T tmp2 = entries[lo];
+        entries[lo] = entries[mid];
+        entries[mid] = tmp2;
+      }
+    }
+
+    int left = lo + 1;
+    int right = hi - 1;
+
+    if (left >= right)
+      return;
+
+    T partition = entries[mid];
+
+    for (; ;) {
+      while (compare(comp, entries[right], partition) > 0)
+        --right;
+
+      while (left < right && compare(comp, entries[left], partition) <= 0)
+        ++left;
+
+      if (left < right) {
+        T tmp = entries[left];
+        entries[left] = entries[right];
+        entries[right] = tmp;
+        --right;
+      } else {
+        break;
+      }
+    }
+
+    quickSort(comp, entries, lo, left);
+    quickSort(comp, entries, left + 1, hi);
+  }
+
+  private final BytesRef scratch1 = new BytesRef();
+  private final BytesRef scratch2 = new BytesRef();
+
+  private final BytesRef deref(int bytesStart, BytesRef b) {
+    b.bytes = pool.buffers[bytesStart >> BYTES_BLOCK_SHIFT];
+    int pos = bytesStart & BYTES_BLOCK_MASK;
+
+    if ((b.bytes[pos] & 0x80) == 0) {
+      // length is 1 byte
+      b.length = b.bytes[pos];
+      pos += 1;
+    } else {
+      // length is 2 bytes
+      b.length = (b.bytes[pos]&0x7f) + ((b.bytes[pos+1]&0xff)<<7);
+      pos += 2;
+    }
+    b.offset = pos;
+    return b;
+  }
+
+  private boolean equals(T e, BytesRef b) {
+    return deref(e.bytesStart, scratch1).bytesEquals(b);
+  }
+
+  private int compare(BytesRef.Comparator comp, T e1, T e2) {
+    return comp.compare(deref(e1.bytesStart, scratch1),
+                        deref(e2.bytesStart, scratch2));
+  }
+
+  @SuppressWarnings("unchecked")
+  private boolean shrink(int targetSize) {
+
+    // Cannot use ArrayUtil.shrink because we require power
+    // of 2:
+    int newSize = hashSize;
+    while(newSize >= 8 && newSize/4 > targetSize) {
+      newSize /= 2;
+    }
+
+    if (newSize != hashSize) {
+      hashSize = newSize;
+      hash = (T[]) Array.newInstance(cl, hashSize);
+      hashHalfSize = newSize/2;
+      hashMask = newSize-1;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  public void clear() {
+    lastCount = count;
+    count = 0;
+    if (lastCount != -1) {
+      if (shrink(lastCount)) {
+        // shrink allocates a fresh (empty) hash array
+        return;
+      }
+    }
+    Arrays.fill(hash, null);
+  }
+
+  public T add(BytesRef bytes) {
+    int code = 0;
+    final int end = bytes.offset + bytes.length;
+    for(int i=bytes.offset;i<end;i++) {
+      code = (code*31) + bytes.bytes[i];
+    }
+
+    int hashPos = code & hashMask;
+    T e = hash[hashPos];
+    if (e != null && !equals(e, bytes)) {
+      // Conflict: keep probing different locations
+      final int inc = ((code>>8)+code)|1;
+      do {
+        code += inc;
+        hashPos = code & hashMask;
+        e = hash[hashPos];
+      } while (e != null && !equals(e, bytes));
+    }
+
+    if (e == null) {
+      // new entry
+
+      final int len2 = 2+bytes.length;
+      if (len2 + pool.byteUpto > BYTES_BLOCK_SIZE) {
+        if (len2 > BYTES_BLOCK_SIZE) {
+          throw new IllegalArgumentException("bytes can be at most " + (BYTES_BLOCK_SIZE-2) + " in length; got " + bytes.length);
+        }
+        pool.nextBuffer();
+      }
+
+      e = newEntry();
+
+      final byte[] buffer = pool.buffer;
+      final int bufferUpto = pool.byteUpto;
+      e.bytesStart = bufferUpto + pool.byteOffset;
+
+      // We first encode the length, followed by the
+      // bytes.  Length is encoded as a vInt, but will consume
+      // 1 or 2 bytes at most (we reject too-long terms,
+      // above).
+      if (bytes.length < 128) {
+        // 1 byte to store length
+        buffer[bufferUpto] = (byte) bytes.length;
+        pool.byteUpto += bytes.length + 1;
+        System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto+1, bytes.length);
+      } else {
+        // 2 bytes to store length
+        buffer[bufferUpto] = (byte) (0x80 | (bytes.length & 0x7f));
+        buffer[bufferUpto+1] = (byte) ((bytes.length>>7) & 0xff);
+        pool.byteUpto += bytes.length + 2;
+        System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto+2, bytes.length);
+      }
+      assert hash[hashPos] == null;
+      hash[hashPos] = e;
+      count++;
+
+      if (count == hashHalfSize) {
+        rehash(2*hashSize);
+      }
+    }
+    return e;
+  }
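[Editor's sketch -- not part of the patch. All of the deref/sorted writers above drive BytesHash the same way: add() dedups (returning the existing entry on a repeat), and sort() hands back the entries in comparator order. A minimal concrete subclass; the IdEntry name is illustrative:]

    import org.apache.lucene.util.BytesHash;
    import org.apache.lucene.util.BytesRef;

    class BytesHashSketch {
      static class IdEntry extends BytesHash.Entry {
        int id;
      }

      public static void main(String[] args) {
        BytesHash<IdEntry> hash = new BytesHash<IdEntry>(IdEntry.class) {
          @Override
          protected IdEntry newEntry() {
            return new IdEntry();
          }
        };
        IdEntry a = hash.add(new BytesRef("apple"));
        IdEntry b = hash.add(new BytesRef("banana"));
        assert hash.add(new BytesRef("apple")) == a;  // dedup: same entry back
        assert hash.size() == 2;
        IdEntry[] sorted =
            hash.sort(BytesRef.getUTF8SortedAsUTF16Comparator());
        assert hash.getBytes(sorted[0]).utf8ToString().equals("apple");
      }
    }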
+
+  /** Called when the hash is too small (> 50%
+   *  occupied) or too large (< 20% occupied). */
+  void rehash(final int newSize) {
+
+    final int newMask = newSize-1;
+
+    @SuppressWarnings("unchecked")
+    T[] newHash = (T[]) Array.newInstance(cl, newSize);
+    for(int i=0;i<hashSize;i++) {
+      final T e0 = hash[i];
+      if (e0 != null) {
+        final int start = e0.bytesStart & BYTES_BLOCK_MASK;
+        final byte[] bytes = pool.buffers[e0.bytesStart >> BYTES_BLOCK_SHIFT];
+        int code = 0;
+
+        final int len;
+        int pos;
+        if ((bytes[start] & 0x80) == 0) {
+          // length is 1 byte
+          len = bytes[start];
+          pos = start+1;
+        } else {
+          len = (bytes[start]&0x7f) + ((bytes[start+1]&0xff)<<7);
+          pos = start+2;
+        }
+
+        final int endPos = pos+len;
+        while(pos < endPos) {
+          code = (code*31) + bytes[pos++];
+        }
+
+        int hashPos = code & newMask;
+        assert hashPos >= 0;
+        if (newHash[hashPos] != null) {
+          final int inc = ((code>>8)+code)|1;
+          do {
+            code += inc;
+            hashPos = code & newMask;
+          } while (newHash[hashPos] != null);
+        }
+        newHash[hashPos] = e0;
+      }
+    }
+
+    hashMask = newMask;
+    hash = newHash;
+    hashSize = newSize;
+    hashHalfSize = newSize >> 1;
+  }
+
+  protected abstract T newEntry();
+
+  public long ramBytesUsed() {
+    return allocator.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_OBJ_REF * hashSize + count * bytesPerEntry();
+  }
+
+  protected long bytesPerEntry() {
+    return RamUsageEstimator.NUM_BYTES_OBJ_HEADER + RamUsageEstimator.NUM_BYTES_INT;
+  }
+}
\ No newline at end of file
Index: src/java/org/apache/lucene/util/BytesRef.java
===================================================================
--- src/java/org/apache/lucene/util/BytesRef.java	(revision 0)
+++ src/java/org/apache/lucene/util/BytesRef.java	(revision 0)
@@ -0,0 +1,118 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.UnsupportedEncodingException;
+
+// nocommit -- share w/ flex's TermRef
+public class BytesRef {
+
+  public byte[] bytes;
+  public int offset;
+  public int length;
+
+  public abstract static class Comparator {
+    abstract public int compare(BytesRef a, BytesRef b);
+  }
+
+  public BytesRef() {
+  }
+
+  /** Creates a BytesRef wrapping the UTF-8 bytes of the
+   *  provided string. */
+  public BytesRef(String s) {
+    try {
+      bytes = s.getBytes("UTF-8");
+    } catch (UnsupportedEncodingException uee) {
+      throw new RuntimeException(uee);
+    }
+    offset = 0;
+    length = bytes.length;
+  }
+
+  public boolean bytesEquals(BytesRef other) {
+    if (length == other.length) {
+      int upto = offset;
+      int otherUpto = other.offset;
+      final byte[] otherBytes = other.bytes;
+      for(int i=0;i<length;i++) {
+        if (bytes[upto++] != otherBytes[otherUpto++]) {
+          return false;
+        }
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  public String utf8ToString() {
+    try {
+      return new String(bytes, offset, length, "UTF-8");
+    } catch (UnsupportedEncodingException uee) {
+      throw new RuntimeException(uee);
+    }
+  }
+
+  private final static Comparator utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator();
+
+  public static Comparator getUTF8SortedAsUTF16Comparator() {
+    return utf8SortedAsUTF16SortOrder;
+  }
+
+  public static class UTF8SortedAsUTF16Comparator extends Comparator {
+    public int compare(BytesRef a, BytesRef b) {
+
+      final byte[] aBytes = a.bytes;
+      int aUpto = a.offset;
+      final byte[] bBytes = b.bytes;
+      int bUpto = b.offset;
+
+      final int aStop = aUpto + Math.min(a.length, b.length);
+
+      while(aUpto < aStop) {
+        int aByte = aBytes[aUpto++] & 0xff;
+        int bByte = bBytes[bUpto++] & 0xff;
+        if (aByte != bByte) {
+          // Fix up the differing bytes to match UTF-16's
+          // sort order: lead bytes 0xee/0xef hold BMP chars
+          // >= U+E000, which must sort after the surrogates
+          // that encode supplementary chars in UTF-16
+          if (aByte >= 0xee && bByte >= 0xee) {
+            if ((aByte & 0xfe) == 0xee) {
+              aByte += 0x10;
+            }
+            if ((bByte&0xfe) == 0xee) {
+              bByte += 0x10;
+            }
+          }
+          return aByte - bByte;
+        }
+      }
+
+      // One is a prefix of the other, or, they are equal:
+      return a.length - b.length;
+    }
+  }
+}
Index: src/java/org/apache/lucene/util/RamUsageEstimator.java
===================================================================
--- src/java/org/apache/lucene/util/RamUsageEstimator.java	(revision 895019)
+++ src/java/org/apache/lucene/util/RamUsageEstimator.java	(working copy)
@@ -35,6 +35,15 @@
  * estimate is complete.
  */
 public final class RamUsageEstimator {
+
+  public static int NUM_BYTES_INT = 4;
+  public static int NUM_BYTES_LONG = 8;
+  public static int NUM_BYTES_FLOAT = 4;
+  public static int NUM_BYTES_DOUBLE = 8;
+  public static int NUM_BYTES_OBJ_HEADER = 8;
+  public static int NUM_BYTES_OBJ_REF = Constants.JRE_IS_64BIT ? 8 : 4;
+  public static int NUM_BYTES_ARRAY_HEADER = NUM_BYTES_OBJ_HEADER + NUM_BYTES_INT + NUM_BYTES_OBJ_REF;
+
   private MemoryModel memoryModel;
   private final Map seen;
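[Editor's sketch -- not part of the patch. Why the comparator's 0xee/0xef fixup exists: in raw UTF-8 byte order, supplementary characters (lead bytes 0xf0..0xf4) sort above BMP chars U+E000..U+FFFF (lead bytes 0xee/0xef), but in UTF-16 code-unit order it is the other way around, because supplementary chars are encoded as surrogates in 0xD800..0xDFFF. Bumping 0xee/0xef to 0xfe/0xff restores UTF-16 order:]

    import org.apache.lucene.util.BytesRef;

    class Utf16OrderSketch {
      public static void main(String[] args) {
        String bmp = "\uffff";                                 // UTF-8: ef bf bf
        String supp = new String(Character.toChars(0x10000));  // UTF-8: f0 90 80 80
        assert bmp.compareTo(supp) > 0;   // UTF-16 order: surrogates sort low
        // Raw UTF-8 byte order would say the opposite (0xef < 0xf0);
        // the comparator agrees with String.compareTo instead:
        int cmp = BytesRef.getUTF8SortedAsUTF16Comparator()
            .compare(new BytesRef(bmp), new BytesRef(supp));
        assert cmp > 0;
      }
    }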