Index: src/test/org/apache/lucene/index/TestTermInfosReaderIndex.java
===================================================================
--- src/test/org/apache/lucene/index/TestTermInfosReaderIndex.java (revision 0)
+++ src/test/org/apache/lucene/index/TestTermInfosReaderIndex.java (revision 0)
@@ -0,0 +1,171 @@
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.Version;
+
+/**
+ * Tests {@link TermInfosReaderIndex} against an index that is built in a
+ * {@link RAMDirectory} before every test.
+ */
+public class TestTermInfosReaderIndex extends LuceneTestCase {
+
+  private static final int NUMBER_OF_DOCUMENTS = 1000;
+  private static final int NUMBER_OF_FIELDS = 100;
+  private TermInfosReaderIndex index;
+  private Directory directory;
+  private SegmentTermEnum termEnum;
+  private int indexDivisor = 1;
+  private int termIndexInterval;
+  private int readBufferSize = 1024;
+  private IndexReader reader;
+  private List<Term> sampleTerms;
+
+  /**
+   * Builds the index, loads the terms index file of segment "_0" into a
+   * {@link TermInfosReaderIndex}, opens an enum over the terms file and
+   * collects the sample of terms used by the search test.
+   */
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+
+    directory = new RAMDirectory();
+    termIndexInterval = populate(directory);
+
+    // populate() optimizes the index; the test assumes the segment is named "_0"
+    String segment = "_0";
+
+    FieldInfos fieldInfos = new FieldInfos(directory, IndexFileNames.segmentFileName(segment, IndexFileNames.FIELD_INFOS_EXTENSION));
+    String segmentFileName = IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_INDEX_EXTENSION);
+    long tiiFileLength = directory.fileLength(segmentFileName);
+    IndexInput input = directory.openInput(segmentFileName, readBufferSize);
+    termEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_EXTENSION), readBufferSize), fieldInfos, false);
+    int totalIndexInterval = termEnum.indexInterval * indexDivisor;
+
+    SegmentTermEnum indexEnum = new SegmentTermEnum(input, fieldInfos, true);
+    index = new TermInfosReaderIndex(indexEnum, indexDivisor, tiiFileLength, totalIndexInterval);
+    indexEnum.close();
+    input.close();
+
+    reader = IndexReader.open(directory);
+    sampleTerms = sample(reader,1000);
+
+  }
+
+  /** Closes the enum, reader and directory opened by {@link #setUp()}. */
+  @Override
+  public void tearDown() throws Exception {
+    // release our own resources first, then let the base class tear down
+    termEnum.close();
+    reader.close();
+    directory.close();
+    super.tearDown();
+  }
+
+  /**
+   * Seeks an enum to an index position and checks that it is positioned on the
+   * term a sequential scan of the terms file finds at that position.
+   */
+  public void testSeekEnum() throws CorruptIndexException, IOException {
+    int indexPosition = 3;
+    SegmentTermEnum enumerator = (SegmentTermEnum) termEnum.clone();
+    try {
+      Term term = findTermThatWouldBeAtIndex(enumerator, indexPosition);
+      index.seekEnum(enumerator, indexPosition);
+      assertEquals(term, enumerator.term());
+    } finally {
+      // close only after the last use of the enum
+      enumerator.close();
+    }
+  }
+
+  /**
+   * Checks that the index compares a random term against each of its entries
+   * with the same result as <code>Term.compareTo</code>.
+   */
+  public void testCompareTo() throws IOException {
+    Term term = new Term("field" + random.nextInt(NUMBER_OF_FIELDS) ,getText());
+    BytesRef termBytesRef = new BytesRef(term.text);
+    for (int i = 0; i < index.length(); i++) {
+      Term t = index.getTerm(i);
+      int compareTo = term.compareTo(t);
+      assertEquals(compareTo, index.compareTo(term, termBytesRef, i));
+    }
+  }
+
+  /** Searches for every sampled term and expects at least one hit for each. */
+  public void testRandomSearchPerformance() throws CorruptIndexException, IOException {
+    IndexSearcher searcher = new IndexSearcher(reader);
+    for (Term t : sampleTerms) {
+      TermQuery query = new TermQuery(t);
+      TopDocs topDocs = searcher.search(query, 10);
+      assertTrue(topDocs.totalHits > 0);
+    }
+    searcher.close();
+  }
+
+  private List sample(IndexReader reader, int size) throws IOException {
+    List sample = 
new ArrayList(); + Random random = new Random(); + TermEnum terms = reader.terms(); + while (terms.next()) { + if (sample.size() >= size) { + int pos = random.nextInt(size); + sample.set(pos, terms.term()); + } else { + sample.add(terms.term()); + } + } + terms.close(); + Collections.shuffle(sample); + return sample; + } + + private Term findTermThatWouldBeAtIndex(SegmentTermEnum termEnum, int index) throws IOException { + int termPosition = index * termIndexInterval; + for (int i = 0; i < termPosition; i++) { + if (!termEnum.next()) { + fail("Should not have run out of terms."); + } + } + return termEnum.term(); + } + + private int populate(Directory directory) throws CorruptIndexException, LockObtainFailedException, IOException { + IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, new KeywordAnalyzer()); + IndexWriter writer = new IndexWriter(directory, config); + for (int i = 0; i < NUMBER_OF_DOCUMENTS; i++) { + Document document = new Document(); + for (int f = 0; f < NUMBER_OF_FIELDS; f++) { + document.add(new Field("field" + f,getText(),Store.NO,Index.NOT_ANALYZED_NO_NORMS)); + } + writer.addDocument(document); + } + writer.optimize(); + writer.close(); + return config.getTermIndexInterval(); + } + + private String getText() { + return Long.toString(random.nextLong(),Character.MAX_RADIX); + } +}
Index: src/java/org/apache/lucene/index/TermInfosReaderIndex.java
===================================================================
--- src/java/org/apache/lucene/index/TermInfosReaderIndex.java (revision 0)
+++ src/java/org/apache/lucene/index/TermInfosReaderIndex.java (revision 0)
@@ -0,0 +1,292 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.store.PagedBytesDataInput;
+import org.apache.lucene.store.PagedBytesDataOutput;
+import org.apache.lucene.util.BitUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PagedBytes;
+import org.apache.lucene.util.PagedBytes.Reader;
+import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PackedInts.Writer;
+
+/**
+ * This stores a monotonically increasing set of &lt;Term, TermInfo&gt; pairs in an
+ * index segment. Pairs are accessed either by Term or by ordinal position in
+ * the set. The Terms and TermInfos are serialized into paged byte blocks, and
+ * the start position of each entry is kept in a packed ints structure.
+ */
+class TermInfosReaderIndex {
+
+  private static final int MAX_NUMBER_OF_BITS = 27; // 2^27 bytes = 128 MB block
+  private final Term[] fields;
+  private final int totalIndexInterval;
+  private final Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
+  private final PagedBytesDataInput dataInput;
+  private final PagedBytesDataInput indexInput;
+  private final org.apache.lucene.util.packed.PackedInts.Reader packedInts;
+  private final int indexSize;
+
+  /**
+   * Loads the segment information at segment load time.
+   *
+   * @param indexEnum
+   *          the term enum.
+   * @param indexDivisor
+   *          the index divisor.
+   * @param tiiFileLength
+   *          the size of the tii file, used to approximate the size of the
+   *          buffer.
+   * @param totalIndexInterval
+   *          the total index interval.
+   * @throws IOException
+   *           if reading the enum or writing the in-memory blocks fails.
+   */
+  TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval) throws IOException {
+    this.totalIndexInterval = totalIndexInterval;
+    indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
+    long[] indexToTerms = new long[indexSize];
+    // this is only an initial size, it will be GCed once the build is complete
+    long initialSize = (long) (tiiFileLength * 1.5);
+    PagedBytes dataPagedBytes = new PagedBytes(estimateBits(initialSize));
+    PagedBytesDataOutput dataOutput = new PagedBytesDataOutput(dataPagedBytes);
+
+    String currentField = null;
+    List<String> fieldStrs = new ArrayList<String>();
+    int fieldCounter = -1;
+    long maxPosition = 0;
+    for (int i = 0; indexEnum.next(); i++) {
+      Term term = indexEnum.term();
+      // compared by identity; presumably field names are interned - confirm
+      if (currentField != term.field) {
+        currentField = term.field;
+        fieldStrs.add(currentField);
+        fieldCounter++;
+      }
+      TermInfo termInfo = indexEnum.termInfo();
+      // remember where this entry starts so it can be addressed by ordinal
+      maxPosition = dataOutput.getPosition();
+      indexToTerms[i] = maxPosition;
+      dataOutput.writeVInt(fieldCounter);
+      dataOutput.writeString(term.text());
+      dataOutput.writeVInt(termInfo.docFreq);
+      dataOutput.writeVInt(termInfo.skipOffset);
+      dataOutput.writeVLong(termInfo.freqPointer);
+      dataOutput.writeVLong(termInfo.proxPointer);
+      dataOutput.writeVLong(indexEnum.indexPointer);
+      // skip the entries that are dropped by the index divisor
+      for (int j = 1; j < indexDivisor; j++)
+        if (!indexEnum.next())
+          break;
+    }
+    fields = new Term[fieldStrs.size()];
+    for (int i = 0; i < fields.length; i++) {
+      fields[i] = new Term(fieldStrs.get(i));
+    }
+
+    Reader reader = dataPagedBytes.freeze(true);
+    dataInput = new PagedBytesDataInput(reader);
+
+    // Entry positions only grow, so the position recorded last is the largest
+    // value the packed ints must hold. Size them by the bits that value needs
+    // (at least one), not by its number of leading zeros.
+    int bitsPerValue = Math.max(1, 64 - BitUtil.nlz(maxPosition));
+    PagedBytes indexPagedBytes = new PagedBytes(estimateBits(indexToTerms.length * 8));
+    PagedBytesDataOutput indexOutput = new PagedBytesDataOutput(indexPagedBytes);
+    Writer writer = PackedInts.getWriter(indexOutput, indexToTerms.length, bitsPerValue);
+    for (int i = 0; i < indexToTerms.length; i++) {
+      writer.add(indexToTerms[i]);
+    }
+    writer.finish();
+    indexInput = new PagedBytesDataInput(indexPagedBytes.freeze(true));
+    packedInts = PackedInts.getReader(indexInput);
+  }
+
+  /**
+   * Returns the number of bits needed to represent <code>estSize</code>,
+   * capped at MAX_NUMBER_OF_BITS; used as the block bits of a PagedBytes.
+   */
+  static int estimateBits(long estSize) {
+    return Math.min(64 - BitUtil.nlz(estSize), MAX_NUMBER_OF_BITS);
+  }
+
+  /**
+   * Positions the given enum on the index entry at <code>indexOffset</code>.
+   *
+   * @param enumerator
+   *          the enum to seek.
+   * @param indexOffset
+   *          the ordinal of the index entry to seek to.
+   * @throws IOException
+   *           if reading the entry or seeking the enum fails.
+   */
+  void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
+    // read through a clone so the position of the shared input is not moved
+    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
+
+    input.setPosition(packedInts.get(indexOffset));
+
+    // read the term
+    int fieldId = input.readVInt();
+    Term field = fields[fieldId];
+    Term term = field.createTerm(input.readString());
+
+    // read the terminfo
+    TermInfo termInfo = new TermInfo();
+    termInfo.docFreq = input.readVInt();
+    termInfo.skipOffset = input.readVInt();
+    termInfo.freqPointer = input.readVLong();
+    termInfo.proxPointer = input.readVLong();
+
+    long pointer = input.readVLong();
+
+    // perform the seek
+    enumerator.seek(pointer, ((long) indexOffset * totalIndexInterval) - 1, term, termInfo);
+  }
+
+  /**
+   * Binary search for the given term.
+   *
+   * @param term
+   *          the term to locate.
+   * @param termBytesRef
+   *          the text of <code>term</code> as bytes, used for the comparisons.
+   * @return the ordinal of the entry equal to the term, or else of the
+   *         greatest entry that is less than it (-1 if every entry is greater).
+   * @throws IOException
+   *           if reading from the index fails.
+   */
+  int getIndexOffset(Term term, BytesRef termBytesRef) throws IOException {
+    int lo = 0;
+    int hi = indexSize - 1;
+    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
+    BytesRef bytesRef = new BytesRef();
+    while (hi >= lo) {
+      int mid = (lo + hi) >>> 1;
+      int delta = compareTo(term, termBytesRef, mid, input, bytesRef);
+      if (delta < 0)
+        hi = mid - 1;
+      else if (delta > 0)
+        lo = mid + 1;
+      else
+        return mid;
+    }
+    return hi;
+  }
+
+  /**
+   * Gets the term at the given position. For testing.
+   *
+   * @param termIndex
+   *          the position to read the term from the index.
+   * @return the term.
+   * @throws IOException
+   *           if reading from the index fails.
+   */
+  Term getTerm(int termIndex) throws IOException {
+    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
+    input.setPosition(packedInts.get(termIndex));
+
+    // read the term
+    int fieldId = input.readVInt();
+    Term field = fields[fieldId];
+    return field.createTerm(input.readString());
+  }
+
+  /**
+   * Returns the number of terms held by this index.
+   *
+   * @return int.
+   */
+  int length() {
+    return indexSize;
+  }
+
+  /**
+   * Compares the given term against the term in the index specified by the
+   * term index, i.e. it returns a negative number when term is less than the
+   * index term.
+   *
+   * @param term
+   *          the given term.
+   * @param termBytesRef
+   *          the text of <code>term</code> as bytes.
+   * @param termIndex
+   *          the index of the term to compare.
+   * @return int.
+   * @throws IOException
+   *           if reading from the index fails.
+   */
+  int compareTo(Term term, BytesRef termBytesRef, int termIndex) throws IOException {
+    return compareTo(term, termBytesRef, termIndex, (PagedBytesDataInput) dataInput.clone(), new BytesRef());
+  }
+
+  /**
+   * Compare the fields of the terms first, and if not equal return from
+   * compare. If equal compare terms.
+   *
+   * @param term
+   *          the term to compare.
+   * @param termBytesRef
+   *          the text of <code>term</code> as bytes.
+   * @param termIndex
+   *          the position of the term in the input to compare
+   * @param input
+   *          the input buffer.
+   * @param inputBytesRef
+   *          scratch that is pointed at the bytes of the index term's text.
+   * @return int.
+   * @throws IOException
+   *           if reading from the index fails.
+   */
+  private int compareTo(Term term, BytesRef termBytesRef, int termIndex, PagedBytesDataInput input, BytesRef inputBytesRef) throws IOException {
+    // if term field does not equal mid's field index, then compare fields
+    // else if they are equal, compare term's string values...
+    int c = compareField(term, termIndex, input);
+    if (c == 0) {
+      inputBytesRef.length = input.readVInt();
+      input.fill(inputBytesRef);
+      return comparator.compare(termBytesRef, inputBytesRef);
+    }
+    return c;
+  }
+
+  /**
+   * Compares the fields before checking the text of the terms.
+   *
+   * @param term
+   *          the given term.
+   * @param termIndex
+   *          the term that exists in the data block.
+   * @param input
+   *          the data block.
+   * @return int.
+   * @throws IOException
+   *           if reading from the index fails.
+   */
+  private int compareField(Term term, int termIndex, PagedBytesDataInput input) throws IOException {
+    input.setPosition(packedInts.get(termIndex));
+    return term.field.compareTo(fields[input.readVInt()].field);
+  }
+}
\ No newline at end of file
Index: src/java/org/apache/lucene/index/TermInfosReader.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosReader.java (revision 1177966) +++ src/java/org/apache/lucene/index/TermInfosReader.java (working copy) @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.DoubleBarrelLRUCache; import org.apache.lucene.util.CloseableThreadLocal; @@ -37,9 +38,8 @@ private final SegmentTermEnum origEnum; private final long size; - private final Term[] indexTerms; - private final TermInfo[] indexInfos; - private final long[] indexPointers; + private final TermInfosReaderIndex index; + private final int indexLength; private final int totalIndexInterval; @@ -111,32 +111,17 @@ totalIndexInterval = origEnum.indexInterval * indexDivisor; final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_INDEX_EXTENSION), readBufferSize), fieldInfos, true); - try { - int indexSize = 1+((int)indexEnum.size-1)/indexDivisor; // otherwise read index - - indexTerms = new Term[indexSize]; - indexInfos = new TermInfo[indexSize]; - indexPointers = new long[indexSize]; - - for (int i = 0; indexEnum.next(); i++) { - indexTerms[i] = indexEnum.term(); - indexInfos[i] = indexEnum.termInfo(); - indexPointers[i] = indexEnum.indexPointer; - - for (int j = 1; j < indexDivisor; j++) - if (!indexEnum.next()) - break; - } + index = new TermInfosReaderIndex(indexEnum, indexDivisor, (int) dir.fileLength(segment + "." 
+ IndexFileNames.TERMS_INDEX_EXTENSION), totalIndexInterval); + indexLength = index.length(); } finally { indexEnum.close(); } } else { // Do not load terms index: totalIndexInterval = -1; - indexTerms = null; - indexInfos = null; - indexPointers = null; + index = null; + indexLength = -1; } success = true; } finally { @@ -180,38 +165,14 @@ return resources; } - - /** Returns the offset of the greatest index entry which is less than or equal to term.*/ - private final int getIndexOffset(Term term) { - int lo = 0; // binary search indexTerms[] - int hi = indexTerms.length - 1; - - while (hi >= lo) { - int mid = (lo + hi) >>> 1; - int delta = term.compareTo(indexTerms[mid]); - if (delta < 0) - hi = mid - 1; - else if (delta > 0) - lo = mid + 1; - else - return mid; - } - return hi; - } - - private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException { - enumerator.seek(indexPointers[indexOffset], - ((long) indexOffset * totalIndexInterval) - 1, - indexTerms[indexOffset], indexInfos[indexOffset]); - } - /** Returns the TermInfo for a Term in the set, or null. */ TermInfo get(Term term) throws IOException { - return get(term, false); + BytesRef termBytesRef = new BytesRef(term.text); + return get(term, false, termBytesRef); } /** Returns the TermInfo for a Term in the set, or null. 
*/ - private TermInfo get(Term term, boolean mustSeekEnum) throws IOException { + private TermInfo get(Term term, boolean mustSeekEnum, BytesRef termBytesRef) throws IOException { if (size == 0) return null; ensureIndexIsRead(); @@ -231,8 +192,8 @@ && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0) || term.compareTo(enumerator.term()) >= 0)) { int enumOffset = (int)(enumerator.position/totalIndexInterval)+1; - if (indexTerms.length == enumOffset // but before end of block - || term.compareTo(indexTerms[enumOffset]) < 0) { + if (indexLength == enumOffset // but before end of block + || index.compareTo(term,termBytesRef,enumOffset) < 0) { // no need to seek final TermInfo ti; @@ -267,10 +228,10 @@ indexPos = (int) (tiOrd.termOrd / totalIndexInterval); } else { // Must do binary search: - indexPos = getIndexOffset(term); + indexPos = index.getIndexOffset(term,termBytesRef); } - seekEnum(enumerator, indexPos); + index.seekEnum(enumerator, indexPos); enumerator.scanTo(term); final TermInfo ti; if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { @@ -313,7 +274,7 @@ } private void ensureIndexIsRead() { - if (indexTerms == null) { + if (index == null) { throw new IllegalStateException("terms index was not loaded when this reader was created"); } } @@ -323,10 +284,11 @@ if (size == 0) return -1; ensureIndexIsRead(); - int indexOffset = getIndexOffset(term); + BytesRef termBytesRef = new BytesRef(term.text); + int indexOffset = index.getIndexOffset(term,termBytesRef); SegmentTermEnum enumerator = getThreadResources().termEnum; - seekEnum(enumerator, indexOffset); + index.seekEnum(enumerator, indexOffset); while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {} @@ -343,7 +305,8 @@ /** Returns an enumeration of terms starting at or after the named term. 
*/ public SegmentTermEnum terms(Term term) throws IOException { - get(term, true); + BytesRef termBytesRef = new BytesRef(term.text); + get(term, true, termBytesRef); return (SegmentTermEnum)getThreadResources().termEnum.clone(); } }
Index: src/java/org/apache/lucene/store/PagedBytesDataInput.java
===================================================================
--- src/java/org/apache/lucene/store/PagedBytesDataInput.java (revision 0)
+++ src/java/org/apache/lucene/store/PagedBytesDataInput.java (revision 0)
@@ -0,0 +1,91 @@
+package org.apache.lucene.store;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PagedBytes;
+import org.apache.lucene.util.PagedBytes.Reader;
+
+/**
+ * A {@link DataInput} over the blocks of a {@link PagedBytes.Reader} whose read
+ * position can be moved freely.
+ */
+public final class PagedBytesDataInput extends DataInput {
+
+  private final Reader reader;
+  private final long length;
+  private long pos;
+  private BytesRef ref;
+
+  /** Wraps the reader; the length is the sum of the lengths of its blocks. */
+  public PagedBytesDataInput(PagedBytes.Reader reader) {
+    this.reader = reader;
+    this.ref = new BytesRef();
+    long total = 0L;
+    for (byte[] block : reader.getBlocks()) {
+      total += block.length;
+    }
+    this.length = total;
+  }
+
+  /** Clones this input, giving the clone a scratch {@link BytesRef} of its own. */
+  @Override
+  public Object clone() {
+    final PagedBytesDataInput copy = (PagedBytesDataInput) super.clone();
+    copy.ref = new BytesRef();
+    return copy;
+  }
+
+  /** Returns the total number of bytes in the underlying blocks. */
+  public long getLength() {
+    return length;
+  }
+
+  /** Returns the position of the next byte to read. */
+  public long getPosition() {
+    return pos;
+  }
+
+  /** Moves the read position. */
+  public void setPosition(long pos) {
+    this.pos = pos;
+  }
+
+  @Override
+  public byte readByte() {
+    ref = reader.fillSlice(ref, pos, 1);
+    final byte value = ref.bytes[ref.offset];
+    pos += 1;
+    return value;
+  }
+
+  @Override
+  public void readBytes(byte[] b, int offset, int len) {
+    final BytesRef slice = reader.fillSlice(ref, pos, len);
+    System.arraycopy(slice.bytes, slice.offset, b, offset, len);
+    pos += len;
+  }
+
+  /**
+   * Points <code>ref</code> at the <code>ref.length</code> bytes found at the
+   * current position. The position is not advanced.
+   */
+  public void fill(BytesRef ref) {
+    reader.fillSlice(ref, pos, ref.length);
+  }
+}
Index: src/java/org/apache/lucene/store/PagedBytesDataOutput.java
===================================================================
--- src/java/org/apache/lucene/store/PagedBytesDataOutput.java (revision 0)
+++ src/java/org/apache/lucene/store/PagedBytesDataOutput.java (revision 0)
@@ -0,0 +1,69 @@
+package org.apache.lucene.store;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PagedBytes;
+
+/**
+ * A {@link DataOutput} that appends everything written to it to a
+ * {@link PagedBytes} and keeps count of the bytes written.
+ */
+public class PagedBytesDataOutput extends DataOutput {
+  private long pos;
+  private PagedBytes pagedBytes;
+  private BytesRef ref = new BytesRef();
+  private byte[] buf = new byte[1];
+
+  /** Creates an output that appends to the given paged bytes. */
+  public PagedBytesDataOutput(PagedBytes pagedBytes) {
+    this.pagedBytes = pagedBytes;
+  }
+
+  /** Returns the number of bytes written through this output so far. */
+  public long getPosition() {
+    return pos;
+  }
+
+  @Override
+  public void writeByte(byte b) throws IOException {
+    buf[0] = b;
+    append(buf, 0, 1);
+  }
+
+  @Override
+  public void writeBytes(byte[] b, int offset, int length) throws IOException {
+    append(b, offset, length);
+  }
+
+  /** Returns the paged bytes this output appends to. */
+  public PagedBytes getPagedBytes() {
+    return pagedBytes;
+  }
+
+  /** Copies the given range into the paged bytes and advances the count. */
+  private void append(byte[] source, int start, int count) throws IOException {
+    ref.bytes = source;
+    ref.offset = start;
+    ref.length = count;
+    pagedBytes.copy(ref);
+    pos += count;
+  }
+}
Index: src/java/org/apache/lucene/store/ByteArrayDataInput.java =================================================================== --- src/java/org/apache/lucene/store/ByteArrayDataInput.java (revision 1177966) +++ src/java/org/apache/lucene/store/ByteArrayDataInput.java (working copy) @@ -46,6 +46,10 @@ public int getPosition() { return pos; } + + public void setPosition(int pos) { + this.pos = pos; + } public void reset(byte[] bytes, int offset, int len) { this.bytes = bytes; Index: 
src/java/org/apache/lucene/util/PagedBytes.java =================================================================== --- src/java/org/apache/lucene/util/PagedBytes.java (revision 0) +++ src/java/org/apache/lucene/util/PagedBytes.java (revision 0) @@ -0,0 +1,392 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexInput; + +import java.util.List; +import java.util.ArrayList; +import java.io.Closeable; +import java.io.IOException; + +/** Represents a logical byte[] as a series of pages. You + * can write-once into the logical byte[] (append only), + * using copy, and then retrieve slices (BytesRef) into it + * using fill. 
+ * + * @lucene.internal + **/ +public final class PagedBytes { + private final List blocks = new ArrayList(); + private final List blockEnd = new ArrayList(); + private final int blockSize; + private final int blockBits; + private final int blockMask; + private int upto; + private byte[] currentBlock; + + private static final byte[] EMPTY_BYTES = new byte[0]; + + public final static class Reader implements Closeable { + private final byte[][] blocks; + private final int[] blockEnds; + private final int blockBits; + private final int blockMask; + private final int blockSize; + private final CloseableThreadLocal threadBuffers = new CloseableThreadLocal(); + + public Reader(PagedBytes pagedBytes) { + blocks = new byte[pagedBytes.blocks.size()][]; + for(int i=0;istart with a + * given length. Iff the slice spans across a block border this method will + * allocate sufficient resources and copy the paged data. + *

+ * Slices spanning more than one block are not supported. + *

+ * @lucene.internal + **/ + public BytesRef fillSlice(BytesRef b, long start, int length) { + assert length >= 0: "length=" + length; + final int index = (int) (start >> blockBits); + final int offset = (int) (start & blockMask); + b.length = length; + if (blockSize - offset >= length) { + // Within block + b.bytes = blocks[index]; + b.offset = offset; + } else { + // Split + byte[] buffer = threadBuffers.get(); + if (buffer == null) { + buffer = new byte[length]; + threadBuffers.set(buffer); + } else if (buffer.length < length) { + buffer = ArrayUtil.grow(buffer, length); + threadBuffers.set(buffer); + } + b.bytes = buffer; + b.offset = 0; + System.arraycopy(blocks[index], offset, buffer, 0, blockSize-offset); + System.arraycopy(blocks[1+index], 0, buffer, blockSize-offset, length-(blockSize-offset)); + } + return b; + } + + /** + * Reads length as 1 or 2 byte vInt prefix, starting at start. + *

+ * Note: this method does not support slices spanning across block + * borders. + *

+ * + * @return the given {@link BytesRef} + * + * @lucene.internal + **/ + public BytesRef fill(BytesRef b, long start) { + final int index = (int) (start >> blockBits); + final int offset = (int) (start & blockMask); + final byte[] block = b.bytes = blocks[index]; + + if ((block[offset] & 128) == 0) { + b.length = block[offset]; + b.offset = offset+1; + } else { + b.length = ((block[offset] & 0x7f) << 8) | (block[1+offset] & 0xff); + b.offset = offset+2; + assert b.length > 0; + } + return b; + } + + /** + * Reads length as 1 or 2 byte vInt prefix, starting at start. * + *

+ * Note: this method does not support slices spanning across block + * borders. + *

+ * + * @return the internal block number of the slice. + * @lucene.internal + **/ + public int fillAndGetIndex(BytesRef b, long start) { + final int index = (int) (start >> blockBits); + final int offset = (int) (start & blockMask); + final byte[] block = b.bytes = blocks[index]; + + if ((block[offset] & 128) == 0) { + b.length = block[offset]; + b.offset = offset+1; + } else { + b.length = ((block[offset] & 0x7f) << 8) | (block[1+offset] & 0xff); + b.offset = offset+2; + assert b.length > 0; + } + return index; + } + + /** + * Reads length as 1 or 2 byte vInt prefix, starting at start and + * returns the start offset of the next part, suitable as start parameter on + * next call to sequentially read all {@link BytesRef}. + * + *

+ * Note: this method does not support slices spanning across block + * borders. + *

+ * + * @return the start offset of the next part, suitable as start parameter on + * next call to sequentially read all {@link BytesRef}. + * @lucene.internal + **/ + public long fillAndGetStart(BytesRef b, long start) { + final int index = (int) (start >> blockBits); + final int offset = (int) (start & blockMask); + final byte[] block = b.bytes = blocks[index]; + + if ((block[offset] & 128) == 0) { + b.length = block[offset]; + b.offset = offset+1; + start += 1L + b.length; + } else { + b.length = ((block[offset] & 0x7f) << 8) | (block[1+offset] & 0xff); + b.offset = offset+2; + start += 2L + b.length; + assert b.length > 0; + } + return start; + } + + + /** + * Gets a slice out of {@link PagedBytes} starting at start, the + * length is read as 1 or 2 byte vInt prefix. Iff the slice spans across a + * block border this method will allocate sufficient resources and copy the + * paged data. + *

+ * Slices spanning more than one block are not supported. + *

+ * + * @lucene.internal + **/ + public BytesRef fillSliceWithPrefix(BytesRef b, long start) { + final int index = (int) (start >> blockBits); + int offset = (int) (start & blockMask); + final byte[] block = blocks[index]; + final int length; + if ((block[offset] & 128) == 0) { + length = block[offset]; + offset = offset+1; + } else { + length = ((block[offset] & 0x7f) << 8) | (block[1+offset] & 0xff); + offset = offset+2; + assert length > 0; + } + assert length >= 0: "length=" + length; + b.length = length; + if (blockSize - offset >= length) { + // Within block + b.offset = offset; + b.bytes = blocks[index]; + } else { + // Split + byte[] buffer = threadBuffers.get(); + if (buffer == null) { + buffer = new byte[length]; + threadBuffers.set(buffer); + } else if (buffer.length < length) { + buffer = ArrayUtil.grow(buffer, length); + threadBuffers.set(buffer); + } + b.bytes = buffer; + b.offset = 0; + System.arraycopy(blocks[index], offset, buffer, 0, blockSize-offset); + System.arraycopy(blocks[1+index], 0, buffer, blockSize-offset, length-(blockSize-offset)); + } + return b; + } + + /** @lucene.internal */ + public byte[][] getBlocks() { + return blocks; + } + + /** @lucene.internal */ + public int[] getBlockEnds() { + return blockEnds; + } + + public void close() { + threadBuffers.close(); + } + } + + /** 1< 0) { + int left = blockSize - upto; + if (left == 0) { + if (currentBlock != null) { + blocks.add(currentBlock); + blockEnd.add(upto); + } + currentBlock = new byte[blockSize]; + upto = 0; + left = blockSize; + } + if (left < byteCount) { + in.readBytes(currentBlock, upto, left, false); + upto = blockSize; + byteCount -= left; + } else { + in.readBytes(currentBlock, upto, (int) byteCount, false); + upto += byteCount; + break; + } + } + } + + /** Copy BytesRef in */ + public void copy(BytesRef bytes) throws IOException { + int byteCount = bytes.length; + int bytesUpto = bytes.offset; + while (byteCount > 0) { + int left = blockSize - upto; + if (left == 0) { 
+ if (currentBlock != null) { + blocks.add(currentBlock); + blockEnd.add(upto); + } + currentBlock = new byte[blockSize]; + upto = 0; + left = blockSize; + } + if (left < byteCount) { + System.arraycopy(bytes.bytes, bytesUpto, currentBlock, upto, left); + upto = blockSize; + byteCount -= left; + bytesUpto += left; + } else { + System.arraycopy(bytes.bytes, bytesUpto, currentBlock, upto, byteCount); + upto += byteCount; + break; + } + } + } + + /** Copy BytesRef in, setting BytesRef out to the result. + * Do not use this if you will use freeze(true). + * This only supports bytes.length <= blockSize */ + public void copy(BytesRef bytes, BytesRef out) throws IOException { + int left = blockSize - upto; + if (bytes.length > left || currentBlock==null) { + if (currentBlock != null) { + blocks.add(currentBlock); + blockEnd.add(upto); + } + currentBlock = new byte[blockSize]; + upto = 0; + left = blockSize; + assert bytes.length <= blockSize; + // TODO: we could also support variable block sizes + } + + out.bytes = currentBlock; + out.offset = upto; + out.length = bytes.length; + + System.arraycopy(bytes.bytes, bytes.offset, currentBlock, upto, bytes.length); + upto += bytes.length; + } + + /** Commits final byte[], trimming it if necessary and if trim=true */ + public Reader freeze(boolean trim) { + if (trim && upto < blockSize) { + final byte[] newBlock = new byte[upto]; + System.arraycopy(currentBlock, 0, newBlock, 0, upto); + currentBlock = newBlock; + } + if (currentBlock == null) { + currentBlock = EMPTY_BYTES; + } + blocks.add(currentBlock); + blockEnd.add(upto); + currentBlock = null; + return new Reader(this); + } + + public long getPointer() { + if (currentBlock == null) { + return 0; + } else { + return (blocks.size() * ((long) blockSize)) + upto; + } + } + + /** Copy bytes in, writing the length as a 1 or 2 byte + * vInt prefix. 
+   *
+   *  Lengths below 128 take one prefix byte. Larger lengths take two bytes:
+   *  the first has its high bit set and carries the upper 7 bits of the
+   *  length, the second carries the lower 8 bits. The prefix and the bytes
+   *  are always written into the same block; a new block is started when
+   *  they do not fit into the space left in the current one.
+   *
+   *  @param bytes the slice to append; bytes.length must be at most 32767
+   *         and bytes.length + 2 must not exceed the block size
+   *  @return the value of getPointer() taken just before the prefix is
+   *          written, i.e. the position of the length prefix
+   *  @throws IllegalArgumentException if bytes.length is larger than 32767
+   *          or bytes.length + 2 is larger than the block size */
+  public long copyUsingLengthPrefix(BytesRef bytes) throws IOException {
+
+    // The two byte form only has 15 bits for the length. Without this check
+    // a longer slice would be stored with a truncated prefix and could not
+    // be read back correctly.
+    if (bytes.length >= 32768) {
+      throw new IllegalArgumentException("max length is 32767 (got " + bytes.length + ")");
+    }
+
+    // Prefix (at most 2 bytes) plus payload must not straddle two blocks.
+    if (upto + bytes.length + 2 > blockSize) {
+      if (bytes.length + 2 > blockSize) {
+        throw new IllegalArgumentException("block size " + blockSize + " is too small to store length " + bytes.length + " bytes");
+      }
+      if (currentBlock != null) {
+        blocks.add(currentBlock);
+        blockEnd.add(upto);
+      }
+      currentBlock = new byte[blockSize];
+      upto = 0;
+    }
+
+    // Remember where the prefix starts; this is what the caller gets back.
+    final long pointer = getPointer();
+
+    if (bytes.length < 128) {
+      currentBlock[upto++] = (byte) bytes.length;
+    } else {
+      currentBlock[upto++] = (byte) (0x80 | (bytes.length >> 8));
+      currentBlock[upto++] = (byte) (bytes.length & 0xff);
+    }
+    System.arraycopy(bytes.bytes, bytes.offset, currentBlock, upto, bytes.length);
+    upto += bytes.length;
+
+    return pointer;
+  }
+}
Index: src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java
===================================================================
--- src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java (revision 0)
+++ src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java (revision 0)
@@ -0,0 +1,64 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.DataOutput;
+
+/**
+ * A growable byte array data output: bytes are appended to an in-memory
+ * array that is replaced by a larger one whenever a write does not fit.
+ */
+public class GrowableByteArrayDataOutput extends DataOutput {
+  /** Backing array; replaced by a larger array when a write does not fit. */
+  private byte[] bytes;
+  /** Index in the backing array at which the next byte is written. */
+  private int pos;
+  /** Cached length of the backing array. */
+  private int limit;
+
+  /**
+   * Creates an output that starts writing at index 0 of the given array.
+   * The array is used directly (it is not copied) until it has to grow.
+   *
+   * @param bytes the initial backing array
+   */
+  public GrowableByteArrayDataOutput(byte[] bytes) {
+    this.bytes = bytes;
+    this.limit = bytes.length;
+  }
+
+  /**
+   * Returns the current backing array. Only the first getPosition() bytes
+   * have been written, and a later write that grows the array replaces it,
+   * so the returned reference may become stale.
+   */
+  public byte[] getBytes() {
+    return this.bytes;
+  }
+
+  /** Returns the number of bytes written so far. */
+  public int getPosition() {
+    return pos;
+  }
+
+  @Override
+  public void writeByte(byte b) {
+    if (pos >= limit) {
+      growArray(pos + 1);
+    }
+    bytes[pos++] = b;
+  }
+
+  @Override
+  public void writeBytes(byte[] b, int offset, int length) {
+    // Ask for the size this write actually needs. A size-agnostic grow step
+    // is not checked against pos + length, so a large write could still
+    // overflow the new array in the arraycopy below.
+    if (pos + length > limit) {
+      growArray(pos + length);
+    }
+    System.arraycopy(b, offset, bytes, pos, length);
+    pos += length;
+  }
+
+  /**
+   * Replaces the backing array by one holding at least minSize bytes and
+   * refreshes the cached limit.
+   */
+  private void growArray(int minSize) {
+    bytes = ArrayUtil.grow(bytes, minSize);
+    limit = bytes.length;
+  }
+}
\ No newline at end of file
Index: src/java/org/apache/lucene/util/packed/Direct32.java
===================================================================
--- src/java/org/apache/lucene/util/packed/Direct32.java (revision 0)
+++ src/java/org/apache/lucene/util/packed/Direct32.java (revision 0)
@@ -0,0 +1,87 @@
+package org.apache.lucene.util.packed;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Direct wrapping of 32 bit values to a backing array of ints. + * @lucene.internal + */ + +public class Direct32 extends PackedInts.ReaderImpl + implements PackedInts.Mutable { + private int[] values; + private static final int BITS_PER_VALUE = 32; + + public Direct32(int valueCount) { + super(valueCount, BITS_PER_VALUE); + values = new int[valueCount]; + } + + public Direct32(DataInput in, int valueCount) throws IOException { + super(valueCount, BITS_PER_VALUE); + int[] values = new int[valueCount]; + for(int i=0;i

+ * Note: The values are used directly, so changes to the given values will + * affect the structure. + * @param values used as the internal backing array. + */ + public Direct32(int[] values) { + super(values.length, BITS_PER_VALUE); + this.values = values; + } + + public int[] getArray() { + return values; + } + + public long get(final int index) { + return 0xFFFFFFFFL & values[index]; + } + + public void set(final int index, final long value) { + values[index] = (int)(value & 0xFFFFFFFF); + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + values.length * RamUsageEstimator.NUM_BYTES_INT; + } + + public void clear() { + Arrays.fill(values, 0); + } +} Index: src/java/org/apache/lucene/util/packed/GrowableWriter.java =================================================================== --- src/java/org/apache/lucene/util/packed/GrowableWriter.java (revision 0) +++ src/java/org/apache/lucene/util/packed/GrowableWriter.java (revision 0) @@ -0,0 +1,93 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Implements {@link PackedInts.Mutable}, but grows the + * bit count of the underlying packed ints on-demand. + * + *

@lucene.internal

+ */ + +public class GrowableWriter implements PackedInts.Mutable { + + private long currentMaxValue; + private PackedInts.Mutable current; + private final boolean roundFixedSize; + + public GrowableWriter(int startBitsPerValue, int valueCount, boolean roundFixedSize) { + this.roundFixedSize = roundFixedSize; + current = PackedInts.getMutable(valueCount, getSize(startBitsPerValue)); + currentMaxValue = PackedInts.maxValue(current.getBitsPerValue()); + } + + private final int getSize(int bpv) { + if (roundFixedSize) { + return PackedInts.getNextFixedSize(bpv); + } else { + return bpv; + } + } + + public long get(int index) { + return current.get(index); + } + + public int size() { + return current.size(); + } + + public int getBitsPerValue() { + return current.getBitsPerValue(); + } + + public PackedInts.Mutable getMutable() { + return current; + } + + public void set(int index, long value) { + if (value >= currentMaxValue) { + int bpv = getBitsPerValue(); + while(currentMaxValue <= value && currentMaxValue != Long.MAX_VALUE) { + bpv++; + currentMaxValue *= 2; + } + final int valueCount = size(); + PackedInts.Mutable next = PackedInts.getMutable(valueCount, getSize(bpv)); + for(int i=0;i

+ * Note: The values are used directly, so changes to the values will + * affect the structure. + * @param values used as the internal backing array. + */ + public Direct16(short[] values) { + super(values.length, BITS_PER_VALUE); + this.values = values; + } + + public short[] getArray() { + return values; + } + + public long get(final int index) { + return 0xFFFFL & values[index]; + } + + public void set(final int index, final long value) { + values[index] = (short)(value & 0xFFFF); + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + values.length * RamUsageEstimator.NUM_BYTES_SHORT; + } + + public void clear() { + Arrays.fill(values, (short)0); + } +} Index: src/java/org/apache/lucene/util/packed/Packed64.java =================================================================== --- src/java/org/apache/lucene/util/packed/Packed64.java (revision 0) +++ src/java/org/apache/lucene/util/packed/Packed64.java (revision 0) @@ -0,0 +1,217 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Space optimized random access capable array of values with a fixed number of + * bits. For 32 bits/value and less, performance on 32 bit machines is not + * optimal. Consider using {@link Packed32} for such a setup. + *

+ * The implementation strives to avoid conditionals and expensive operations, + * sacrificing code clarity to achieve better performance. + */ + +class Packed64 extends PackedInts.ReaderImpl implements PackedInts.Mutable { + static final int BLOCK_SIZE = 64; // 32 = int, 64 = long + static final int BLOCK_BITS = 6; // The #bits representing BLOCK_SIZE + static final int MOD_MASK = BLOCK_SIZE - 1; // x % BLOCK_SIZE + + private static final int ENTRY_SIZE = BLOCK_SIZE + 1; + private static final int FAC_BITPOS = 3; + + /* + * In order to make an efficient value-getter, conditionals should be + * avoided. A value can be positioned inside of a block, requiring shifting + * left or right or it can span two blocks, requiring a left-shift on the + * first block and a right-shift on the right block. + *

+ * By always shifting the first block both left and right, we get exactly + * the right bits. By always shifting the second block right and applying + * a mask, we get the right bits there. After that, we | the two bitsets. + */ + private static final int[][] SHIFTS = + new int[ENTRY_SIZE][ENTRY_SIZE * FAC_BITPOS]; + //new int[BLOCK_SIZE+1][BLOCK_SIZE][BLOCK_SIZE+1]; + private static final long[][] MASKS = new long[ENTRY_SIZE][ENTRY_SIZE]; + + static { // Generate shifts + for (int elementBits = 1 ; elementBits <= BLOCK_SIZE ; elementBits++) { + for (int bitPos = 0 ; bitPos < BLOCK_SIZE ; bitPos++) { + int[] currentShifts = SHIFTS[elementBits]; + int base = bitPos * FAC_BITPOS; + currentShifts[base ] = bitPos; + currentShifts[base + 1] = BLOCK_SIZE - elementBits; + if (bitPos <= BLOCK_SIZE - elementBits) { // Single block + currentShifts[base + 2] = 0; + MASKS[elementBits][bitPos] = 0; + } else { // Two blocks + int rBits = elementBits - (BLOCK_SIZE - bitPos); + currentShifts[base + 2] = BLOCK_SIZE - rBits; + MASKS[elementBits][bitPos] = ~(~0L << rBits); + } + } + } + } + + /* + * The setter requires more masking than the getter. + */ + private static final long[][] WRITE_MASKS = + new long[ENTRY_SIZE][ENTRY_SIZE * FAC_BITPOS]; + static { + for (int elementBits = 1 ; elementBits <= BLOCK_SIZE ; elementBits++) { + long elementPosMask = ~(~0L << elementBits); + int[] currentShifts = SHIFTS[elementBits]; + long[] currentMasks = WRITE_MASKS[elementBits]; + for (int bitPos = 0 ; bitPos < BLOCK_SIZE ; bitPos++) { + int base = bitPos * FAC_BITPOS; + currentMasks[base ] =~((elementPosMask + << currentShifts[base + 1]) + >>> currentShifts[base]); + if (bitPos <= BLOCK_SIZE - elementBits) { // Second block not used + currentMasks[base+1] = ~0; // Keep all bits + currentMasks[base+2] = 0; // Or with 0 + } else { + currentMasks[base+1] = ~(elementPosMask + << currentShifts[base + 2]); + currentMasks[base+2] = currentShifts[base + 2] == 0 ? 
0 : ~0; + } + } + } + } + + /* The bits */ + private long[] blocks; + + // Cached calculations + private int maxPos; // blocks.length * BLOCK_SIZE / elementBits - 1 + private int[] shifts; // The shifts for the current elementBits + private long[] readMasks; + private long[] writeMasks; + + /** + * Creates an array with the internal structures adjusted for the given + * limits and initialized to 0. + * @param valueCount the number of elements. + * @param bitsPerValue the number of bits available for any given value. + */ + public Packed64(int valueCount, int bitsPerValue) { + // TODO: Test for edge-cases (2^31 values, 63 bitsPerValue) + // +2 due to the avoid-conditionals-trick. The last entry is always 0 + this(new long[(int)((long)valueCount * bitsPerValue / BLOCK_SIZE + 2)], + valueCount, bitsPerValue); + } + + + /** + * Creates an array backed by the given blocks. + *

+ * Note: The blocks are used directly, so changes to the given block will + * affect the Packed64-structure. + * @param blocks used as the internal backing array. Note that the last + * element cannot be addressed directly. + * @param valueCount the number of values. + * @param bitsPerValue the number of bits available for any given value. + */ + public Packed64(long[] blocks, int valueCount, int bitsPerValue) { + super(valueCount, bitsPerValue); + this.blocks = blocks; + updateCached(); + } + + /** + * Creates an array with content retrieved from the given DataInput. + * @param in a DataInput, positioned at the start of Packed64-content. + * @param valueCount the number of elements. + * @param bitsPerValue the number of bits available for any given value. + * @throws java.io.IOException if the values for the backing array could not + * be retrieved. + */ + public Packed64(DataInput in, int valueCount, int bitsPerValue) + throws IOException { + super(valueCount, bitsPerValue); + int size = size(valueCount, bitsPerValue); + blocks = new long[size+1]; // +1 due to non-conditional tricks + // TODO: find a faster way to bulk-read longs... 
+ for(int i=0;i>> BLOCK_BITS); // / BLOCK_SIZE + final int bitPos = (int)(majorBitPos & MOD_MASK); // % BLOCK_SIZE); + + final int base = bitPos * FAC_BITPOS; + assert elementPos < blocks.length : "elementPos: " + elementPos + "; blocks.len: " + blocks.length; + return ((blocks[elementPos] << shifts[base]) >>> shifts[base+1]) | + ((blocks[elementPos+1] >>> shifts[base+2]) & readMasks[bitPos]); + } + + public void set(final int index, final long value) { + final long majorBitPos = (long)index * bitsPerValue; + final int elementPos = (int)(majorBitPos >>> BLOCK_BITS); // / BLOCK_SIZE + final int bitPos = (int)(majorBitPos & MOD_MASK); // % BLOCK_SIZE); + final int base = bitPos * FAC_BITPOS; + + blocks[elementPos ] = (blocks[elementPos ] & writeMasks[base]) + | (value << shifts[base + 1] >>> shifts[base]); + blocks[elementPos+1] = (blocks[elementPos+1] & writeMasks[base+1]) + | ((value << shifts[base + 2]) & writeMasks[base+2]); + } + + @Override + public String toString() { + return "Packed64(bitsPerValue=" + bitsPerValue + ", size=" + + size() + ", maxPos=" + maxPos + + ", elements.length=" + blocks.length + ")"; + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + blocks.length * RamUsageEstimator.NUM_BYTES_LONG; + } + + public void clear() { + Arrays.fill(blocks, 0L); + } +} Index: src/java/org/apache/lucene/util/packed/Direct8.java =================================================================== --- src/java/org/apache/lucene/util/packed/Direct8.java (revision 0) +++ src/java/org/apache/lucene/util/packed/Direct8.java (revision 0) @@ -0,0 +1,91 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Direct wrapping of 8 bit values to a backing array of bytes. + * @lucene.internal + */ + +public class Direct8 extends PackedInts.ReaderImpl + implements PackedInts.Mutable { + private byte[] values; + private static final int BITS_PER_VALUE = 8; + + public Direct8(int valueCount) { + super(valueCount, BITS_PER_VALUE); + values = new byte[valueCount]; + } + + public Direct8(DataInput in, int valueCount) + throws IOException { + super(valueCount, BITS_PER_VALUE); + byte[] values = new byte[valueCount]; + for(int i=0;i

+ * Note: The values are used directly, so changes to the given values will + * affect the structure. + * @param values used as the internal backing array. + */ + public Direct8(byte[] values) { + super(values.length, BITS_PER_VALUE); + this.values = values; + } + + public byte[] getArray() { + return values; + } + + public long get(final int index) { + return 0xFFL & values[index]; + } + + public void set(final int index, final long value) { + values[index] = (byte)(value & 0xFF); + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + values.length; + } + + public void clear() { + Arrays.fill(values, (byte)0); + } +} Index: src/java/org/apache/lucene/util/packed/Direct64.java =================================================================== --- src/java/org/apache/lucene/util/packed/Direct64.java (revision 0) +++ src/java/org/apache/lucene/util/packed/Direct64.java (revision 0) @@ -0,0 +1,83 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Direct wrapping of 64 bit values to a backing array of longs. + * @lucene.internal + */ + +public class Direct64 extends PackedInts.ReaderImpl + implements PackedInts.Mutable { + private long[] values; + private static final int BITS_PER_VALUE = 64; + + public Direct64(int valueCount) { + super(valueCount, BITS_PER_VALUE); + values = new long[valueCount]; + } + + public Direct64(DataInput in, int valueCount) throws IOException { + super(valueCount, BITS_PER_VALUE); + long[] values = new long[valueCount]; + for(int i=0;i

+ * Note: The values are used directly, so changes to the given values will + * affect the structure. + * @param values used as the internal backing array. + */ + public Direct64(long[] values) { + super(values.length, BITS_PER_VALUE); + this.values = values; + } + + public long get(final int index) { + return values[index]; + } + + public void set(final int index, final long value) { + values[index] = value; + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + values.length * RamUsageEstimator.NUM_BYTES_LONG; + } + + public void clear() { + Arrays.fill(values, 0L); + } +} Index: src/java/org/apache/lucene/util/packed/PackedReaderIterator.java =================================================================== --- src/java/org/apache/lucene/util/packed/PackedReaderIterator.java (revision 0) +++ src/java/org/apache/lucene/util/packed/PackedReaderIterator.java (revision 0) @@ -0,0 +1,111 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+import org.apache.lucene.store.IndexInput;
+
+import java.io.IOException;
+
+/**
+ * Forward-only decoder for packed values: it pulls 64 bit blocks from an
+ * IndexInput with readLong() and hands out bitsPerValue bits at a time,
+ * starting at the most significant bits of each block.
+ */
+final class PackedReaderIterator implements PackedInts.ReaderIterator {
+  /** The 64 bit block most recently read from the input. */
+  private long pending;
+  /** How many low bits of pending have not been handed out yet. */
+  private int pendingBitsLeft;
+  private final IndexInput in;
+  private final int bitsPerValue;
+  private final int valueCount;
+  /** Ordinal of the value returned last; -1 until a value has been read. */
+  private int position = -1;
+
+  // masks[n-1] keeps the bottom n bits
+  private final long[] masks;
+
+  /**
+   * @param bitsPerValue number of bits each value occupies
+   * @param valueCount number of values, as reported by size()
+   * @param in input the 64 bit blocks are read from
+   */
+  public PackedReaderIterator(int bitsPerValue, int valueCount, IndexInput in)
+    throws IOException {
+    this.bitsPerValue = bitsPerValue;
+    this.valueCount = valueCount;
+    this.in = in;
+
+    // lowBits[n-1] == 2^n - 1; the last doubling for n == 64 wraps to 0,
+    // which leaves an all-ones mask.
+    final long[] lowBits = new long[bitsPerValue];
+    long power = 1;
+    for (int n = 0; n < bitsPerValue; n++) {
+      power <<= 1;
+      lowBits[n] = power - 1;
+    }
+    masks = lowBits;
+  }
+
+  public int getBitsPerValue() {
+    return bitsPerValue;
+  }
+
+  public int size() {
+    return valueCount;
+  }
+
+  /** Decodes the next value, reading another block from the input if needed. */
+  public long next() throws IOException {
+    if (pendingBitsLeft == 0) {
+      pending = in.readLong();
+      pendingBitsLeft = 64;
+    }
+
+    final long value;
+    if (bitsPerValue > pendingBitsLeft) {
+      // split: the value starts in the current block and ends in the next one
+      final int missing = bitsPerValue - pendingBitsLeft;
+      final long high = (pending & masks[pendingBitsLeft - 1]) << missing;
+      pending = in.readLong();
+      final long low = (pending >> (64 - missing)) & masks[missing - 1];
+      pendingBitsLeft = 64 + pendingBitsLeft - bitsPerValue;
+      value = high | low;
+    } else {
+      // not split: the whole value sits in the current block
+      final int shift = pendingBitsLeft - bitsPerValue;
+      value = (pending >> shift) & masks[bitsPerValue - 1];
+      pendingBitsLeft = shift;
+    }
+
+    position++;
+    return value;
+  }
+
+  /** Closes the underlying input. */
+  public void close() throws IOException {
+    in.close();
+  }
+
+  public int ord() {
+    return position;
+  }
+
+  /**
+   * Skips forward to the value with the given ordinal and returns it.
+   * Blocks that are skipped completely are jumped over with a seek
+   * instead of being read.
+   */
+  public long advance(final int ord) throws IOException{
+    assert ord < valueCount : "ord must be less than valueCount";
+    assert ord > position : "ord must be greater than the current position";
+    final int valuesToSkip = ord - 1 - position;
+    final long bitsToSkip = ((long) bitsPerValue) * (long) valuesToSkip;
+    if (bitsToSkip >= pendingBitsLeft) {
+      // the target lies beyond the current block
+      final long beyondBlock = bitsToSkip - pendingBitsLeft;
+      final long wholeBlockBytes = (beyondBlock >> 6) << 3; // 8 bytes per skipped block
+      if (wholeBlockBytes != 0) { // need to seek
+        final long here = in.getFilePointer();
+        in.seek(here + wholeBlockBytes);
+      }
+      pending = in.readLong();
+      pendingBitsLeft = 64 - (int) (beyondBlock % 64);
+    } else {
+      // enough bits left - no seek required
+      pendingBitsLeft -= bitsToSkip;
+    }
+    position = ord - 1;
+    return next();
+  }
+}
Index: src/java/org/apache/lucene/util/packed/PackedWriter.java
===================================================================
--- src/java/org/apache/lucene/util/packed/PackedWriter.java (revision 0)
+++ src/java/org/apache/lucene/util/packed/PackedWriter.java (revision 0)
@@ -0,0 +1,114 @@
+package org.apache.lucene.util.packed;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.DataOutput;
+
+import java.io.IOException;
+
+// Packs high order byte first, to match
+// IndexOutput.writeInt/Long/Short byte order
+
+/**
+ * Generic writer for space-optimal packed values. The resulting bits can be
+ * used directly by Packed32, Packed64 and PackedDirect* and will always be
+ * long-aligned.
+ */
+
+class PackedWriter extends PackedInts.Writer {
+  /** The 64 bit block currently being filled, from its top bits downwards. */
+  private long pending;
+  /** Number of low bits of pending that are still free; 64 means empty. */
+  private int pendingBitPos;
+
+  // masks[n-1] keeps the bottom n bits
+  private final long[] masks;
+  /** Number of values passed to add so far. */
+  private int written = 0;
+
+  /**
+   * @param out destination of the packed blocks
+   * @param valueCount number of values that finish() pads up to
+   * @param bitsPerValue number of bits written for every value
+   */
+  public PackedWriter(DataOutput out, int valueCount, int bitsPerValue)
+                                                            throws IOException {
+    super(out, valueCount, bitsPerValue);
+
+    pendingBitPos = 64;
+
+    // Only the masks for 1 .. bitsPerValue-1 bits are needed: a split value
+    // always leaves fewer than bitsPerValue bits in the current block.
+    final int maskCount = bitsPerValue - 1;
+    masks = new long[maskCount];
+    long power = 1;
+    for (int n = 0; n < maskCount; n++) {
+      power <<= 1;
+      masks[n] = power - 1;
+    }
+  }
+
+  /**
+   * Appends one value. A block is written to the output as soon as it is
+   * full. Do not call this after finish
+   */
+  @Override
+  public void add(long v) throws IOException {
+    assert v <= PackedInts.maxValue(bitsPerValue) : "v=" + v
+            + " maxValue=" + PackedInts.maxValue(bitsPerValue);
+    assert v >= 0;
+
+    if (pendingBitPos < bitsPerValue) {
+      // split: the top pendingBitPos bits of the value complete this block
+      final int carried = bitsPerValue - pendingBitPos;
+      pending |= (v >> carried) & masks[pendingBitPos - 1];
+
+      // flush
+      out.writeLong(pending);
+
+      // the remaining bottom bits of the value open the next block at its top
+      pendingBitPos = 64 - bitsPerValue + pendingBitPos;
+      pending = v << pendingBitPos;
+    } else {
+      // not split; write-once, so we can |= w/o first masking to 0s
+      pending |= v << (pendingBitPos - bitsPerValue);
+      if (pendingBitPos != bitsPerValue) {
+        pendingBitPos -= bitsPerValue;
+      } else {
+        // the value filled the block exactly: flush and start a fresh one
+        out.writeLong(pending);
+        pending = 0;
+        pendingBitPos = 64;
+      }
+    }
+    written++;
+  }
+
+  /**
+   * Pads with zero values until valueCount values have been added, then
+   * writes the last block if it holds any bits.
+   */
+  @Override
+  public void finish() throws IOException {
+    while (written < valueCount) {
+      add(0L); // add flushes every block that becomes full
+    }
+
+    final boolean blockStarted = pendingBitPos != 64;
+    if (blockStarted) {
+      out.writeLong(pending);
+    }
+  }
+
+  @Override
+  public String toString() {
+    final String progress = written + "/" + valueCount;
+    return "PackedWriter(written " + progress + " with "
+            + bitsPerValue + " bits/value)";
+  }
+}
Index: src/java/org/apache/lucene/util/packed/PackedInts.java
===================================================================
--- src/java/org/apache/lucene/util/packed/PackedInts.java (revision 0)
+++ src/java/org/apache/lucene/util/packed/PackedInts.java (revision 0)
@@ -0,0 +1,302 @@
+package org.apache.lucene.util.packed;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.Constants;
+
+import java.io.IOException;
+
+/**
+ * Simplistic compression for array of unsigned long values.
+ * Each value is >= 0 and <= a specified maximum value. The
+ * values are stored as packed ints, with each value
+ * consuming a fixed number of bits.
+ * + * @lucene.internal + */ + +public class PackedInts { + + private final static String CODEC_NAME = "PackedInts"; + private final static int VERSION_START = 0; + private final static int VERSION_CURRENT = VERSION_START; + + /** + * A read-only random access array of positive integers. + * @lucene.internal + */ + public static interface Reader { + /** + * @param index the position of the wanted value. + * @return the value at the stated index. + */ + long get(int index); + + /** + * @return the number of bits used to store any given value. + * Note: This does not imply that memory usage is + * {@code bitsPerValue * #values} as implementations are free to + * use non-space-optimal packing of bits. + */ + int getBitsPerValue(); + + /** + * @return the number of values. + */ + int size(); + } + + /** + * Run-once iterator interface, to decode previously saved PackedInts. + */ + public static interface ReaderIterator extends Closeable { + /** Returns next value */ + long next() throws IOException; + /** Returns number of bits per value */ + int getBitsPerValue(); + /** Returns number of values */ + int size(); + /** Returns the current position */ + int ord(); + /** Skips to the given ordinal and returns its value. + * @return the value at the given position + * @throws IOException if reading the value throws an IOException*/ + long advance(int ord) throws IOException; + } + + /** + * A packed integer array that can be modified. + * @lucene.internal + */ + public static interface Mutable extends Reader { + /** + * Set the value at the given index in the array. + * @param index where the value should be positioned. + * @param value a value conforming to the constraints set by the array. + */ + void set(int index, long value); + + /** + * Sets all values to 0. + */ + + void clear(); + } + + /** + * A simple base for Readers that keeps track of valueCount and bitsPerValue. 
+ * @lucene.internal + */ + public static abstract class ReaderImpl implements Reader { + protected final int bitsPerValue; + protected final int valueCount; + + protected ReaderImpl(int valueCount, int bitsPerValue) { + this.bitsPerValue = bitsPerValue; + assert bitsPerValue > 0 && bitsPerValue <= 64 : "bitsPerValue=" + bitsPerValue; + this.valueCount = valueCount; + } + + public int getBitsPerValue() { + return bitsPerValue; + } + + public int size() { + return valueCount; + } + + public long getMaxValue() { // Convenience method + return maxValue(bitsPerValue); + } + } + + /** A write-once Writer. + * @lucene.internal + */ + public static abstract class Writer { + protected final DataOutput out; + protected final int bitsPerValue; + protected final int valueCount; + + protected Writer(DataOutput out, int valueCount, int bitsPerValue) + throws IOException { + assert bitsPerValue <= 64; + + this.out = out; + this.valueCount = valueCount; + this.bitsPerValue = bitsPerValue; + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + out.writeVInt(bitsPerValue); + out.writeVInt(valueCount); + } + + public abstract void add(long v) throws IOException; + public abstract void finish() throws IOException; + } + + /** + * Retrieve PackedInt data from the DataInput and return a packed int + * structure based on it. + * @param in positioned at the beginning of a stored packed int structure. + * @return a read only random access capable array of positive integers. + * @throws IOException if the structure could not be retrieved. 
+ * @lucene.internal + */ + public static Reader getReader(DataInput in) throws IOException { + CodecUtil.checkHeader(in, CODEC_NAME, VERSION_START, VERSION_START); + final int bitsPerValue = in.readVInt(); + assert bitsPerValue > 0 && bitsPerValue <= 64: "bitsPerValue=" + bitsPerValue; + final int valueCount = in.readVInt(); + + switch (bitsPerValue) { + case 8: + return new Direct8(in, valueCount); + case 16: + return new Direct16(in, valueCount); + case 32: + return new Direct32(in, valueCount); + case 64: + return new Direct64(in, valueCount); + default: + if (Constants.JRE_IS_64BIT || bitsPerValue >= 32) { + return new Packed64(in, valueCount, bitsPerValue); + } else { + return new Packed32(in, valueCount, bitsPerValue); + } + } + } + + /** + * Retrieve PackedInts as a {@link ReaderIterator} + * @param in positioned at the beginning of a stored packed int structure. + * @return an iterator to access the values + * @throws IOException if the structure could not be retrieved. + * @lucene.internal + */ + public static ReaderIterator getReaderIterator(IndexInput in) throws IOException { + CodecUtil.checkHeader(in, CODEC_NAME, VERSION_START, VERSION_START); + final int bitsPerValue = in.readVInt(); + assert bitsPerValue > 0 && bitsPerValue <= 64: "bitsPerValue=" + bitsPerValue; + final int valueCount = in.readVInt(); + return new PackedReaderIterator(bitsPerValue, valueCount, in); + } + + /** + * Create a packed integer array with the given amount of values initialized + * to 0. the valueCount and the bitsPerValue cannot be changed after creation. + * All Mutables known by this factory are kept fully in RAM. + * @param valueCount the number of elements. + * @param bitsPerValue the number of bits available for any given value. + * @return a mutable packed integer array. + * @throws java.io.IOException if the Mutable could not be created. 
With the + * current implementations, this never happens, but the method + * signature allows for future persistence-backed Mutables. + * @lucene.internal + */ + public static Mutable getMutable( + int valueCount, int bitsPerValue) { + switch (bitsPerValue) { + case 8: + return new Direct8(valueCount); + case 16: + return new Direct16(valueCount); + case 32: + return new Direct32(valueCount); + case 64: + return new Direct64(valueCount); + default: + if (Constants.JRE_IS_64BIT || bitsPerValue >= 32) { + return new Packed64(valueCount, bitsPerValue); + } else { + return new Packed32(valueCount, bitsPerValue); + } + } + } + + /** + * Create a packed integer array writer for the given number of values at the + * given bits/value. Writers append to the given IndexOutput and has very + * low memory overhead. + * @param out the destination for the produced bits. + * @param valueCount the number of elements. + * @param bitsPerValue the number of bits available for any given value. + * @return a Writer ready for receiving values. + * @throws IOException if bits could not be written to out. + * @lucene.internal + */ + public static Writer getWriter(DataOutput out, int valueCount, int bitsPerValue) + throws IOException { + return new PackedWriter(out, valueCount, bitsPerValue); + } + + /** Returns how many bits are required to hold values up + * to and including maxValue + * @param maxValue the maximum value that should be representable. + * @return the amount of bits needed to represent values from 0 to maxValue. 
+ * @lucene.internal + */ + public static int bitsRequired(long maxValue) { + // Very high long values does not translate well to double, so we do an + // explicit check for the edge cases + if (maxValue > 0x3FFFFFFFFFFFFFFFL) { + return 63; + } if (maxValue > 0x1FFFFFFFFFFFFFFFL) { + return 62; + } + return Math.max(1, (int) Math.ceil(Math.log(1+maxValue)/Math.log(2.0))); + } + + /** + * Calculates the maximum unsigned long that can be expressed with the given + * number of bits. + * @param bitsPerValue the number of bits available for any given value. + * @return the maximum value for the given bits. + * @lucene.internal + */ + public static long maxValue(int bitsPerValue) { + return bitsPerValue == 64 ? Long.MAX_VALUE : ~(~0L << bitsPerValue); + } + + /** Rounds bitsPerValue up to 8, 16, 32 or 64. */ + public static int getNextFixedSize(int bitsPerValue) { + if (bitsPerValue <= 8) { + return 8; + } else if (bitsPerValue <= 16) { + return 16; + } else if (bitsPerValue <= 32) { + return 32; + } else { + return 64; + } + } + + /** Possibly wastes some storage in exchange for faster lookups */ + public static int getRoundedFixedSize(int bitsPerValue) { + if (bitsPerValue > 58 || (bitsPerValue < 32 && bitsPerValue > 29)) { // 10% space-waste is ok + return getNextFixedSize(bitsPerValue); + } else { + return bitsPerValue; + } + } +} Index: src/java/org/apache/lucene/util/packed/Packed32.java =================================================================== --- src/java/org/apache/lucene/util/packed/Packed32.java (revision 0) +++ src/java/org/apache/lucene/util/packed/Packed32.java (revision 0) @@ -0,0 +1,227 @@ +package org.apache.lucene.util.packed; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Space optimized random access capable array of values with a fixed number of + * bits. The maximum number of bits/value is 31. Use {@link Packed64} for higher + * numbers. + *

+ * The implementation strives to avoid conditionals and expensive operations, + * sacrificing code clarity to achieve better performance. + */ + +class Packed32 extends PackedInts.ReaderImpl implements PackedInts.Mutable { + static final int BLOCK_SIZE = 32; // 32 = int, 64 = long + static final int BLOCK_BITS = 5; // The #bits representing BLOCK_SIZE + static final int MOD_MASK = BLOCK_SIZE - 1; // x % BLOCK_SIZE + + private static final int ENTRY_SIZE = BLOCK_SIZE + 1; + private static final int FAC_BITPOS = 3; + + /* + * In order to make an efficient value-getter, conditionals should be + * avoided. A value can be positioned inside of a block, requiring shifting + * left or right or it can span two blocks, requiring a left-shift on the + * first block and a right-shift on the right block. + *

+ * By always shifting the first block both left and right, we get exactly + * the right bits. By always shifting the second block right and applying + * a mask, we get the right bits there. After that, we | the two bitsets. + */ + private static final int[][] SHIFTS = + new int[ENTRY_SIZE][ENTRY_SIZE * FAC_BITPOS]; + private static final int[][] MASKS = new int[ENTRY_SIZE][ENTRY_SIZE]; + + static { // Generate shifts + for (int elementBits = 1 ; elementBits <= BLOCK_SIZE ; elementBits++) { + for (int bitPos = 0 ; bitPos < BLOCK_SIZE ; bitPos++) { + int[] currentShifts = SHIFTS[elementBits]; + int base = bitPos * FAC_BITPOS; + currentShifts[base ] = bitPos; + currentShifts[base + 1] = BLOCK_SIZE - elementBits; + if (bitPos <= BLOCK_SIZE - elementBits) { // Single block + currentShifts[base + 2] = 0; + MASKS[elementBits][bitPos] = 0; + } else { // Two blocks + int rBits = elementBits - (BLOCK_SIZE - bitPos); + currentShifts[base + 2] = BLOCK_SIZE - rBits; + MASKS[elementBits][bitPos] = ~(~0 << rBits); + } + } + } + } + + /* + * The setter requires more masking than the getter. + */ + private static final int[][] WRITE_MASKS = + new int[ENTRY_SIZE][ENTRY_SIZE * FAC_BITPOS]; + static { + for (int elementBits = 1 ; elementBits <= BLOCK_SIZE ; elementBits++) { + int elementPosMask = ~(~0 << elementBits); + int[] currentShifts = SHIFTS[elementBits]; + int[] currentMasks = WRITE_MASKS[elementBits]; + for (int bitPos = 0 ; bitPos < BLOCK_SIZE ; bitPos++) { + int base = bitPos * FAC_BITPOS; + currentMasks[base ] =~((elementPosMask + << currentShifts[base + 1]) + >>> currentShifts[base]); + if (bitPos <= BLOCK_SIZE - elementBits) { // Second block not used + currentMasks[base+1] = ~0; // Keep all bits + currentMasks[base+2] = 0; // Or with 0 + } else { + currentMasks[base+1] = ~(elementPosMask + << currentShifts[base + 2]); + currentMasks[base+2] = currentShifts[base + 2] == 0 ? 
0 : ~0; + } + } + } + } + + /* The bits */ + private int[] blocks; + + // Cached calculations + private int maxPos; // blocks.length * BLOCK_SIZE / bitsPerValue - 1 + private int[] shifts; // The shifts for the current bitsPerValue + private int[] readMasks; + private int[] writeMasks; + + /** + * Creates an array with the internal structures adjusted for the given + * limits and initialized to 0. + * @param valueCount the number of elements. + * @param bitsPerValue the number of bits available for any given value. + * Note: bitsPerValue >32 is not supported by this implementation. + */ + public Packed32(int valueCount, int bitsPerValue) { + this(new int[(int)(((long)valueCount) * bitsPerValue / BLOCK_SIZE + 2)], + valueCount, bitsPerValue); + } + + /** + * Creates an array with content retrieved from the given DataInput. + * @param in a DataInput, positioned at the start of Packed64-content. + * @param valueCount the number of elements. + * @param bitsPerValue the number of bits available for any given value. + * @throws java.io.IOException if the values for the backing array could not + * be retrieved. + */ + public Packed32(DataInput in, int valueCount, int bitsPerValue) + throws IOException { + super(valueCount, bitsPerValue); + int size = size(bitsPerValue, valueCount); + blocks = new int[size + 1]; // +1 due to non-conditional tricks + // TODO: find a faster way to bulk-read ints... + for(int i = 0 ; i < size ; i++) { + blocks[i] = in.readInt(); + } + if (size % 2 == 1) { + in.readInt(); // Align to long + } + updateCached(); + } + + private static int size(int bitsPerValue, int valueCount) { + final long totBitCount = (long) valueCount * bitsPerValue; + return (int) (totBitCount/32 + ((totBitCount % 32 == 0 ) ? 0:1)); + } + + + /** + * Creates an array backed by the given blocks. + *

+ * Note: The blocks are used directly, so changes to the given block will + * affect the Packed32-structure. + * @param blocks used as the internal backing array. + * @param valueCount the number of values. + * @param bitsPerValue the number of bits available for any given value. + * Note: bitsPerValue >32 is not supported by this implementation. + */ + public Packed32(int[] blocks, int valueCount, int bitsPerValue) { + // TODO: Check that blocks.length is sufficient for holding length values + super(valueCount, bitsPerValue); + if (bitsPerValue > 31) { + throw new IllegalArgumentException(String.format( + "This array only supports values of 31 bits or less. The " + + "required number of bits was %d. The Packed64 " + + "implementation allows values with more than 31 bits", + bitsPerValue)); + } + this.blocks = blocks; + updateCached(); + } + + private void updateCached() { + readMasks = MASKS[bitsPerValue]; + maxPos = (int)((((long)blocks.length) * BLOCK_SIZE / bitsPerValue) - 2); + shifts = SHIFTS[bitsPerValue]; + writeMasks = WRITE_MASKS[bitsPerValue]; + } + + /** + * @param index the position of the value. + * @return the value at the given index. 
+ */ + public long get(final int index) { + final long majorBitPos = (long)index * bitsPerValue; + final int elementPos = (int)(majorBitPos >>> BLOCK_BITS); // / BLOCK_SIZE + final int bitPos = (int)(majorBitPos & MOD_MASK); // % BLOCK_SIZE); + + final int base = bitPos * FAC_BITPOS; + + return ((blocks[elementPos] << shifts[base]) >>> shifts[base+1]) | + ((blocks[elementPos+1] >>> shifts[base+2]) & readMasks[bitPos]); + } + + public void set(final int index, final long value) { + final int intValue = (int)value; + final long majorBitPos = (long)index * bitsPerValue; + final int elementPos = (int)(majorBitPos >>> BLOCK_BITS); // / BLOCK_SIZE + final int bitPos = (int)(majorBitPos & MOD_MASK); // % BLOCK_SIZE); + final int base = bitPos * FAC_BITPOS; + + blocks[elementPos ] = (blocks[elementPos ] & writeMasks[base]) + | (intValue << shifts[base + 1] >>> shifts[base]); + blocks[elementPos+1] = (blocks[elementPos+1] & writeMasks[base+1]) + | ((intValue << shifts[base + 2]) + & writeMasks[base+2]); + } + + public void clear() { + Arrays.fill(blocks, 0); + } + + @Override + public String toString() { + return "Packed32(bitsPerValue=" + bitsPerValue + ", maxPos=" + maxPos + + ", elements.length=" + blocks.length + ")"; + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + blocks.length * RamUsageEstimator.NUM_BYTES_INT; + } +} Index: src/java/org/apache/lucene/util/packed/package.html =================================================================== --- src/java/org/apache/lucene/util/packed/package.html (revision 0) +++ src/java/org/apache/lucene/util/packed/package.html (revision 0) @@ -0,0 +1,32 @@ + + + + + + +

 + The packed package provides random access capable arrays of positive longs. + The implementations provide different trade-offs between memory usage and + access speed. The standard usage scenario is replacing large int or long + arrays in order to reduce the memory footprint. +

+ The main access point is the {@link org.apache.lucene.util.packed.PackedInts} factory. +

+ + +