Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 1050096) +++ CHANGES.txt (working copy) @@ -1,3 +1,41 @@ +Changes by Hao Yan (hyan2008@gmail.com) + +In summary, I added five files to support and test the codec. + +In Src, +1. org.apache.lucene.index.codecs.pfordelta.PForDelta.java +2. org.apache.lucene.index.codecs.pfordelta.Simple16.java +3. org.apache.lucene.index.codecs.PForDeltaFixedIntBlockCodec.java +4. org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutputWithGetElementNum.java + +In Test, +5. org.apache.lucene.index.codecs.intblock.TestPForDeltaFixedIntBlockCodec.java + +1) In particular, the first class, PForDelta, is the core implementation +of the PForDelta algorithm, which compresses exceptions using Simple16, +which is implemented in the second class, Simple16. +2) The third class, PForDeltaFixedIntBlockCodec, is similar to +org.apache.lucene.index.codecs.mockintblock.MockFixedIntBlockCodec in +Test, except that it uses PForDelta to encode the data in the buffer. +3) The fourth class is almost the same as +org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutput, +except that it provides an additional public function to retrieve the +value of the upto field, which is a private field in +FixedIntBlockIndexOutput. The reason I added this public function is +that the number of elements in the block that have meaningful values is not always equal to the blockSize or the buffer +size, since the last block/buffer of a stream of data usually +contains fewer elements. In that case, I fill all elements after the meaningful elements with 0s. Thus, we always compress one entire block. + +4) The last class is the unit test for PForDeltaFixedIntBlockCodec, +which is very similar to +org.apache.lucene.index.codecs.intblock.TestIntBlockCodec. + +I also changed the LuceneTestCase class to add the new +PForDeltaFixedIntBlockCodec. + +The unit tests and all Lucene tests have passed. 
+ + Lucene Change Log ======================= Trunk (not yet released) ======================= Index: src/test/org/apache/lucene/index/codecs/intblock/TestPForDeltaFixedIntBlockCodec.java =================================================================== --- src/test/org/apache/lucene/index/codecs/intblock/TestPForDeltaFixedIntBlockCodec.java (revision 0) +++ src/test/org/apache/lucene/index/codecs/intblock/TestPForDeltaFixedIntBlockCodec.java (revision 0) @@ -0,0 +1,79 @@ +package org.apache.lucene.index.codecs.intblock; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.store.*; +import org.apache.lucene.index.codecs.PForDeltaFixedIntBlockCodec; +import org.apache.lucene.index.codecs.sep.*; + +/** + * This class is to test the PForDeltaFixedIntBlockCodec + * + * + */ + +public class TestPForDeltaFixedIntBlockCodec extends LuceneTestCase { + + public void testPForDeltaSimpleIntBlocks() throws Exception { + Directory dir = newDirectory(); + int blockSize = 128; + IntStreamFactory f = new PForDeltaFixedIntBlockCodec(blockSize).getIntFactory(); + int testDataSize = 212402; + int[] testData = new int[testDataSize]; + for(int i=0; i stores; - private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock", "MockVariableIntBlock"}; + private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock", "MockVariableIntBlock", "PForDeltaFixedIntBlock"}; + //private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock", "MockVariableIntBlock"}; + // private static final String[] TEST_CODECS = new String[] {"MockSep", "MockPForDeltaFixedIntBlock", "MockVariableIntBlock"}; private static void swapCodec(Codec c, CodecProvider cp) { Codec prior = null; @@ -250,9 +253,11 @@ swapCodec(new MockSepCodec(), cp); swapCodec(new PulsingCodec(codecHasParam && "Pulsing".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 20)), cp); swapCodec(new MockFixedIntBlockCodec(codecHasParam && "MockFixedIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 2000)), cp); + swapCodec(new PForDeltaFixedIntBlockCodec(codecHasParam && "PForDeltaFixedIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 2000)), cp); + // baseBlockSize cannot be over 127: swapCodec(new MockVariableIntBlockCodec(codecHasParam && "MockVariableIntBlock".equals(codec) ? 
codecParam : _TestUtil.nextInt(random, 1, 127)), cp); - + return cp.lookup(codec); } @@ -267,6 +272,7 @@ } cp.unregister(cp.lookup("MockSep")); cp.unregister(cp.lookup("MockFixedIntBlock")); + cp.unregister(cp.lookup("PForDeltaFixedIntBlock")); cp.unregister(cp.lookup("MockVariableIntBlock")); swapCodec(new PulsingCodec(1), cp); cp.setDefaultFieldCodec(savedDefaultCodec); Index: src/java/org/apache/lucene/index/codecs/PForDeltaFixedIntBlockCodec.java =================================================================== --- src/java/org/apache/lucene/index/codecs/PForDeltaFixedIntBlockCodec.java (revision 0) +++ src/java/org/apache/lucene/index/codecs/PForDeltaFixedIntBlockCodec.java (revision 0) @@ -0,0 +1,272 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +import java.io.IOException; +import java.util.Arrays; +import java.util.Set; + +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.pfordelta.PForDelta; +import org.apache.lucene.index.codecs.sep.IntStreamFactory; +import org.apache.lucene.index.codecs.sep.IntIndexInput; +import org.apache.lucene.index.codecs.sep.IntIndexOutput; +import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl; +import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; +import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexInput; +import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutputWithGetElementNum; +import org.apache.lucene.index.codecs.FixedGapTermsIndexReader; +import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter; +import org.apache.lucene.index.codecs.PostingsWriterBase; +import org.apache.lucene.index.codecs.PostingsReaderBase; +import org.apache.lucene.index.codecs.PrefixCodedTermsReader; +import org.apache.lucene.index.codecs.PrefixCodedTermsWriter; +import org.apache.lucene.index.codecs.TermsIndexReaderBase; +import org.apache.lucene.index.codecs.TermsIndexWriterBase; +import org.apache.lucene.index.codecs.standard.StandardCodec; +import org.apache.lucene.store.*; +import org.apache.lucene.util.BytesRef; + +/** + * A codec for fixed sized int block encoders. The int encoder + * used here writes each block as data encoded by PForDelta. 
+ */ + +public class PForDeltaFixedIntBlockCodec extends Codec { + + private final int blockSize; + + public PForDeltaFixedIntBlockCodec(int blockSize) { + this.blockSize = blockSize; + name = "PForDeltaFixedIntBlock"; + } + + @Override + public String toString() { + return name + "(blockSize=" + blockSize + ")"; + } + + /** + * Encode a block of integers using PForDelta and + * @param block the input block to be compressed + * @param elementNum the number of elements in the block to be compressed + * @return the compressed size in the number of integers of the compressed data + * @throws Exception + */ + private int encodeOneBlockWithPForDelta(final int[] block, int elementNum) throws Exception + { + if(block == null || block.length == 0) + { + throw new Exception("input block is empty"); + } + + int[] compressedBlock = PForDelta.compressOneBlock(block, elementNum); + if(compressedBlock == null) + { + throw new Exception("compressed buffer is null"); + } + System.arraycopy(compressedBlock, 0, block, 0, compressedBlock.length); + return compressedBlock.length; + } + + /** + * Decode a block of compressed data (using PForDelta) into a block of elementNum uncompressed integers + * @param block the input block to be decompressed + * @param elementNum the number of elements in the block to be compressed + */ + private void decodeOneBlockWithPForDelta(final int[] block, int elementNum) + { + int[] decompressedBlock = PForDelta.decompressOneBlock(block, elementNum); + System.arraycopy(decompressedBlock, 0, block, 0, decompressedBlock.length); + } + + + public IntStreamFactory getIntFactory() { + return new PForDeltaIntFactory(); + } + + private class PForDeltaIntFactory extends IntStreamFactory { + + @Override + public IntIndexInput openInput(Directory dir, String fileName, int readBufferSize) throws IOException { + return new FixedIntBlockIndexInput(dir.openInput(fileName, readBufferSize)) { + + @Override + protected BlockReader getBlockReader(final IndexInput in, final 
int[] buffer) throws IOException { + return new BlockReader() { + public void seek(long pos) {} + public void readBlock() throws IOException { + if(buffer != null) + { + // retrieve the compressed size in ints + final int compressedSizeInInt = in.readInt(); + // read the compressed data (compressedSizeInInt ints) + for(int i=0;i compressedSizeInInt) + { + try + { + decodeOneBlockWithPForDelta(buffer, blockSize); + } + catch(Exception e) + { + e.printStackTrace(); + } + } + } + } + }; + } + }; + } + + @Override + public IntIndexOutput createOutput(Directory dir, String fileName) throws IOException { + return new FixedIntBlockIndexOutputWithGetElementNum(dir.createOutput(fileName), blockSize) { + @Override + protected void flushBlock() throws IOException { + if(buffer != null && buffer.length>0) + { + // retrieve the number of actual elements in the block + int numberOfElements = getElementNum(); + // pad 0s after the actual elements + if(numberOfElements < blockSize) + { + Arrays.fill(buffer, numberOfElements, blockSize, 0); + } + int compressedSizeInInts = 0; + // compress the data + try{ + compressedSizeInInts = encodeOneBlockWithPForDelta(buffer, blockSize); + } + catch(Exception e) + { + e.printStackTrace(); + } + // write out the compressed size in ints + out.writeInt(compressedSizeInInts); + // write out the compressed data + for(int i=0;i files) { + SepPostingsReaderImpl.files(segmentInfo, codecId, files); + PrefixCodedTermsReader.files(dir, segmentInfo, codecId, files); + FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); + } + + @Override + public void getExtensions(Set extensions) { + SepPostingsWriterImpl.getExtensions(extensions); + PrefixCodedTermsReader.getExtensions(extensions); + FixedGapTermsIndexReader.getIndexExtensions(extensions); + } +} Index: src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutputWithGetElementNum.java =================================================================== --- 
src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutputWithGetElementNum.java (revision 0) +++ src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutputWithGetElementNum.java (revision 0) @@ -0,0 +1,143 @@ +package org.apache.lucene.index.codecs.intblock; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** This class is almost same as the class FixedIntBlockIndexOutput except that it provides one additional + * public interface to retrieve the actual number of elements in the current block + */ + +import java.io.IOException; + +import org.apache.lucene.index.codecs.sep.IntIndexOutput; +import org.apache.lucene.store.IndexOutput; + +/** Abstract base class that writes fixed-size blocks of ints + * to an IndexOutput. While this is a simple approach, a + * more performant approach would directly create an impl + * of IntIndexOutput inside Directory. Wrapping a generic + * IndexInput will likely cost performance. 
+ * + * @lucene.experimental + */ +public abstract class FixedIntBlockIndexOutputWithGetElementNum extends IntIndexOutput { + + protected final IndexOutput out; + private final int blockSize; + protected final int[] buffer; + private int upto; + + // get the number of elements in the block that have the actual values + public int getElementNum() + { + return upto; + } + + protected FixedIntBlockIndexOutputWithGetElementNum(IndexOutput out, int fixedBlockSize) throws IOException { + blockSize = fixedBlockSize; + this.out = out; + out.writeVInt(blockSize); + buffer = new int[blockSize]; + } + + protected abstract void flushBlock() throws IOException; + + @Override + public Index index() throws IOException { + return new Index(); + } + + private class Index extends IntIndexOutput.Index { + long fp; + int upto; + long lastFP; + int lastUpto; + + @Override + public void mark() throws IOException { + fp = out.getFilePointer(); + upto = FixedIntBlockIndexOutputWithGetElementNum.this.upto; + } + + @Override + public void set(IntIndexOutput.Index other) throws IOException { + Index idx = (Index) other; + lastFP = fp = idx.fp; + lastUpto = upto = idx.upto; + } + + @Override + public void write(IndexOutput indexOut, boolean absolute) throws IOException { + if (absolute) { + indexOut.writeVLong(fp); + indexOut.writeVInt(upto); + } else if (fp == lastFP) { + // same block + indexOut.writeVLong(0); + assert upto >= lastUpto; + indexOut.writeVInt(upto - lastUpto); + } else { + // new block + indexOut.writeVLong(fp - lastFP); + indexOut.writeVInt(upto); + } + lastUpto = upto; + lastFP = fp; + } + + @Override + public void write(IntIndexOutput indexOut, boolean absolute) throws IOException { + if (absolute) { + indexOut.writeVLong(fp); + indexOut.write(upto); + } else if (fp == lastFP) { + // same block + indexOut.writeVLong(0); + assert upto >= lastUpto; + indexOut.write(upto - lastUpto); + } else { + // new block + indexOut.writeVLong(fp - lastFP); + indexOut.write(upto); + } + 
lastUpto = upto; + lastFP = fp; + } + } + + @Override + public void write(int v) throws IOException { + buffer[upto++] = v; + if (upto == blockSize) { + flushBlock(); + upto = 0; + } + } + + @Override + public void close() throws IOException { + try { + if (upto > 0) { + // NOTE: entries in the block after current upto are + // invalid + flushBlock(); + } + } finally { + out.close(); + } + } +} Index: src/java/org/apache/lucene/index/codecs/pfordelta/PForDelta.java =================================================================== --- src/java/org/apache/lucene/index/codecs/pfordelta/PForDelta.java (revision 0) +++ src/java/org/apache/lucene/index/codecs/pfordelta/PForDelta.java (revision 0) @@ -0,0 +1,340 @@ +package org.apache.lucene.index.codecs.pfordelta; + + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Arrays; + +/** + * Implementation of the optimized PForDelta algorithm for sorted integer arrays. The basic ideas are based on + * + * 1. Original algorithm from + * http://homepages.cwi.nl/~heman/downloads/msthesis.pdf + * + * 2. Optimization and + * variation from http://www2008.org/papers/pdf/p387-zhangA.pdf + * + * 3. 
Further optimization + * http://www2009.org/proceedings/pdf/p401.pdf + * + * As a part of the PForDelta implementation, Simple16 is used to compress exceptions. The original Simple16 algorithm can also be found in the above literatures. + * @author hao yan, hyan2008@gmail.com + */ + +public class PForDelta{ + + //All possible values of b in the PForDelta algorithm + private static final int[] POSSIBLE_B = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,16,20,28}; + // Max number of bits to store an uncompressed value + private static final int MAX_BITS = 32; + // Header records the value of b and the number of exceptions in the block + private static final int HEADER_NUM = 1; + // Header size in bits + private static final int HEADER_SIZE = MAX_BITS * HEADER_NUM; + + private static final int[] MASK = {0x00000000, + 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, + 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, + 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff, + 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff, + 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, + 0x7fffffff, 0xffffffff}; + + /** + * Compress one block of blockSize integers using PForDelta with the optimal parameter b + * @param inBlock the block to be compressed + * @param blockSize the block size + * @return the compressed block + */ + public static int[] compressOneBlock(final int[] inBlock, int blockSize) + { + // find the best b that can lead to the smallest overall compressed size + int currentB = POSSIBLE_B[0]; + int tmpB = currentB; + int optSize = estimateCompressedSize(inBlock, tmpB, blockSize); + for (int i = 1; i < POSSIBLE_B.length; ++i) + { + tmpB = POSSIBLE_B[i]; + int curSize = estimateCompressedSize(inBlock, tmpB, blockSize); + if(curSize < optSize) + { + currentB = tmpB; + optSize = curSize; + } + } + + // compress the block using the above best b + int[] outBlock = 
compressOneBlockCore(inBlock, currentB, blockSize); + + return outBlock; + } + + /** + * Decompress one block using PForDelta + * @param inBlock the block to be decompressed + * @param blockSize the number of elements in the decompressed block + * @return the decompressed block + */ + public static int[] decompressOneBlock(int[] inBlock, int blockSize) + { + int[] expPos = new int[blockSize]; + int[] expHighBits = new int[blockSize]; + int[] outBlock = new int[blockSize]; + if(inBlock == null) + { + System.out.println("error: compBlock is null"); + return null; + } + + int expNum = inBlock[0] & 0x3ff; + int bits = (inBlock[0]>>>10) & (0x1f); + + // decompress the b-bit slots + int offset = HEADER_SIZE; + int compressedBits = 0; + if(bits == 0) + { + Arrays.fill(outBlock,0); + } + else + { + compressedBits = decompressBBitSlots(outBlock, inBlock, blockSize, bits); + } + offset += compressedBits; + + // decompress exceptions + if(expNum>0) + { + compressedBits = decompressBlockByS16(expPos, inBlock, offset, expNum); + offset += compressedBits; + compressedBits = decompressBlockByS16(expHighBits, inBlock, offset, expNum); + offset += compressedBits; + + for (int i = 0; i < expNum; i++) + { + int curExpPos = expPos[i] ; + int curHighBits = expHighBits[i]; + outBlock[curExpPos] = (outBlock[curExpPos] & MASK[bits]) | ((curHighBits & MASK[32-bits] ) << bits); + } + } + return outBlock; + } + + /** + * Estimate the compressed size in ints of a block + * @param inputBlock the block to be compressed + * @param bits the value of the parameter b + * @param blockSize the block size + * @return the compressed size in ints + * @throws IllegalArgumentException + */ + private static int estimateCompressedSize(int[] inputBlock, int bits, int blockSize) throws IllegalArgumentException { + int maxNoExp = (1< maxNoExp) + { + expNum++; + } + } + outputOffset += (expNum<<5); + + return outputOffset; + } + + /** + * The core implementation of compressing a block with blockSize integers 
using PForDelta with the given parameter b + * @param inputBlock the block to be compressed + * @param bits the the value of the parameter b + * @param blockSize the block size + * @return the compressed block + * @throws IllegalArgumentException + */ + private static int[] compressOneBlockCore(int[] inputBlock, int bits, int blockSize) throws IllegalArgumentException { + int[] expPos = new int[blockSize]; + int[] expHighBits = new int[blockSize]; + + int maxCompBitSize = HEADER_SIZE + blockSize * (MAX_BITS + MAX_BITS + MAX_BITS) + 32; + int[] tmpCompressedBlock = new int[(maxCompBitSize>>>5)]; + + int outputOffset = HEADER_SIZE; + int expUpperBound = 1<>> bits) & MASK[32-bits]; + expNum++; + } + outputOffset += bits; + } + + // the first int in the compressed block stores the value of b and the number of exceptions + tmpCompressedBlock[0] = ((bits & MASK[10]) << 10) | (expNum & 0x3ff); + + // compress exceptions + if(expNum>0) + { + int compressedBitSize = compressBlockByS16(tmpCompressedBlock, outputOffset, expPos, expNum, blockSize, inputBlock); + outputOffset += compressedBitSize; + compressedBitSize = compressBlockByS16(tmpCompressedBlock, outputOffset, expHighBits, expNum, blockSize, inputBlock); + outputOffset += compressedBitSize; + } + + // discard the redundant parts in the tmpCompressedBlock + int compressedSizeInInts = (outputOffset+31)>>>5; + int[] compBlock; + compBlock = new int[compressedSizeInInts]; + System.arraycopy(tmpCompressedBlock,0, compBlock, 0, compressedSizeInInts); + + return compBlock; + } + + /** + * Decompress b-bit slots + * @param outDecompSlots decompressed block which is the output + * @param inCompBlock the compressed block which is the input + * @param blockSize the block size + * @param bits the value of the parameter b + * @return the compressed size in bits of the data that has been decompressed + */ + private static int decompressBBitSlots(int[] outDecompSlots, int[] inCompBlock, int blockSize, int bits) + { + int 
compressedBitSize = 0; + int offset = HEADER_SIZE; + for(int i =0; i>>5; + int num, inOffset=0, numLeft; + for(numLeft=blockSize; numLeft>0; numLeft -= num) + { + num = Simple16.s16Compress(outCompBlock, outOffset, inBlock, inOffset, numLeft, blockSize, oriBlockSize, oriInputBlock); + if(num<0) + { + System.out.println("oops: s16 get -1 "); + } + outOffset++; + inOffset += num; + } + int compressedBitSize = (outOffset<<5)-outStartOffsetInBits; + return compressedBitSize; + } + + /** + * Decompress a block of blockSize integers using Simple16 algorithm + * @param outDecompBlock the decompressed block which is the output + * @param inCompBlock the compressed block which is the input + * @param blockSize the block size + * @param inStartOffsetInBits the start offset in bits of the compressed block + * @return the compressed size in bits of the data that has been decompressed + */ + private static int decompressBlockByS16(int[] outDecompBlock, int[] inCompBlock, int inStartOffsetInBits, int blockSize) + { + int inOffset = (inStartOffsetInBits+31)>>>5; + int num, outOffset=0, numLeft; + for(numLeft=blockSize; numLeft>0; numLeft -= num) + { + num = Simple16.s16Decompress(outDecompBlock, outOffset, inCompBlock, inOffset, numLeft); + outOffset += num; + inOffset++; + } + int compressedBitSize = (inOffset<<5)-inStartOffsetInBits; + return compressedBitSize; + } + + + /** + * Write a certain number of bits of an integer into an integer array starting from the given start offset + * + * @param out the output array + * @param val the integer to be written + * @param outOffset the start offset in bits in the output array + * @param bits the number of bits to be written (bits>=0) + */ + private static final void writeBits(int[] out, int val, int outOffset, int bits) { + if(bits == 0) + return; + final int index = outOffset >>> 5; + final int skip = outOffset & 0x1f; + val &= (0xffffffff >>> (32 - bits)); + out[index] |= (val << skip); + if (32 - skip < bits) { + out[index + 1] 
|= (val >>> (32 - skip)); + } + } + + /** + * Read a certain number of bits from an integer array, starting from the given start offset, and return them as an integer + * + * @param in the input array + * @param inOffset the start offset in bits in the input array + * @param bits the number of bits to be read; unlike writeBits(), readBits() does not deal with bits==0, and thus bits must be > 0. + * When bits==0, the calling functions just skip the entire bits-bit slots without decoding them + * @return the value formed by the bits bits read from the input + */ + private static final int readBits(int[] in, final int inOffset, final int bits) { + final int index = inOffset >>> 5; + final int skip = inOffset & 0x1f; + int val = in[index] >>> skip; + if (32 - skip < bits) { + val |= (in[index + 1] << (32 - skip)); + } + return val & (0xffffffff >>> (32 - bits)); + } + +} + + Index: src/java/org/apache/lucene/index/codecs/pfordelta/Simple16.java =================================================================== --- src/java/org/apache/lucene/index/codecs/pfordelta/Simple16.java (revision 0) +++ src/java/org/apache/lucene/index/codecs/pfordelta/Simple16.java (revision 0) @@ -0,0 +1,123 @@ +package org.apache.lucene.index.codecs.pfordelta; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Implementation of the Simple16 algorithm for sorted integer arrays. The basic ideas are based on papers from + * + * 1. http://www2008.org/papers/pdf/p387-zhangA.pdf + * + * 2. http://www2009.org/proceedings/pdf/p401.pdf + * + */ + +public class Simple16{ + + private static final int S16_NUMSIZE = 16; + private static final int S16_BITSSIZE = 28; + // the possible number of bits used to represent one integer + private static final int[] S16_NUM = {28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1}; + // the corresponding number of elements for each value of the number of bits + private static final int[][] S16_BITS = { {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + {2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0}, + {1,1,1,1,1,1,1,2,2,2,2,2,2,2,1,1,1,1,1,1,1,0,0,0,0,0,0,0}, + {1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,0,0,0,0,0,0,0}, + {2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {4,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {3,4,4,4,4,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {5,5,5,5,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {4,4,5,5,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {6,6,6,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {5,5,6,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {7,7,7,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {10,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {14,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} }; + + /** + * Compress an integer array using Simple16 + * + * @param out the compressed output + * @param outOffset the offset of the output in the number of integers + * @param in the integer input array + * @param inOffset the offset of the input in the number of 
integers + * @param n the number of elements to be compressed + * @return the number of compressed integers + */ + public static final int s16Compress(int[] out, int outOffset, int[] in, int inOffset, int n, int blockSize, int oriBlockSize, int[] oriInputBlock) + { + int numIdx, j, num, bits; + for (numIdx = 0; numIdx < S16_NUMSIZE; numIdx++) + { + out[outOffset] = numIdx<>>S16_BITSSIZE; + int num = S16_NUM[numIdx] < n ? S16_NUM[numIdx] : n; + for(j=0, bits=0; j>> inWithIntOffset); + return val & (0xffffffff >>> (32 - bits)); + } + } Index: build.xml =================================================================== --- build.xml (revision 1050096) +++ build.xml (working copy) @@ -61,7 +61,8 @@ - +