Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 982928) +++ lucene/CHANGES.txt (working copy) @@ -543,6 +543,10 @@ * LUCENE-2526: Don't throw NPE from MultiPhraseQuery.toString when it's empty. (Ross Woolf via Mike McCandless) + +* LUCENE-2589: Add a VariableSizedIntIndexInput, which, when used w/ + Sep*, makes it simple to take any variable sized int block coders + (like Simple9/16) and use them in a codec. (Mike McCandless) Optimizations Index: lucene/src/test/org/apache/lucene/index/TestCodecs.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestCodecs.java (revision 982928) +++ lucene/src/test/org/apache/lucene/index/TestCodecs.java (working copy) @@ -268,7 +268,7 @@ final int NUM_TERMS = 100; final TermData[] terms = new TermData[NUM_TERMS]; for(int i=0;i= count: "buffer.length=" + buffer.length + " count=" + count; + for(int i=0;i files) { + SepPostingsReaderImpl.files(segmentInfo, files); + StandardTermsDictReader.files(dir, segmentInfo, files); + SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + } + + @Override + public void getExtensions(Set extensions) { + SepPostingsWriterImpl.getExtensions(extensions); + StandardTermsDictReader.getExtensions(extensions); + SimpleStandardTermsIndexReader.getIndexExtensions(extensions); + } +} Index: lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockFactory.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockFactory.java (revision 982928) +++ lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockFactory.java (working copy) @@ -1,44 +0,0 @@ -package org.apache.lucene.index.codecs.mockintblock; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.store.Directory; -import org.apache.lucene.index.codecs.sep.IntStreamFactory; -import org.apache.lucene.index.codecs.sep.IntIndexInput; -import org.apache.lucene.index.codecs.sep.IntIndexOutput; - -import java.io.IOException; - -/** Silly int factory that reads/writes block of ints by - * simply encoding each as vInt. Don't use this - * (performance will be poor)! 
This is here just to test - * the core intblock codec classes.*/ -public class MockFixedIntBlockFactory extends IntStreamFactory { - private final int blockSize; - public MockFixedIntBlockFactory(int blockSize) { - this.blockSize = blockSize; - } - @Override - public IntIndexInput openInput(Directory dir, String fileName, int readBufferSize) throws IOException { - return new MockFixedIntBlockIndexInput(dir, fileName, readBufferSize); - } - @Override - public IntIndexOutput createOutput(Directory dir, String fileName) throws IOException { - return new MockFixedIntBlockIndexOutput(dir, fileName, blockSize); - } -} Index: lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockIndexInput.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockIndexInput.java (revision 982928) +++ lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockIndexInput.java (working copy) @@ -1,65 +0,0 @@ -package org.apache.lucene.index.codecs.mockintblock; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** Naive int block API that writes vInts. 
This is - * expected to give poor performance; it's really only for - * testing the pluggability. One should typically use pfor instead. */ - -import org.apache.lucene.util.CodecUtil; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexInput; - -import java.io.IOException; - -/** Don't use this class!! It naively encodes ints one vInt - * at a time. Use it only for testing. */ -public class MockFixedIntBlockIndexInput extends FixedIntBlockIndexInput { - - public MockFixedIntBlockIndexInput(Directory dir, String fileName, int readBufferSize) throws IOException { - IndexInput in = dir.openInput(fileName, readBufferSize); - CodecUtil.checkHeader(in, MockFixedIntBlockIndexOutput.CODEC, - MockFixedIntBlockIndexOutput.VERSION_START, MockFixedIntBlockIndexOutput.VERSION_START); - init(in); - } - - private static class BlockReader implements FixedIntBlockIndexInput.BlockReader { - - private final IndexInput in; - private final int[] buffer; - - public BlockReader(IndexInput in, int[] buffer) { - this.in = in; - this.buffer = buffer; - } - - public void readBlock() throws IOException { - // silly impl - for(int i=0;i= 0: "pendingUpto=" + pendingUpto; + seekPending = true; + } + + private final void maybeSeek() throws IOException { + if (seekPending) { + if (pendingFP != lastBlockFP) { + // need new block + in.seek(pendingFP); + blockReader.seek(pendingFP); + lastBlockFP = pendingFP; + blockSize = blockReader.readBlock(); + } + upto = pendingUpto; + + // TODO: if we were more clever when writing the + // index, such that a seek point wouldn't be written + // until the int encoder "committed", we could avoid + // this (likely minor) inefficiency: + + // This is necessary for int encoders that are + // non-causal, ie must see future int values to + // encode the current ones. 
+ while(upto >= blockSize) { + upto -= blockSize; + lastBlockFP = in.getFilePointer(); + blockSize = blockReader.readBlock(); + } + seekPending = false; + } + } + + @Override + public int next() throws IOException { + this.maybeSeek(); + if (upto == blockSize) { + lastBlockFP = in.getFilePointer(); + blockSize = blockReader.readBlock(); + upto = 0; + } + + return pending[upto++]; + } + + @Override + public IntsRef read(final int count) throws IOException { + this.maybeSeek(); + if (upto == blockSize) { + lastBlockFP = in.getFilePointer(); + blockSize = blockReader.readBlock(); + upto = 0; + } + bulkResult.offset = upto; + if (upto + count < blockSize) { + bulkResult.length = count; + upto += count; + } else { + bulkResult.length = blockSize - upto; + upto = blockSize; + } + + return bulkResult; + } + } + + private class Index extends IntIndexInput.Index { + private long fp; + private int upto; + + @Override + public void read(final IndexInput indexIn, final boolean absolute) throws IOException { + if (absolute) { + fp = indexIn.readVLong(); + upto = indexIn.readByte()&0xFF; + } else { + final long delta = indexIn.readVLong(); + if (delta == 0) { + // same block + upto = indexIn.readByte()&0xFF; + } else { + // new block + fp += delta; + upto = indexIn.readByte()&0xFF; + } + } + // TODO: we can't do this assert because non-causal + // int encoders can have upto over the buffer size + //assert upto < maxBlockSize: "upto=" + upto + " max=" + maxBlockSize; + } + + @Override + public String toString() { + return "VarIntBlock.Index fp=" + fp + " upto=" + upto + " maxBlock=" + maxBlockSize; + } + + @Override + public void seek(final IntIndexInput.Reader other) throws IOException { + ((Reader) other).seek(fp, upto); + } + + @Override + public void set(final IntIndexInput.Index other) { + final Index idx = (Index) other; + fp = idx.fp; + upto = idx.upto; + } + + @Override + public Object clone() { + Index other = new Index(); + other.fp = fp; + other.upto = upto; + return 
other; + } + } +} Index: lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java (revision 0) +++ lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java (revision 0) @@ -0,0 +1,128 @@ +package org.apache.lucene.index.codecs.intblock; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Naive int block API that writes vInts. This is + * expected to give poor performance; it's really only for + * testing the pluggability. One should typically use pfor instead. */ + +import java.io.IOException; + +import org.apache.lucene.index.codecs.sep.IntIndexOutput; +import org.apache.lucene.store.IndexOutput; + +// TODO: much of this can be shared code w/ the fixed case + +/** Abstract base class that writes variable-size blocks of ints + * to an IndexOutput. While this is a simple approach, a + * more performant approach would directly create an impl + * of IntIndexOutput inside Directory. Wrapping a generic + * IndexOutput will likely cost performance. 
+ * + * @lucene.experimental + */ +public abstract class VariableIntBlockIndexOutput extends IntIndexOutput { + + protected final IndexOutput out; + + private int upto; + + private static final int MAX_BLOCK_SIZE = 1 << 8; + + /** NOTE: maxBlockSize plus the max non-causal lookahead + * of your codec must be less than 256. EG Simple9 + * requires lookahead=1 because on seeing the Nth value + * it knows it must now encode the N-1 values before it. */ + protected VariableIntBlockIndexOutput(IndexOutput out, int maxBlockSize) throws IOException { + if (maxBlockSize > MAX_BLOCK_SIZE) { + throw new IllegalArgumentException("maxBlockSize must be <= " + MAX_BLOCK_SIZE + "; got " + maxBlockSize); + } + this.out = out; + out.writeInt(maxBlockSize); + } + + /** Called one value at a time. Return the number of + * buffered input values that have been written to out. */ + protected abstract int add(int value) throws IOException; + + @Override + public Index index() throws IOException { + return new Index(); + } + + private class Index extends IntIndexOutput.Index { + long fp; + int upto; + long lastFP; + int lastUpto; + + @Override + public void mark() throws IOException { + fp = out.getFilePointer(); + upto = VariableIntBlockIndexOutput.this.upto; + } + + @Override + public void set(IntIndexOutput.Index other) throws IOException { + Index idx = (Index) other; + lastFP = fp = idx.fp; + lastUpto = upto = idx.upto; + } + + @Override + public void write(IndexOutput indexOut, boolean absolute) throws IOException { + assert upto >= 0; + if (absolute) { + indexOut.writeVLong(fp); + indexOut.writeByte((byte) upto); + } else if (fp == lastFP) { + // same block + indexOut.writeVLong(0); + assert upto >= lastUpto; + indexOut.writeByte((byte) upto); + } else { + // new block + indexOut.writeVLong(fp - lastFP); + indexOut.writeByte((byte) upto); + } + lastUpto = upto; + lastFP = fp; + } + } + + @Override + public void write(int v) throws IOException { + upto -= add(v)-1; + assert upto >= 
0; + } + + @Override + public void close() throws IOException { + try { + // stuff 0s in until the "real" data is flushed: + int stuffed = 0; + while(upto > stuffed) { + upto -= add(0)-1; + assert upto >= 0; + stuffed += 1; + } + } finally { + out.close(); + } + } +} Index: lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java (revision 982928) +++ lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java (working copy) @@ -37,10 +37,10 @@ */ public abstract class FixedIntBlockIndexInput extends IntIndexInput { - private IndexInput in; - protected int blockSize; - - protected void init(final IndexInput in) throws IOException { + private final IndexInput in; + protected final int blockSize; + + public FixedIntBlockIndexInput(final IndexInput in) throws IOException { this.in = in; blockSize = in.readVInt(); } Index: lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java (revision 982928) +++ lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java (working copy) @@ -36,19 +36,19 @@ */ public abstract class FixedIntBlockIndexOutput extends IntIndexOutput { - private IndexOutput out; - private int blockSize; - private int[] pending; + protected final IndexOutput out; + private final int blockSize; + protected final int[] buffer; private int upto; - protected void init(IndexOutput out, int fixedBlockSize) throws IOException { + protected FixedIntBlockIndexOutput(IndexOutput out, int fixedBlockSize) throws IOException { blockSize = fixedBlockSize; + this.out = out; out.writeVInt(blockSize); - this.out = out; - pending = new int[blockSize]; + 
buffer = new int[blockSize]; } - protected abstract void flushBlock(int[] buffer, IndexOutput out) throws IOException; + protected abstract void flushBlock() throws IOException; @Override public Index index() throws IOException { @@ -96,9 +96,9 @@ @Override public void write(int v) throws IOException { - pending[upto++] = v; + buffer[upto++] = v; if (upto == blockSize) { - flushBlock(pending, out); + flushBlock(); upto = 0; } } @@ -107,9 +107,9 @@ public void close() throws IOException { try { if (upto > 0) { - // NOTE: entries in the block after current upto are - // invalid - flushBlock(pending, out); + // NOTE: entries in the block after current upto are + // invalid + flushBlock(); } } finally { out.close(); Index: lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java (revision 982928) +++ lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java (working copy) @@ -115,7 +115,7 @@ DefaultCodecProvider() { register(new StandardCodec()); register(new PreFlexCodec()); - register(new PulsingCodec()); + register(new PulsingCodec(1)); } @Override Index: lucene/src/java/org/apache/lucene/util/CodecUtil.java =================================================================== --- lucene/src/java/org/apache/lucene/util/CodecUtil.java (revision 982928) +++ lucene/src/java/org/apache/lucene/util/CodecUtil.java (working copy) @@ -33,7 +33,7 @@ public final class CodecUtil { private final static int CODEC_MAGIC = 0x3fd76c17; - public static void writeHeader(IndexOutput out, String codec, int version) + public static IndexOutput writeHeader(IndexOutput out, String codec, int version) throws IOException { final long start = out.getFilePointer(); out.writeInt(CODEC_MAGIC); @@ -44,6 +44,8 @@ if (out.getFilePointer()-start != codec.length()+9) { throw new IllegalArgumentException("codec must be simple ASCII, less than 128 
characters in length [got " + codec + "]"); } + + return out; } public static int headerLength(String codec) {