Index: lucene/src/java/org/apache/lucene/index/codecs/simple64/Simple64Codec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/simple64/Simple64Codec.java (revision 1069435) +++ lucene/src/java/org/apache/lucene/index/codecs/simple64/Simple64Codec.java (working copy) @@ -31,8 +31,6 @@ import org.apache.lucene.index.codecs.sep.IntIndexOutput; import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl; import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; -import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexInput; -import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexOutput; import org.apache.lucene.index.codecs.PostingsWriterBase; import org.apache.lucene.index.codecs.PostingsReaderBase; import org.apache.lucene.index.codecs.BlockTermsReader; @@ -73,7 +71,7 @@ @Override public IntIndexInput openInput(Directory dir, final String fileName, int readBufferSize) throws IOException { - return new VariableIntBlockIndexInput(dir.openInput(fileName, readBufferSize)) { + return new VariableIntFixedPhyBlockIndexInput(dir.openInput(fileName, readBufferSize)) { @Override protected BlockReader getBlockReader(final IndexInput in, final int[] buffer) throws IOException { @@ -100,7 +98,7 @@ @Override public IntIndexOutput createOutput(Directory dir, String fileName) throws IOException { - return new VariableIntBlockIndexOutput(dir.createOutput(fileName), 61*multiplier) { + return new VariableIntFixedPhyBlockIndexOutput(dir.createOutput(fileName), 61*multiplier, 8*multiplier) { private final long[] buffer = new long[multiplier]; private int totWritten; private int totConsumed; Index: lucene/src/java/org/apache/lucene/index/codecs/simple64/VariableIntFixedPhyBlockIndexInput.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/simple64/VariableIntFixedPhyBlockIndexInput.java (revision 0) +++ lucene/src/java/org/apache/lucene/index/codecs/simple64/VariableIntFixedPhyBlockIndexInput.java (revision 0) @@ -0,0 +1,208 @@ +package org.apache.lucene.index.codecs.simple64; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.BulkPostingsEnum; +import org.apache.lucene.index.codecs.sep.IntIndexInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; + +// TODO: much of this can be shared code w/ the variable case +// TODO: not specific to simple64, (e.g. can be used by simple9/simple16 at least) + +/** Abstract base class that reads variable-size blocks of ints + * from an IndexInput that have a fixed physical size in bytes. + * While this is a simple approach, a + * more performant approach would directly create an impl + * of IntIndexInput inside Directory. Wrapping a generic + * IndexInput will likely cost performance. + * + * @lucene.experimental + */ +public abstract class VariableIntFixedPhyBlockIndexInput extends IntIndexInput { + + protected final IndexInput in; + protected final int maxBlockSize; + protected final int phyBlockSize; + protected final static int HEADER = 8; /* 2 ints */ + + protected VariableIntFixedPhyBlockIndexInput(final IndexInput in) throws IOException { + this.in = in; + maxBlockSize = in.readInt(); + phyBlockSize = in.readInt(); + } + + @Override + public Reader reader() throws IOException { + final int[] buffer = new int[maxBlockSize]; + final IndexInput clone = (IndexInput) in.clone(); + // TODO: can this be simplified? + return new Reader(clone, buffer, this.getBlockReader(clone, buffer), phyBlockSize); + } + + @Override + public void close() throws IOException { + in.close(); + } + + @Override + public Index index() { + return new Index(); + } + + protected abstract BlockReader getBlockReader(IndexInput in, int[] buffer) throws IOException; + + public interface BlockReader { + public int readBlock() throws IOException; + // nocommit -- do we really need? + //public void seek(long pos) throws IOException; + } + + public static class Reader extends BulkPostingsEnum.BlockReader { + private final IndexInput in; + + public final int[] pending; + + private int offset; + private long lastBlockFP; + //private int blockSize; // nocommit redundant w/ limit? + private final BlockReader blockReader; + private int limit; + private final int phyBlockSize; + + public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader, final int phyBlockSize) + throws IOException { + this.in = in; + this.pending = pending; + this.blockReader = blockReader; + this.phyBlockSize = phyBlockSize; + } + + void seek(final long fp, final int upto) throws IOException { + //System.out.println("vintb seek fp=" + fp + " upto=" + upto); + // TODO: should we do this in real-time, not lazy? + offset = upto; + assert offset >= 0: "pendingUpto=" + offset; + if (fp != lastBlockFP) { + // Seek to new block + in.seek(fp); + // nocommit -- why? + //blockReader.seek(fp); + lastBlockFP = fp; + limit = blockReader.readBlock(); + } else { + // Seek w/in current block + } + + // TODO: if we were more clever when writing the + // index, such that a seek point wouldn't be written + // until the int encoder "committed", we could avoid + // this (likely minor) inefficiency: + + // This is necessary for int encoders that are + // non-causal, ie must see future int values to + // encode the current ones. + while(offset >= limit) { + //System.out.println("NON CAUSAL! offset=" + offset + " limit=" + limit); + offset -= limit; + fill(); + } + //System.out.println(" after skip bock offset=" + offset); + } + + @Override + public int[] getBuffer() { + return pending; + } + + @Override + public int end() { + return limit; + } + + @Override + public int offset() { + return offset; + } + + @Override + public void setOffset(int offset) { + this.offset = offset; + } + + @Override + public int fill() throws IOException { + lastBlockFP += phyBlockSize; + return limit = blockReader.readBlock(); + } + } + + private class Index extends IntIndexInput.Index { + private long fp; + private int upto; + + // This is used when reading skip data: + @Override + public void read(final DataInput indexIn, final boolean absolute) throws IOException { + if (absolute) { + fp = HEADER + (phyBlockSize * indexIn.readVLong()); + upto = indexIn.readByte()&0xFF; + } else { + final long delta = indexIn.readVLong(); + if ((delta & 1) == 1) { + // same block + upto += (delta >>> 1); + } else { + // new block + fp += (phyBlockSize * (delta >>> 1)); + upto = indexIn.readByte()&0xFF; + } + } + // TODO: we can't do this assert because non-causal + // int encoders can have upto over the buffer size + //assert upto < maxBlockSize: "upto=" + upto + " max=" + maxBlockSize; + } + + @Override + public String toString() { + return "VarIntBlock.Index fp=" + fp + " upto=" + upto + " maxBlock=" + maxBlockSize; + } + + @Override + public void seek(final BulkPostingsEnum.BlockReader other) throws IOException { + ((Reader) other).seek(fp, upto); + } + + @Override + public void set(final IntIndexInput.Index other) { + final Index idx = (Index) other; + fp = idx.fp; + upto = idx.upto; + } + + @Override + public Object clone() { + Index other = new Index(); + other.fp = fp; + other.upto = upto; + return other; + } + } +} Property changes on: lucene\src\java\org\apache\lucene\index\codecs\simple64\VariableIntFixedPhyBlockIndexInput.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/index/codecs/simple64/VariableIntFixedPhyBlockIndexOutput.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/simple64/VariableIntFixedPhyBlockIndexOutput.java (revision 0) +++ lucene/src/java/org/apache/lucene/index/codecs/simple64/VariableIntFixedPhyBlockIndexOutput.java (revision 0) @@ -0,0 +1,146 @@ +package org.apache.lucene.index.codecs.simple64; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.codecs.sep.IntIndexOutput; +import org.apache.lucene.store.IndexOutput; + +//TODO: much of this can be shared code w/ the variable case +//TODO: not specific to simple64, (e.g. can be used by simple9/simple16 at least) + +/** Abstract base class that writes variable-size blocks of ints + * to an IndexOutput that have a fixed physical size in bytes. + * While this is a simple approach, a + * more performant approach would directly create an impl + * of IntIndexOutput inside Directory. Wrapping a generic + * IndexInput will likely cost performance. + * + * @lucene.experimental + */ +public abstract class VariableIntFixedPhyBlockIndexOutput extends IntIndexOutput { + + protected final IndexOutput out; + + private int upto; + + // TODO: use vint so we can use unused simple selectors for larger blocks of 1s? + private static final int MAX_BLOCK_SIZE = 1 << 8; + private final int phyBlockSize; + private static final int HEADER = 8; /* two ints */ + + /** NOTE: maxBlockSize plus the max non-causal lookahead + * of your codec must be less than 256. EG Simple9 + * requires lookahead=1 because on seeing the Nth value + * it knows it must now encode the N-1 values before it. */ + protected VariableIntFixedPhyBlockIndexOutput(IndexOutput out, int maxBlockSize, int phyBlockSize) throws IOException { + if (maxBlockSize > MAX_BLOCK_SIZE) { + throw new IllegalArgumentException("maxBlockSize must be <= " + MAX_BLOCK_SIZE + "; got " + maxBlockSize); + } + this.out = out; + this.phyBlockSize = phyBlockSize; + out.writeInt(maxBlockSize); + out.writeInt(phyBlockSize); + } + + /** Called one value at a time. Return the number of + * buffered input values that have been written to out. */ + protected abstract int add(int value) throws IOException; + + @Override + public Index index() throws IOException { + return new Index(); + } + + private class Index extends IntIndexOutput.Index { + long fp; + int upto; + long lastFP; + int lastUpto; + + @Override + public void mark() throws IOException { + fp = out.getFilePointer(); + upto = VariableIntFixedPhyBlockIndexOutput.this.upto; + } + + @Override + public void set(IntIndexOutput.Index other) throws IOException { + Index idx = (Index) other; + lastFP = fp = idx.fp; + lastUpto = upto = idx.upto; + } + + @Override + public void write(IndexOutput indexOut, boolean absolute) throws IOException { + assert upto >= 0; + if (absolute) { + indexOut.writeVLong((fp - HEADER) / phyBlockSize); + indexOut.writeByte((byte) upto); + } else if (fp == lastFP) { + // same block + assert upto >= lastUpto; + int uptoDelta = upto - lastUpto; + indexOut.writeVLong(uptoDelta << 1 | 1); + } else { + // new block + indexOut.writeVLong(((fp - lastFP) / phyBlockSize) << 1); + indexOut.writeByte((byte) upto); + } + lastUpto = upto; + lastFP = fp; + } + + @Override + public String toString() { + return "VarIntFixedPhyBlock.Output fp=" + fp + " upto=" + upto; + } + } + + private boolean abort; + + @Override + public void write(int v) throws IOException { + boolean success = false; + try { + upto -= add(v)-1; + assert upto >= 0; + success = true; + } finally { + abort |= !success; + } + } + + @Override + public void close() throws IOException { + try { + // stuff 0s in until the "real" data is flushed: + if (!abort) { + int stuffed = 0; + while(upto > stuffed) { + upto -= add(0)-1; + assert upto >= 0; + stuffed += 1; + } + } + } finally { + out.close(); + } + } +} Property changes on: lucene\src\java\org\apache\lucene\index\codecs\simple64\VariableIntFixedPhyBlockIndexOutput.java ___________________________________________________________________ Added: svn:eol-style + native