diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java
index 3e26107..c192173 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java
@@ -187,6 +187,7 @@ public class HFileBlock implements Cacheable {
    */
   private int nextBlockOnDiskSizeWithHeader = -1;
 
+  private volatile int[] index; // lazily built cell offsets; see updateIndexIfNeeded()
 
   /**
    * Creates a new {@link HFile} block from the given fields. This constructor
    * is mostly used when the block data has already been read and uncompressed,
@@ -523,13 +524,15 @@ public class HFileBlock implements Cacheable {
   public long heapSize() {
     long size = ClassSize.align(
         ClassSize.OBJECT +
-        // Block type, byte buffer and meta references
-        3 * ClassSize.REFERENCE +
+        // Block type, byte buffer, meta, and index references
+        4 * ClassSize.REFERENCE +
         // On-disk size, uncompressed size, and next block's on-disk size
         // bytePerChecksum and onDiskDataSize
         4 * Bytes.SIZEOF_INT +
         // This and previous block offset
         2 * Bytes.SIZEOF_LONG +
+        // The cell offset index (array header; contents are built lazily)
+        ClassSize.ARRAY +
         // Heap size of the meta object. meta will be always not null.
         fileContext.heapSize()
     );
@@ -1740,6 +1743,12 @@ public class HFileBlock implements Cacheable {
     return this.fileContext;
   }
 
+  public void setIndex(int[] index) {
+    this.index = index;
+  }
+  public int[] getIndex() {
+    return this.index;
+  }
   /**
    * Convert the contents of the block header into a human readable string.
    * This is mostly helpful for debugging. This assumes that the block
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderV2.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderV2.java
index 1292319..90e720a 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderV2.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderV2.java
@@ -21,8 +21,9 @@ import java.io.DataInput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.commons.lang.ArrayUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
@@ -926,6 +927,45 @@ public class HFileReaderV2 extends AbstractHFileReader {
       this.nextIndexedKey = null;
     }
 
+    protected void updateIndexIfNeeded() {
+      if (block.getIndex() == null) {
+        // Generate a block index --- consider doing this at load time instead;
+        // it would save the synchronization here.
+        synchronized (block) {
+          if (block.getIndex() == null) {
+            int oldPosition = blockBuffer.position();
+            blockBuffer.position(0);
+            List<Integer> indexes = new ArrayList<Integer>();
+            while (blockBuffer.remaining() > 0) {
+              indexes.add(blockBuffer.position());
+              readKeyValueLen();
+              blockBuffer.position(blockBuffer.position() + currKeyLen + currValueLen
+                  + currMemstoreTSLen + KEY_VALUE_LEN_SIZE);
+            }
+            block.setIndex(ArrayUtils.toPrimitive(indexes.toArray(new Integer[indexes.size()])));
+            blockBuffer.position(oldPosition);
+          }
+        }
+      }
+    }
+
+    protected int binarySearch(Cell key) {
+      updateIndexIfNeeded();
+      KeyValue.KeyOnlyKeyValue keyOnlykv = new KeyValue.KeyOnlyKeyValue();
+      int[] a = block.getIndex();
+      int low = 0;
+      int high = a.length - 1;
+      while (low <= high) {
+        int mid = (low + high) >>> 1;
+        int kvOffset = blockBuffer.arrayOffset() + a[mid];
+        int klen = Bytes.toInt(blockBuffer.array(), kvOffset);
+        keyOnlykv.setKey(blockBuffer.array(), kvOffset + KEY_VALUE_LEN_SIZE, klen);
+        int cmp = reader.getComparator().compareOnlyKeyPortion(key, keyOnlykv);
+        if (cmp > 0) low = mid + 1;
+        else if (cmp < 0) high = mid - 1;
+        else return mid; // found
+      }
+      return -(low + 1); // not found; -(insertionPoint) - 1, as in Arrays.binarySearch
+    }
+
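Note: the lazy build above is a double-checked publication. The index field is volatile, so a concurrent scanner observes either null or a fully built array, never a partial one, while blockBuffer positioning stays scanner-local. The same pattern in isolation, as a minimal sketch (fixed-stride entries assumed for brevity; the class and method names are illustrative, not HBase API):

import java.nio.ByteBuffer;

// Illustrative double-checked lazy publication of an immutable offset index.
// The volatile write of the fully populated array happens-before any read that
// observes the non-null reference, so readers never see a partially built index.
public class LazyIndexHolder {
  private volatile int[] index;

  public int[] getOrBuild(ByteBuffer data, int stride) {
    int[] local = index;
    if (local == null) {
      synchronized (this) {
        local = index;
        if (local == null) {
          int n = data.limit() / stride;  // fixed-stride entries, for simplicity
          local = new int[n];
          for (int i = 0; i < n; i++) {
            local[i] = i * stride;        // offset of entry i
          }
          index = local;                  // publish only after fully built
        }
      }
    }
    return local;
  }
}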
   protected void readKeyValueLen() {
     blockBuffer.mark();
     currKeyLen = blockBuffer.getInt();
@@ -978,79 +1018,42 @@ public class HFileReaderV2 extends AbstractHFileReader {
    * key)
    */
   protected int blockSeek(Cell key, boolean seekBefore) {
-    int klen, vlen;
-    long memstoreTS = 0;
-    int memstoreTSLen = 0;
-    int lastKeyValueSize = -1;
-    KeyValue.KeyOnlyKeyValue keyOnlykv = new KeyValue.KeyOnlyKeyValue();
-    do {
-      blockBuffer.mark();
-      klen = blockBuffer.getInt();
-      vlen = blockBuffer.getInt();
-      blockBuffer.reset();
-      if (this.reader.shouldIncludeMemstoreTS()) {
-        if (this.reader.decodeMemstoreTS) {
-          try {
-            int memstoreTSOffset = blockBuffer.arrayOffset() + blockBuffer.position()
-                + KEY_VALUE_LEN_SIZE + klen + vlen;
-            memstoreTS = Bytes.readVLong(blockBuffer.array(), memstoreTSOffset);
-            memstoreTSLen = WritableUtils.getVIntSize(memstoreTS);
-          } catch (Exception e) {
-            throw new RuntimeException("Error reading memstore timestamp", e);
-          }
-        } else {
-          memstoreTS = 0;
-          memstoreTSLen = 1;
-        }
-      }
-
-      int keyOffset = blockBuffer.arrayOffset() + blockBuffer.position() + KEY_VALUE_LEN_SIZE;
-      keyOnlykv.setKey(blockBuffer.array(), keyOffset, klen);
-      int comp = reader.getComparator().compareOnlyKeyPortion(key, keyOnlykv);
-
-      if (comp == 0) {
-        if (seekBefore) {
-          if (lastKeyValueSize < 0) {
-            KeyValue kv = KeyValueUtil.ensureKeyValue(key);
-            throw new IllegalStateException("blockSeek with seekBefore "
-                + "at the first key of the block: key="
-                + Bytes.toStringBinary(kv.getKey(), kv.getKeyOffset(), kv.getKeyLength())
-                + ", blockOffset=" + block.getOffset() + ", onDiskSize="
-                + block.getOnDiskSizeWithHeader());
-          }
-          blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
-          readKeyValueLen();
-          return 1; // non exact match.
-        }
-        currKeyLen = klen;
-        currValueLen = vlen;
-        if (this.reader.shouldIncludeMemstoreTS()) {
-          currMemstoreTS = memstoreTS;
-          currMemstoreTSLen = memstoreTSLen;
-        }
-        return 0; // indicate exact match
-      } else if (comp < 0) {
-        if (lastKeyValueSize > 0)
-          blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
-        readKeyValueLen();
-        if (lastKeyValueSize == -1 && blockBuffer.position() == 0
-            && this.reader.trailer.getMinorVersion() >= MINOR_VERSION_WITH_FAKED_KEY) {
-          return HConstants.INDEX_KEY_MAGIC;
-        }
-        return 1;
-      }
-
-      // The size of this key/value tuple, including key/value length fields.
-      lastKeyValueSize = klen + vlen + memstoreTSLen + KEY_VALUE_LEN_SIZE;
-      blockBuffer.position(blockBuffer.position() + lastKeyValueSize);
-    } while (blockBuffer.remaining() > 0);
-
-    // Seek to the last key we successfully read. This will happen if this is
-    // the last key/value pair in the file, in which case the following call
-    // to next() has to return false.
-    blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
-    readKeyValueLen();
-    return 1; // didn't exactly find it.
+      int pos = binarySearch(key);
+      if (pos >= 0) {
+        if (seekBefore) {
+          if (pos == 0) {
+            KeyValue kv = KeyValueUtil.ensureKeyValue(key);
+            throw new IllegalStateException("blockSeek with seekBefore "
+                + "at the first key of the block: key="
+                + Bytes.toStringBinary(kv.getKey(), kv.getKeyOffset(), kv.getKeyLength())
+                + ", blockOffset=" + block.getOffset() + ", onDiskSize="
+                + block.getOnDiskSizeWithHeader());
+          }
+          blockBuffer.position(block.getIndex()[pos - 1]);
+          readKeyValueLen();
+          return 1; // non exact match.
+        }
+        blockBuffer.position(block.getIndex()[pos]);
+        readKeyValueLen();
+        return 0; // indicate exact match
+      }
+      // Key is not in the block; place the scanner on the last entry before
+      // the insertion point, or on the first entry if the key sorts before it.
+      int insertionPoint = -pos - 1;
+      if (insertionPoint == 0) {
+        blockBuffer.position(block.getIndex()[0]);
+      } else {
+        blockBuffer.position(block.getIndex()[insertionPoint - 1]);
+      }
+      readKeyValueLen();
+      if (insertionPoint == 0 && useFakeKey()) {
+        return HConstants.INDEX_KEY_MAGIC;
+      }
+      return 1; // non exact match
+    }
+
+    protected boolean useFakeKey() {
+      return this.reader.trailer.getMinorVersion() >= MINOR_VERSION_WITH_FAKED_KEY;
+    }
 
   @Override
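Note: both return paths above hinge on the Arrays.binarySearch-style contract of binarySearch(Cell): a non-negative result is the slot of an exact match, and a negative result encodes -(insertionPoint + 1). The decode, restated as a stand-alone illustrative helper (the class and method names are hypothetical, not part of the patch):

// Decode a binarySearch(...) result into the slot the scanner should land on:
// the exact match, or the last entry that sorts before the missing key.
final class SeekMath {
  private SeekMath() {}

  static int seekSlot(int searchResult) {
    if (searchResult >= 0) {
      return searchResult;            // exact match
    }
    int insertionPoint = -searchResult - 1;
    // insertionPoint == 0 means the key sorts before the whole block; the
    // caller clamps to entry 0 and may report INDEX_KEY_MAGIC for faked keys.
    return insertionPoint == 0 ? 0 : insertionPoint - 1;
  }
}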
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderV3.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderV3.java
index 5460a14..3f7f7ab 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderV3.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderV3.java
@@ -20,7 +20,10 @@ package org.apache.hadoop.hbase.io.hfile;
 import java.io.IOException;
 import java.security.Key;
 import java.security.KeyException;
+import java.util.ArrayList;
+import java.util.List;
 
+import org.apache.commons.lang.ArrayUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
@@ -229,116 +232,33 @@ public class HFileReaderV3 extends HFileReaderV2 {
       blockBuffer.reset();
     }
 
-    /**
-     * Within a loaded block, seek looking for the last key that is smaller than
-     * (or equal to?) the key we are interested in.
-     * A note on the seekBefore: if you have seekBefore = true, AND the first
-     * key in the block = key, then you'll get thrown exceptions. The caller has
-     * to check for that case and load the previous block as appropriate.
-     * @param key
-     *          the key to find
-     * @param seekBefore
-     *          find the key before the given key in case of exact match.
-     * @return 0 in case of an exact key match, 1 in case of an inexact match,
-     *         -2 in case of an inexact match and furthermore, the input key
-     *         less than the first key of current block(e.g. using a faked index
-     *         key)
-     */
-    @Override
-    protected int blockSeek(Cell key, boolean seekBefore) {
-      int klen, vlen, tlen = 0;
-      long memstoreTS = 0;
-      int memstoreTSLen = 0;
-      int lastKeyValueSize = -1;
-      KeyValue.KeyOnlyKeyValue keyOnlyKv = new KeyValue.KeyOnlyKeyValue();
-      do {
-        blockBuffer.mark();
-        klen = blockBuffer.getInt();
-        vlen = blockBuffer.getInt();
-        if (klen < 0 || vlen < 0 || klen > blockBuffer.limit()
-            || vlen > blockBuffer.limit()) {
-          throw new IllegalStateException("Invalid klen " + klen + " or vlen "
-              + vlen + ". Block offset: "
-              + block.getOffset() + ", block length: " + blockBuffer.limit() + ", position: "
-              + blockBuffer.position() + " (without header).");
-        }
-        ByteBufferUtils.skip(blockBuffer, klen + vlen);
-        if (reader.hfileContext.isIncludesTags()) {
-          // Read short as unsigned, high byte first
-          tlen = ((blockBuffer.get() & 0xff) << 8) ^ (blockBuffer.get() & 0xff);
-          if (tlen < 0 || tlen > blockBuffer.limit()) {
-            throw new IllegalStateException("Invalid tlen " + tlen + ". Block offset: "
-                + block.getOffset() + ", block length: " + blockBuffer.limit() + ", position: "
-                + blockBuffer.position() + " (without header).");
-          }
-          ByteBufferUtils.skip(blockBuffer, tlen);
-        }
-        if (this.reader.shouldIncludeMemstoreTS()) {
-          if (this.reader.decodeMemstoreTS) {
-            try {
-              memstoreTS = Bytes.readVLong(blockBuffer.array(), blockBuffer.arrayOffset()
-                  + blockBuffer.position());
-              memstoreTSLen = WritableUtils.getVIntSize(memstoreTS);
-            } catch (Exception e) {
-              throw new RuntimeException("Error reading memstore timestamp", e);
-            }
-          } else {
-            memstoreTS = 0;
-            memstoreTSLen = 1;
-          }
-        }
-        blockBuffer.reset();
-        int keyOffset = blockBuffer.arrayOffset() + blockBuffer.position() + (Bytes.SIZEOF_INT * 2);
-        keyOnlyKv.setKey(blockBuffer.array(), keyOffset, klen);
-        int comp = reader.getComparator().compareOnlyKeyPortion(key, keyOnlyKv);
-
-        if (comp == 0) {
-          if (seekBefore) {
-            if (lastKeyValueSize < 0) {
-              KeyValue kv = KeyValueUtil.ensureKeyValue(key);
-              throw new IllegalStateException("blockSeek with seekBefore "
-                  + "at the first key of the block: key="
-                  + Bytes.toStringBinary(kv.getKey(), kv.getKeyOffset(), kv.getKeyLength())
-                  + ", blockOffset=" + block.getOffset() + ", onDiskSize="
-                  + block.getOnDiskSizeWithHeader());
-            }
-            blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
-            readKeyValueLen();
-            return 1; // non exact match.
-          }
-          currKeyLen = klen;
-          currValueLen = vlen;
-          currTagsLen = tlen;
-          if (this.reader.shouldIncludeMemstoreTS()) {
-            currMemstoreTS = memstoreTS;
-            currMemstoreTSLen = memstoreTSLen;
-          }
-          return 0; // indicate exact match
-        } else if (comp < 0) {
-          if (lastKeyValueSize > 0)
-            blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
-          readKeyValueLen();
-          if (lastKeyValueSize == -1 && blockBuffer.position() == 0) {
-            return HConstants.INDEX_KEY_MAGIC;
-          }
-          return 1;
-        }
-
-        // The size of this key/value tuple, including key/value length fields.
-        lastKeyValueSize = klen + vlen + memstoreTSLen + KEY_VALUE_LEN_SIZE;
-        // include tag length also if tags included with KV
-        if (reader.hfileContext.isIncludesTags()) {
-          lastKeyValueSize += tlen + Bytes.SIZEOF_SHORT;
-        }
-        blockBuffer.position(blockBuffer.position() + lastKeyValueSize);
-      } while (blockBuffer.remaining() > 0);
-
-      // Seek to the last key we successfully read. This will happen if this is
-      // the last key/value pair in the file, in which case the following call
-      // to next() has to return false.
-      blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
-      readKeyValueLen();
-      return 1; // didn't exactly find it.
+    @Override
+    protected boolean useFakeKey() {
+      return true;
+    }
+
+    @Override
+    protected void updateIndexIfNeeded() {
+      if (block.getIndex() == null) {
+        // Generate a block index --- consider doing this at load time instead;
+        // it would save the synchronization here.
+        synchronized (block) {
+          if (block.getIndex() == null) {
+            int oldPosition = blockBuffer.position();
+            blockBuffer.position(0);
+            List<Integer> indexes = new ArrayList<Integer>();
+            while (blockBuffer.remaining() > 0) {
+              indexes.add(blockBuffer.position());
+              readKeyValueLen();
+              blockBuffer.position(blockBuffer.position() + currKeyLen + currValueLen
+                  + currMemstoreTSLen
+                  + (reader.hfileContext.isIncludesTags() ? currTagsLen + Bytes.SIZEOF_SHORT : 0)
+                  + KEY_VALUE_LEN_SIZE);
+            }
+            block.setIndex(ArrayUtils.toPrimitive(indexes.toArray(new Integer[indexes.size()])));
+            blockBuffer.position(oldPosition);
+          }
+        }
+      }
+    }
   }
 }
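Note: the only functional difference from the v2 index builder is the per-entry stride; a v3 cell may carry tags, so each step must also skip the two-byte tags-length prefix and the tag bytes. The stride arithmetic pulled out as an illustrative helper (class name hypothetical, constants inlined; not HBase API):

// Bytes from the start of one cell to the start of the next within a block:
// [int keyLen][int valLen][key][value][2-byte tagsLen + tags, v3 only][vlong memstoreTS]
final class CellStride {
  private CellStride() {}

  static int entryStride(int keyLen, int valueLen, int memstoreTsLen,
      int tagsLen, boolean includesTags) {
    int stride = 2 * 4 + keyLen + valueLen + memstoreTsLen; // 2 * 4 = the two int length prefixes
    if (includesTags) {
      stride += 2 + tagsLen; // short tags-length prefix + tag payload
    }
    return stride;
  }
}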
LOG.info("Finished testGetPerformance: " + (System.currentTimeMillis() - start)); + } + + @Ignore + @Test + public void testScanPerformance() throws Exception { + byte[] name = Bytes.toBytes("testScanPerformance"); + HTable t = TEST_UTIL.createTable(name, FAMILY); + //byte[] v = Bytes.toBytes("Some string that is longer than just three characters and makes this a more real scenario..."); + //int rowCount = TEST_UTIL.loadTable(t, new byte[][] { FAMILY }, v, false); + int rowCount = TEST_UTIL.loadTable(t, FAMILY, false); + TEST_UTIL.getHBaseAdmin().flush(name); + + Random r = new Random(); + LOG.info("Beginning testScanPerformance"); + long start = System.currentTimeMillis(); + for (int i = 0; i < 10000; i++) { + Scan s = new Scan(HBaseTestingUtility.ROWS[r.nextInt(rowCount-1000)]); + s.setCaching(100); + s.addFamily(FAMILY); + ResultScanner rs = t.getScanner(s); + assertEquals(rs.next(1000).length, 1000); + rs.close(); + } + LOG.info("Finished testScanPerformance: " + (System.currentTimeMillis() - start)); + } +}