Index: hbase-common/src/test/java/org/apache/hadoop/hbase/TestKeyValue.java =================================================================== --- hbase-common/src/test/java/org/apache/hadoop/hbase/TestKeyValue.java (revision 1450060) +++ hbase-common/src/test/java/org/apache/hadoop/hbase/TestKeyValue.java (working copy) @@ -27,6 +27,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.KeyValue.KVComparator; +import org.apache.hadoop.hbase.KeyValue.KeyComparator; import org.apache.hadoop.hbase.KeyValue.MetaComparator; import org.apache.hadoop.hbase.KeyValue.Type; import org.apache.hadoop.hbase.util.Bytes; @@ -493,4 +494,47 @@ assertEquals(HConstants.LATEST_TIMESTAMP, time1); assertEquals(12345L, time2); } + + public void testGetSmallestSeparator() { + final KeyComparator keyComparator = new KeyValue.KeyComparator(); + final KeyValue firstOnRowA = KeyValue.createFirstOnRow(rowA); + final KeyValue lastOnRowA = KeyValue.createLastOnRow(rowA); + assertTrue(keyComparator.compare(firstOnRowA.getKey(), lastOnRowA.getKey()) < 0); + byte[] newKey = keyComparator.getSmallestSeparator(firstOnRowA.getKey(), lastOnRowA.getKey()); + assertTrue(keyComparator.compare(firstOnRowA.getKey(), newKey) < 0); + assertTrue(keyComparator.compare(newKey, lastOnRowA.getKey()) <= 0); + //verify that faked shorter rowkey could be generated + long ts = 5; + KeyValue kv1 = new KeyValue(Bytes.toBytes("the quick brown fox"), family, qualA, ts, + Type.Put); + KeyValue kv2 = new KeyValue(Bytes.toBytes("the who test text"), family, qualA, ts, + Type.Put); + newKey = keyComparator.getSmallestSeparator(kv1.getKey(), kv2.getKey()); + assertTrue(keyComparator.compare(kv1.getKey(), newKey) < 0); + assertTrue(keyComparator.compare(newKey, kv2.getKey()) <= 0); + short newRowLength = Bytes.toShort(newKey, 0); + byte[] expectedArray = Bytes.toBytes("the r"); + Bytes.equals(newKey, KeyValue.ROW_LENGTH_SIZE, newRowLength, expectedArray, 0, + expectedArray.length); + //verify that faked ts could be generated + kv1 = new KeyValue(Bytes.toBytes("ilovehbase"), family, qualA, 5, + Type.Put); + kv2 = new KeyValue(Bytes.toBytes("ilovehbase"), family, qualA, 0, + Type.Put); + assertTrue(keyComparator.compare(kv1.getKey(), kv2.getKey()) < 0); + newKey = keyComparator.getSmallestSeparator(kv1.getKey(), kv2.getKey()); + assertTrue(keyComparator.compare(kv1.getKey(), newKey) < 0); + assertTrue(keyComparator.compare(newKey, kv2.getKey()) <= 0); + long newTimestamp = Bytes.toLong(newKey, newKey.length - KeyValue.TIMESTAMP_TYPE_SIZE); + assertTrue(newTimestamp == 4); + //verify root/metaKeyComparator's getSmallestSeparator output + final KeyComparator rootKeyComparator = new KeyValue.RootKeyComparator(); + final KeyComparator metaKeyComparator = new KeyValue.MetaKeyComparator(); + newKey = rootKeyComparator.getSmallestSeparator(kv1.getKey(), kv2.getKey()); + assertTrue(rootKeyComparator.compare(kv1.getKey(), newKey) < 0); + assertTrue(rootKeyComparator.compare(newKey, kv2.getKey()) == 0); + newKey = metaKeyComparator.getSmallestSeparator(kv1.getKey(), kv2.getKey()); + assertTrue(metaKeyComparator.compare(kv1.getKey(), newKey) < 0); + assertTrue(metaKeyComparator.compare(newKey, kv2.getKey()) == 0); + } } \ No newline at end of file Index: hbase-common/src/main/java/org/apache/hadoop/hbase/KeyValue.java =================================================================== --- hbase-common/src/main/java/org/apache/hadoop/hbase/KeyValue.java (revision 1450060) +++ hbase-common/src/main/java/org/apache/hadoop/hbase/KeyValue.java (working copy) @@ -26,6 +26,7 @@ import java.io.OutputStream; import java.nio.BufferOverflowException; import java.nio.ByteBuffer; +import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.Map; @@ -2495,6 +2496,16 @@ byte[] right, int roffset, int rlength) { return Bytes.compareTo(left, loffset, llength, right, roffset, rlength); } + + /** + * @param leftKey + * @param rightKey + * Currently, let's return rightKey directly for meta/root key comparator + * @see {@link KeyComparator#getSmallestSeparator(byte[] leftKey, byte[] rightKey)} + */ + public byte[] getSmallestSeparator(byte[] leftKey, byte[] rightKey) { + return Arrays.copyOf(rightKey, rightKey.length); + } } /** @@ -2705,6 +2716,57 @@ } return 0; } + + /** + * Generate a shorter byte array if possible, see HBASE-7845 + * Else generate a smaller TS than leftKey, see HBASE-4443 + * All of above need to ensure: leftKey < newKey <= rightKey + * Please use it on HFileV2 only(includeMemstoreTS is true by default) + * @return newKey: the new faked key + */ + public byte[] getSmallestSeparator(byte[] leftKey, byte[] rightKey) { + if (rightKey == null) { + throw new IllegalArgumentException("rightKey can not be null"); + } + if (leftKey == null) { + return Arrays.copyOf(rightKey, rightKey.length); + } + + // Compare row + short leftRowLength = Bytes.toShort(leftKey, 0); + short rightRowLength = Bytes.toShort(rightKey, 0); + if (leftRowLength != rightRowLength || compareRows(leftKey, ROW_LENGTH_SIZE, leftRowLength, + rightKey, ROW_LENGTH_SIZE, rightRowLength) != 0) { + int minLength = Math.min(leftRowLength, rightRowLength); + int diffIdx = 0; + while (diffIdx < minLength && leftKey[ROW_LENGTH_SIZE + diffIdx] == + rightKey[ROW_LENGTH_SIZE + diffIdx]) { + diffIdx++; + } + if (diffIdx >= minLength) { + // Do not shorten if is a prefix + return Arrays.copyOf(rightKey, rightKey.length); + } + int diffByte = leftKey[ROW_LENGTH_SIZE + diffIdx]; + if ((0xff & diffByte) < 0xff && (diffByte + 1) < + (rightKey[ROW_LENGTH_SIZE + diffIdx] & 0xff)) { + byte[] newKey = new byte[ROW_LENGTH_SIZE + diffIdx + 1]; + System.arraycopy(leftKey, 0, newKey, 0, ROW_LENGTH_SIZE + diffIdx); + newKey[ROW_LENGTH_SIZE + diffIdx] = (byte)(diffByte + 1); + return newKey; + } + return Arrays.copyOf(rightKey, rightKey.length); + } + // If the rows are equal, let's generate a smallert TS than leftKey's + long leftTimestamp = Bytes.toLong(leftKey, leftKey.length - TIMESTAMP_TYPE_SIZE); + if (leftTimestamp == HConstants.OLDEST_TIMESTAMP) { + return Arrays.copyOf(rightKey, rightKey.length); + } + long newTimestamp = leftTimestamp - 1; + byte[] newKey = Arrays.copyOf(leftKey, leftKey.length); + Bytes.putLong(newKey, leftKey.length - TIMESTAMP_TYPE_SIZE, newTimestamp); + return newKey; + } } /** Index: hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileWriterV2.java =================================================================== --- hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileWriterV2.java (revision 1450060) +++ hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileWriterV2.java (working copy) @@ -34,6 +34,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.KeyValue.KeyComparator; +import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding; import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.hadoop.hbase.io.hfile.HFile.Writer; import org.apache.hadoop.hbase.io.hfile.HFileBlock.BlockWritable; @@ -77,6 +78,9 @@ /** The offset of the last data block or 0 if the file is empty. */ private long lastDataBlockOffset; + /** The last(stop) Key of the previous data block. */ + private byte[] lastKeyOfPreviousBlock = null; + /** Additional data items to be written to the "load-on-open" section. */ private List additionalLoadOnOpenData = new ArrayList(); @@ -169,12 +173,20 @@ // Update the last data block offset lastDataBlockOffset = outputStream.getPos(); - fsBlockWriter.writeHeaderAndData(outputStream); - int onDiskSize = fsBlockWriter.getOnDiskSizeWithHeader(); - dataBlockIndexWriter.addEntry(firstKeyInBlock, lastDataBlockOffset, - onDiskSize); + // Generate a shorter faked key into index block. For example, consider a block boundary + // between the keys "the quick brown fox" and "the who test text". We can use "the r" as the + // key for the index block entry since it is > all entries in the previous block and <= all + // entries in subsequent blocks. + if (comparator instanceof KeyComparator && blockEncoder.getEncodingOnDisk() == + DataBlockEncoding.NONE) { + byte[] shortestSeparator = ((KeyComparator) comparator).getSmallestSeparator( + lastKeyOfPreviousBlock, firstKeyInBlock); + dataBlockIndexWriter.addEntry(shortestSeparator, lastDataBlockOffset,onDiskSize); + } else { + dataBlockIndexWriter.addEntry(firstKeyInBlock, lastDataBlockOffset,onDiskSize); + } totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader(); HFile.offerWriteLatency(System.nanoTime() - startTimeNs); @@ -229,6 +241,10 @@ // This is where the next block begins. fsBlockWriter.startWriting(BlockType.DATA); firstKeyInBlock = null; + if (lastKeyLength > 0) { + lastKeyOfPreviousBlock = new byte[lastKeyLength]; + System.arraycopy(lastKeyBuffer, lastKeyOffset, lastKeyOfPreviousBlock, 0, lastKeyLength); + } } /** @@ -463,4 +479,4 @@ } }); } -} \ No newline at end of file +}