Index: src/test/java/org/apache/hadoop/hbase/TestKeyValue.java =================================================================== --- src/test/java/org/apache/hadoop/hbase/TestKeyValue.java (revision 1043216) +++ src/test/java/org/apache/hadoop/hbase/TestKeyValue.java (working copy) @@ -34,6 +34,14 @@ public class TestKeyValue extends TestCase { private final Log LOG = LogFactory.getLog(this.getClass().getName()); + public void testVersion() { + int expected = KeyValue.getVersion(KeyValue.VERSION_BITS); + KeyValue kv = new KeyValue(Bytes.toBytes("test"), + System.currentTimeMillis(), Type.Put); + int found = kv.getVersion(); + assertEquals(expected, found); + } + public void testColumnCompare() throws Exception { final byte [] a = Bytes.toBytes("aaa"); byte [] family1 = Bytes.toBytes("abc"); Index: src/main/java/org/apache/hadoop/hbase/KeyValue.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/KeyValue.java (revision 1043218) +++ src/main/java/org/apache/hadoop/hbase/KeyValue.java (working copy) @@ -36,7 +36,9 @@ import org.apache.hadoop.io.Writable; /** - * An HBase Key/Value. This is the fundamental HBase Type. + * An HBase Key/Value. This is the fundamental HBase Type. Its persisted into + * StoreFiles/HFiles in the FileSystem and is what a {@link org.hadoop.hbase.client.Result} + * carries from server to HBase client. * *

If being used client-side, the primary methods to access individual fields * are {@link #getRow()}, {@link #getFamily()}, {@link #getQualifier()}, @@ -54,7 +56,7 @@ * format inside a byte array is: * <keylength> <valuelength> <key> <value> * Key is further decomposed as: - * <rowlength> <row> <columnfamilylength> <columnfamily> <columnqualifier> <timestamp> <keytype> + * <rowlength> <row> <columnfamilylength> <columnfamily> <columnqualifier> <timestamp> <insertionseqid> <keytype> * The rowlength maximum is Short.MAX_SIZE, * column family length maximum is * Byte.MAX_SIZE, and column qualifier + key length must @@ -63,8 +65,6 @@ */ public class KeyValue implements Writable, HeapSize { static final Log LOG = LogFactory.getLog(KeyValue.class); - // TODO: Group Key-only comparators and operations into a Key class, just - // for neatness sake, if can figure what to call it. /** * Colon character in UTF-8 @@ -72,7 +72,7 @@ public static final char COLUMN_FAMILY_DELIMITER = ':'; public static final byte[] COLUMN_FAMILY_DELIM_ARRAY = - new byte[]{COLUMN_FAMILY_DELIMITER}; + new byte[] {COLUMN_FAMILY_DELIMITER}; /** * Comparator for plain key/values; i.e. non-catalog table key/values. @@ -111,12 +111,8 @@ /** * Get the appropriate row comparator for the specified table. - * - * Hopefully we can get rid of this, I added this here because it's replacing - * something in HSK. We should move completely off of that. - * * @param tableName The table name. - * @return The comparator. + * @return The comparator to use on this table. */ public static KeyComparator getRowComparator(byte [] tableName) { if(Bytes.equals(HTableDescriptor.ROOT_TABLEDESC.getName(),tableName)) { @@ -128,16 +124,18 @@ return COMPARATOR.getRawComparator(); } - // Size of the timestamp and type byte on end of a key -- a long + a byte. - public static final int TIMESTAMP_TYPE_SIZE = + // Size of the timestamp, insertion seqid long and the type byte on the end + // of the key portion of a KeyValue -- two longs and a byte. + public static final int TIMESTAMP_SEQID_TYPE_SIZE = Bytes.SIZEOF_LONG /* timestamp */ + + Bytes.SIZEOF_LONG /* insertion seqid */ + Bytes.SIZEOF_BYTE /*keytype*/; - // Size of the length shorts and bytes in key. + // Size of the length shorts and bytes in a key. public static final int KEY_INFRASTRUCTURE_SIZE = Bytes.SIZEOF_SHORT /*rowlength*/ + Bytes.SIZEOF_BYTE /*columnfamilylength*/ + - TIMESTAMP_TYPE_SIZE; + TIMESTAMP_SEQID_TYPE_SIZE; // How far into the key the row starts at. First thing to read is the short // that says how long the row is. @@ -149,9 +147,21 @@ public static final int KEYVALUE_INFRASTRUCTURE_SIZE = ROW_OFFSET; /** + * Mask used comparing KeyValue Types without regard to KeyValue version. + */ + private static final byte VERSION_MASK = (byte)192; + private static final byte VERSION_MASK_INVERSE = ~VERSION_MASK; + + /** + * This KeyValues version in bits that can be OR'd into a Type. + */ + static final byte VERSION_BITS = (byte)64; + + /** * Key type. * Has space for other key types to be added later. Cannot rely on * enum ordinals . They change if item is removed or moved. Do our own codes. + * Keys are versioned using two most significant bytes in Type. */ public static enum Type { Minimum((byte)0), @@ -161,6 +171,14 @@ DeleteColumn((byte)12), DeleteFamily((byte)14), + // Bit 64 and 128 are reserved used specifying KeyValue version. If top + // two bits zero, then version is 0. If 7th bit is set -- 64 -- then + // version 1. Version is ignored when KV is compared. We define Version + // below just to show that the bits are occupied. The version bit twiddling + // and compares are done elsewhere, outside of this enum. See VERSION_MASK + // and VERSION_BITS defines above. + Version((byte)192), + // Maximum is used when searching; you look from maximum on down. Maximum((byte)255); @@ -182,11 +200,12 @@ */ public static Type codeToType(final byte b) { for (Type t : Type.values()) { - if (t.getCode() == b) { + // When comparing, do not compare on version. + if (t.getCode() == stripVersionFromType(b)) { return t; } } - throw new RuntimeException("Unknown code " + b); + throw new RuntimeException("Unknown code " + stripVersionFromType(b)); } } @@ -205,18 +224,6 @@ // the row cached private byte [] rowCache = null; - - /** Here be dragons **/ - - // used to achieve atomic operations in the memstore. - public long getMemstoreTS() { - return memstoreTS; - } - - public void setMemstoreTS(long memstoreTS) { - this.memstoreTS = memstoreTS; - } - // default value is 0, aka DNC private long memstoreTS = 0; @@ -473,7 +480,9 @@ pos = Bytes.putBytes(bytes, pos, qualifier, qoffset, qlength); } pos = Bytes.putLong(bytes, pos, timestamp); - pos = Bytes.putByte(bytes, pos, type.getCode()); + // Put insertion seqid. TODO: FIXXXX + pos = Bytes.putLong(bytes, pos, timestamp); + pos = Bytes.putByte(bytes, pos, addVersionToType(type.getCode())); if (value != null && value.length > 0) { pos = Bytes.putBytes(bytes, pos, value, voffset, vlength); } @@ -481,6 +490,20 @@ } /** + * Decorate passed type with this KVs version. + * @param type + * @return Type with Version added. + */ + private static byte addVersionToType(final byte type) { + // First blank out any existing version before adding this KVs. + return (byte)(((type & VERSION_MASK_INVERSE)) | VERSION_BITS); + } + + private static byte stripVersionFromType(final byte b) { + return (byte)(b & VERSION_MASK); + } + + /** * Write KeyValue format into a byte array. *

* Takes column in the form family:qualifier @@ -601,13 +624,13 @@ String row = Bytes.toStringBinary(b, o + Bytes.SIZEOF_SHORT, rowlength); int columnoffset = o + Bytes.SIZEOF_SHORT + 1 + rowlength; int familylength = b[columnoffset - 1]; - int columnlength = l - ((columnoffset - o) + TIMESTAMP_TYPE_SIZE); + int columnlength = l - ((columnoffset - o) + TIMESTAMP_SEQID_TYPE_SIZE); String family = familylength == 0? "": Bytes.toStringBinary(b, columnoffset, familylength); String qualifier = columnlength == 0? "": Bytes.toStringBinary(b, columnoffset + familylength, columnlength - familylength); - long timestamp = Bytes.toLong(b, o + (l - TIMESTAMP_TYPE_SIZE)); + long timestamp = Bytes.toLong(b, o + (l - TIMESTAMP_SEQID_TYPE_SIZE)); byte type = b[o + l - 1]; // return row + "/" + family + // (family != null && family.length() > 0? COLUMN_FAMILY_DELIMITER: "") + @@ -644,6 +667,29 @@ return length; } + public long getMemstoreTS() { + return memstoreTS; + } + + public void setMemstoreTS(long memstoreTS) { + this.memstoreTS = memstoreTS; + } + + /** + * @return This instance's version. + */ + public int getVersion() { + return getVersion(getType()); + } + + /** + * @param type + * @return Version that is in passed type + */ + public static int getVersion(final byte type) { + return (type & VERSION_MASK) >>> 6; + } + //--------------------------------------------------------------------------- // // Length and Offset Calculators @@ -801,7 +847,7 @@ * @return Timestamp offset */ public int getTimestampOffset(final int keylength) { - return getKeyOffset() + keylength - TIMESTAMP_TYPE_SIZE; + return getKeyOffset() + keylength - TIMESTAMP_SEQID_TYPE_SIZE; } /** @@ -1872,9 +1918,9 @@ // Compare column family. Start compare past row and family length. int lcolumnoffset = Bytes.SIZEOF_SHORT + lrowlength + 1 + loffset; int rcolumnoffset = Bytes.SIZEOF_SHORT + rrowlength + 1 + roffset; - int lcolumnlength = llength - TIMESTAMP_TYPE_SIZE - + int lcolumnlength = llength - TIMESTAMP_SEQID_TYPE_SIZE - (lcolumnoffset - loffset); - int rcolumnlength = rlength - TIMESTAMP_TYPE_SIZE - + int rcolumnlength = rlength - TIMESTAMP_SEQID_TYPE_SIZE - (rcolumnoffset - roffset); // if row matches, and no column in the 'left' AND put type is 'minimum', @@ -1905,9 +1951,9 @@ if (!this.ignoreTimestamp) { // Get timestamps. long ltimestamp = Bytes.toLong(left, - loffset + (llength - TIMESTAMP_TYPE_SIZE)); + loffset + (llength - TIMESTAMP_SEQID_TYPE_SIZE)); long rtimestamp = Bytes.toLong(right, - roffset + (rlength - TIMESTAMP_TYPE_SIZE)); + roffset + (rlength - TIMESTAMP_SEQID_TYPE_SIZE)); compare = compareTimestamps(ltimestamp, rtimestamp); if (compare != 0) { return compare;