diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Scan.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Scan.java index fe9745e..1892f54 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Scan.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Scan.java @@ -227,6 +227,7 @@ public class Scan extends Query { filter = scan.getFilter(); // clone? loadColumnFamiliesOnDemand = scan.getLoadColumnFamiliesOnDemandValue(); consistency = scan.getConsistency(); + this.setIsolationLevel(scan.getIsolationLevel()); reversed = scan.isReversed(); asyncPrefetch = scan.isAsyncPrefetch(); small = scan.isSmall(); @@ -271,6 +272,7 @@ public class Scan extends Query { this.getScan = true; this.asyncPrefetch = false; this.consistency = get.getConsistency(); + this.setIsolationLevel(get.getIsolationLevel()); for (Map.Entry attr : get.getAttributesMap().entrySet()) { setAttribute(attr.getKey(), attr.getValue()); } diff --git a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestIncrement.java b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestIncrement.java index 4b9f113..c38340d 100644 --- a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestIncrement.java +++ b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestIncrement.java @@ -30,7 +30,7 @@ import org.junit.experimental.categories.Category; @Category({ClientTests.class, SmallTests.class}) public class TestIncrement { @Test - public void test() { + public void testIncrementInstance() { final long expected = 13; Increment inc = new Increment(new byte [] {'r'}); int total = 0; diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/CellUtil.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/CellUtil.java index 1b38b56..0ca69ed 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/CellUtil.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/CellUtil.java @@ -841,7 +841,7 @@ public final class CellUtil { final int tagsLength = cell.getTagsLength(); // Save an object allocation where we can if (tagsLength == 0) { - return EMPTY_TAGS_ITR; + return TagUtil.EMPTY_TAGS_ITR; } if (cell instanceof ByteBufferedCell) { return tagsIterator(((ByteBufferedCell) cell).getTagsByteBuffer(), @@ -1387,7 +1387,7 @@ public final class CellUtil { /** * Compares the row of two keyvalues for equality - * + * * @param left * @param right * @return True if rows match. @@ -2288,4 +2288,4 @@ public final class CellUtil { return Type.DeleteFamily.getCode(); } } -} \ No newline at end of file +} diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java index 1b71cb4..4e07e6a 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java @@ -1012,11 +1012,6 @@ public final class HConstants { public static final String LOAD_BALANCER_SLOP_KEY = "hbase.regions.slop"; - /** - * The byte array represents for NO_NEXT_INDEXED_KEY; - * The actual value is irrelevant because this is always compared by reference. 
- */ - public static final Cell NO_NEXT_INDEXED_KEY = new KeyValue(); /** delimiter used between portions of a region name */ public static final int DELIMITER = ','; diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/Tag.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/Tag.java index 1d55baa..c6698f5 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/Tag.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/Tag.java @@ -75,4 +75,4 @@ public interface Tag { * @return The {@link java.nio.ByteBuffer} containing the value bytes. */ ByteBuffer getValueByteBuffer(); -} +} \ No newline at end of file diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/TagUtil.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/TagUtil.java index 15ddfc8..e6214f3 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/TagUtil.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/TagUtil.java @@ -22,6 +22,7 @@ import static org.apache.hadoop.hbase.Tag.TAG_LENGTH_SIZE; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; import org.apache.hadoop.hbase.classification.InterfaceAudience; @@ -104,7 +105,7 @@ public final class TagUtil { * @return the serialized tag data as bytes */ public static byte[] fromList(List tags) { - if (tags.isEmpty()) { + if (tags == null || tags.isEmpty()) { return HConstants.EMPTY_BYTE_ARRAY; } int length = 0; @@ -216,4 +217,74 @@ public final class TagUtil { } return StreamUtils.readRawVarint32(tag.getValueByteBuffer(), offset); } + + /** + * @return A List<Tag> of any Tags found in cell else null. + */ + public static List carryForwardTags(final Cell cell) { + return carryForwardTags(null, cell); + } + + /** + * Add to tagsOrNull any Tags cell is carrying or null if none. + */ + public static List carryForwardTags(final List tagsOrNull, final Cell cell) { + List tags = tagsOrNull; + if (cell.getTagsLength() <= 0) { + return tags; + } + Iterator itr = CellUtil.tagsIterator(cell); + if (itr == EMPTY_TAGS_ITR) { + // If no Tags, return early. + return tagsOrNull; + } + if (tags == null) { + tags = new ArrayList(); + } + while (itr.hasNext()) { + tags.add(itr.next()); + } + return tags; + } + + + /** + * @return Carry forward the TTL tag. + */ + public static List carryForwardTTLTag(final List tagsOrNull, final long ttl) { + if (ttl == Long.MAX_VALUE) { + return tagsOrNull; + } + List tags = tagsOrNull; + // If we are making the array in here, given we are the last thing checked, we'll be only thing + // in the array so set its size to '1' (I saw this being done in earlier version of + // tag-handling). + if (tags == null) { + tags = new ArrayList(1); + } + tags.add(new ArrayBackedTag(TagType.TTL_TAG_TYPE, Bytes.toBytes(ttl))); + return tags; + } + + /** + * Iterator returned when no Tags. Used by CellUtil too. 
+ */ + static final Iterator EMPTY_TAGS_ITR = new Iterator() { + @Override + public boolean hasNext() { + return false; + } + + @Override + @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="IT_NO_SUCH_ELEMENT", + justification="Intentional") + public Tag next() { + return null; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; } \ No newline at end of file diff --git a/hbase-examples/src/main/java/org/apache/hadoop/hbase/coprocessor/example/ZooKeeperScanPolicyObserver.java b/hbase-examples/src/main/java/org/apache/hadoop/hbase/coprocessor/example/ZooKeeperScanPolicyObserver.java index 420799f..48d7a55 100644 --- a/hbase-examples/src/main/java/org/apache/hadoop/hbase/coprocessor/example/ZooKeeperScanPolicyObserver.java +++ b/hbase-examples/src/main/java/org/apache/hadoop/hbase/coprocessor/example/ZooKeeperScanPolicyObserver.java @@ -33,10 +33,9 @@ import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver; import org.apache.hadoop.hbase.coprocessor.ObserverContext; import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; import org.apache.hadoop.hbase.regionserver.HStore; -import org.apache.hadoop.hbase.regionserver.Store; -import org.apache.hadoop.hbase.regionserver.ScanInfo; import org.apache.hadoop.hbase.regionserver.InternalScanner; import org.apache.hadoop.hbase.regionserver.KeyValueScanner; +import org.apache.hadoop.hbase.regionserver.ScanInfo; import org.apache.hadoop.hbase.regionserver.ScanType; import org.apache.hadoop.hbase.regionserver.Store; import org.apache.hadoop.hbase.regionserver.StoreScanner; @@ -232,6 +231,6 @@ public class ZooKeeperScanPolicyObserver extends BaseRegionObserver { return null; } return new StoreScanner(store, scanInfo, scan, targetCols, - ((HStore)store).getHRegion().getReadpoint(IsolationLevel.READ_COMMITTED)); + ((HStore)store).getHRegion().getReadPoint(IsolationLevel.READ_COMMITTED)); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java index 1bdba3b..9f29f97 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java @@ -34,18 +34,18 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.hbase.ByteBufferedKeyOnlyKeyValue; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.CellComparator; import org.apache.hadoop.hbase.CellUtil; -import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.KeyValue.KeyOnlyKeyValue; -import org.apache.hadoop.hbase.ByteBufferedKeyOnlyKeyValue; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.io.HeapSize; import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding; import org.apache.hadoop.hbase.io.hfile.HFile.CachingBlockReader; import org.apache.hadoop.hbase.nio.ByteBuff; +import org.apache.hadoop.hbase.regionserver.KeyValueScanner; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.ClassSize; import org.apache.hadoop.hbase.util.ObjectIntPair; @@ -289,7 +289,7 @@ public class HFileBlockIndex { if (rootLevelIndex < blockKeys.length - 1) { nextIndexedKey = blockKeys[rootLevelIndex + 1]; } else { - nextIndexedKey = 
HConstants.NO_NEXT_INDEXED_KEY; + nextIndexedKey = KeyValueScanner.NO_NEXT_INDEXED_KEY; } int lookupLevel = 1; // How many levels deep we are in our lookup. diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java index 4db26d1..a873280 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java @@ -51,6 +51,7 @@ import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding; import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext; import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo; import org.apache.hadoop.hbase.nio.ByteBuff; +import org.apache.hadoop.hbase.regionserver.KeyValueScanner; import org.apache.hadoop.hbase.security.EncryptionUtil; import org.apache.hadoop.hbase.util.ByteBufferUtils; import org.apache.hadoop.hbase.util.Bytes; @@ -788,7 +789,7 @@ public class HFileReaderImpl implements HFile.Reader, Configurable { } else { // The comparison with no_next_index_key has to be checked if (this.nextIndexedKey != null && - (this.nextIndexedKey == HConstants.NO_NEXT_INDEXED_KEY || reader + (this.nextIndexedKey == KeyValueScanner.NO_NEXT_INDEXED_KEY || reader .getComparator().compareKeyIgnoresMvcc(key, nextIndexedKey) < 0)) { // The reader shall continue to scan the current data block instead // of querying the diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/DefaultMemStore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/DefaultMemStore.java index 89ae0d1..2984754 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/DefaultMemStore.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/DefaultMemStore.java @@ -454,10 +454,6 @@ public class DefaultMemStore implements MemStore { * value for that row/family/qualifier. If a KeyValue did already exist, * it will then be removed. *
- * Currently the memstoreTS is kept at 0 so as each insert happens, it will
- * be immediately visible. May want to change this so it is atomic across
- * all KeyValues.
- * <p>
* This is called under row lock, so Get operations will still see updates
* atomically. Scans will only see each KeyValue update as atomic.
*
@@ -484,8 +480,7 @@
* family, and qualifier, they are removed.
* <p>
* Callers must hold the read lock. - * - * @param cell + * @param readpoint Smallest outstanding readpoint; below which we can remove duplicate Cells. * @return change in size of MemStore */ private long upsert(Cell cell, long readpoint) { @@ -505,7 +500,7 @@ public class DefaultMemStore implements MemStore { cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength()); SortedSet ss = cellSet.tailSet(firstCell); Iterator it = ss.iterator(); - // versions visible to oldest scanner + // Versions visible to oldest scanner. int versionsVisible = 0; while ( it.hasNext() ) { Cell cur = it.next(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java index 34a37f1..4ee2fb0 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java @@ -1,5 +1,4 @@ /* - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -29,6 +28,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -41,6 +41,7 @@ import java.util.NavigableSet; import java.util.RandomAccess; import java.util.Set; import java.util.TreeMap; +import java.util.UUID; import java.util.concurrent.Callable; import java.util.concurrent.CompletionService; import java.util.concurrent.ConcurrentHashMap; @@ -69,7 +70,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.ArrayBackedTag; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.CellComparator; import org.apache.hadoop.hbase.CellScanner; @@ -77,7 +77,6 @@ import org.apache.hadoop.hbase.CellUtil; import org.apache.hadoop.hbase.CompoundConfiguration; import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.DroppedSnapshotException; -import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HColumnDescriptor; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HConstants.OperationStatusCode; @@ -93,7 +92,6 @@ import org.apache.hadoop.hbase.ShareableMemory; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.Tag; import org.apache.hadoop.hbase.TagRewriteCell; -import org.apache.hadoop.hbase.TagType; import org.apache.hadoop.hbase.TagUtil; import org.apache.hadoop.hbase.UnknownScannerException; import org.apache.hadoop.hbase.backup.HFileArchiver; @@ -112,7 +110,7 @@ import org.apache.hadoop.hbase.client.RowMutations; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.conf.ConfigurationManager; import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver; -import org.apache.hadoop.hbase.coprocessor.RegionObserver; +import org.apache.hadoop.hbase.coprocessor.RegionObserver.MutationType; import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare; import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException; import org.apache.hadoop.hbase.exceptions.RegionInRecoveryException; @@ -123,8 +121,6 @@ import org.apache.hadoop.hbase.filter.FilterWrapper; import 
org.apache.hadoop.hbase.filter.IncompatibleFilterException; import org.apache.hadoop.hbase.io.HeapSize; import org.apache.hadoop.hbase.io.TimeRange; -import org.apache.hadoop.hbase.io.hfile.BlockCache; -import org.apache.hadoop.hbase.io.hfile.CacheConfig; import org.apache.hadoop.hbase.io.hfile.HFile; import org.apache.hadoop.hbase.ipc.CallerDisconnectedException; import org.apache.hadoop.hbase.ipc.RpcCallContext; @@ -148,6 +144,7 @@ import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor.Stor import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor; import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor.EventType; import org.apache.hadoop.hbase.protobuf.generated.WALProtos.StoreDescriptor; +import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl.WriteEntry; import org.apache.hadoop.hbase.regionserver.ScannerContext.LimitScope; import org.apache.hadoop.hbase.regionserver.ScannerContext.NextState; import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext; @@ -169,7 +166,6 @@ import org.apache.hadoop.hbase.util.CompressionTest; import org.apache.hadoop.hbase.util.Counter; import org.apache.hadoop.hbase.util.EncryptionTest; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; -import org.apache.hadoop.hbase.util.FSTableDescriptors; import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.HashedBytes; import org.apache.hadoop.hbase.util.Pair; @@ -199,6 +195,7 @@ import com.google.protobuf.RpcController; import com.google.protobuf.Service; import com.google.protobuf.TextFormat; +@SuppressWarnings("deprecation") @InterfaceAudience.Private public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region { private static final Log LOG = LogFactory.getLog(HRegion.class); @@ -207,18 +204,6 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi "hbase.hregion.scan.loadColumnFamiliesOnDemand"; /** - * Longest time we'll wait on a sequenceid. - * Sequenceid comes up out of the WAL subsystem. WAL subsystem can go bad or a test might use - * it without cleaning up previous usage properly; generally, a WAL roll is needed. The timeout - * is for a latch in WALKey. There is no global accounting of outstanding WALKeys; intentionally - * to avoid contention, but it makes it so if an abort or problem, we could be stuck waiting - * on the WALKey latch. Revisit. - */ - private final int maxWaitForSeqId; - private static final String MAX_WAIT_FOR_SEQ_ID_KEY = "hbase.hregion.max.wait.for.sequenceid.ms"; - private static final int DEFAULT_MAX_WAIT_FOR_SEQ_ID = 30000; - - /** * This is the global default value for durability. All tables/mutations not * defining a durability or using USE_DEFAULT will default to this value. 
*/ @@ -282,7 +267,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi final Counter checkAndMutateChecksPassed = new Counter(); final Counter checkAndMutateChecksFailed = new Counter(); - //Number of requests + // Number of requests final Counter readRequestsCount = new Counter(); final Counter writeRequestsCount = new Counter(); @@ -356,7 +341,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi */ private boolean disallowWritesInRecovering = false; - // when a region is in recovering state, it can only accept writes not reads + // When a region is in recovering state, it can only accept writes not reads private volatile boolean recovering = false; private volatile Optional configurationManager; @@ -373,7 +358,6 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi // We achieve this by synchronizing on the scannerReadPoints object. synchronized(scannerReadPoints) { minimumReadPoint = mvcc.getReadPoint(); - for (Long readPoint: this.scannerReadPoints.values()) { if (readPoint < minimumReadPoint) { minimumReadPoint = readPoint; @@ -673,7 +657,6 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi this.rowLockWaitDuration = conf.getInt("hbase.rowlock.wait.duration", DEFAULT_ROWLOCK_WAIT_DURATION); - this.maxWaitForSeqId = conf.getInt(MAX_WAIT_FOR_SEQ_ID_KEY, DEFAULT_MAX_WAIT_FOR_SEQ_ID); this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true); this.htableDescriptor = htd; this.rsServices = rsServices; @@ -1177,7 +1160,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi */ public void setRecovering(boolean newState) { boolean wasRecovering = this.recovering; - // before we flip the recovering switch (enabling reads) we should write the region open + // Before we flip the recovering switch (enabling reads) we should write the region open // event to WAL if needed if (wal != null && getRegionServerServices() != null && !writestate.readOnly && wasRecovering && !newState) { @@ -1263,28 +1246,34 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } } - public MultiVersionConcurrencyControl getMVCC() { - return mvcc; - } + @VisibleForTesting + public MultiVersionConcurrencyControl getMVCC() { + return mvcc; + } + + @Override + public long getMaxFlushedSeqId() { + return maxFlushedSeqId; + } - @Override - public long getMaxFlushedSeqId() { - return maxFlushedSeqId; - } + @Override + public long getReadPoint(IsolationLevel isolationLevel) { + if (isolationLevel == IsolationLevel.READ_UNCOMMITTED) { + // This scan can read even uncommitted transactions + return Long.MAX_VALUE; + } + return mvcc.getReadPoint(); + } - @Override - public long getReadpoint(IsolationLevel isolationLevel) { - if (isolationLevel == IsolationLevel.READ_UNCOMMITTED) { - // This scan can read even uncommitted transactions - return Long.MAX_VALUE; - } - return mvcc.getReadPoint(); - } + @Override + public long getReadpoint(IsolationLevel isolationLevel) { + return getReadPoint(isolationLevel); + } - @Override - public boolean isLoadingCfsOnDemandDefault() { - return this.isLoadingCfsOnDemandDefault; - } + @Override + public boolean isLoadingCfsOnDemandDefault() { + return this.isLoadingCfsOnDemandDefault; + } /** * Close down this HRegion. 
Flush the cache, shut down each HStore, don't
@@ -2039,7 +2028,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
/**
* Should the store be flushed because it is old enough.
* <p>
- * Every FlushPolicy should call this to determine whether a store is old enough to flush(except + * Every FlushPolicy should call this to determine whether a store is old enough to flush (except * that you always flush all stores). Otherwise the {@link #shouldFlush()} method will always * returns true which will make a lot of flush requests. */ @@ -2140,19 +2129,13 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi * for say installing a bulk loaded file just ahead of the last hfile that was * the result of this flush, etc. * - * @param wal - * Null if we're NOT to go via wal. - * @param myseqid - * The seqid to use if wal is null writing out flush - * file. - * @param storesToFlush - * The list of stores to flush. + * @param wal Null if we're NOT to go via wal. + * @param myseqid The seqid to use if wal is null writing out flush file. + * @param storesToFlush The list of stores to flush. * @return object describing the flush's state - * @throws IOException - * general io exceptions - * @throws DroppedSnapshotException - * Thrown when replay of wal is required because a Snapshot was not - * properly persisted. + * @throws IOException general io exceptions + * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was + * not properly persisted. */ protected FlushResult internalFlushcache(final WAL wal, final long myseqid, final Collection storesToFlush, MonitoredTask status, boolean writeFlushWalMarker) @@ -2176,47 +2159,45 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi throw new IOException("Aborting flush because server is aborted..."); } final long startTime = EnvironmentEdgeManager.currentTime(); - // If nothing to flush, return, but we need to safely update the region sequence id + // If nothing to flush, return, but return with a valid unused sequenceId. + // Its needed by bulk upload IIRC. It flushes until no edits in memory so it can insert a + // bulk loaded file between memory and existing hfiles. It wants a good seqeunceId that belongs + // to no other that it can use to associate with the bulk load. Hence this little dance below + // to go get one. if (this.memstoreSize.get() <= 0) { - // Take an update lock because am about to change the sequence id and we want the sequence id - // to be at the border of the empty memstore. - MultiVersionConcurrencyControl.WriteEntry writeEntry = null; + // Take an update lock so no edits can come into memory just yet. this.updatesLock.writeLock().lock(); + WriteEntry writeEntry = null; try { if (this.memstoreSize.get() <= 0) { // Presume that if there are still no edits in the memstore, then there are no edits for // this region out in the WAL subsystem so no need to do any trickery clearing out - // edits in the WAL system. Up the sequence number so the resulting flush id is for - // sure just beyond the last appended region edit (useful as a marker when bulk loading, - // etc.). NOTE: The writeEntry write number is NOT in the WAL.. there is no WAL writing - // here. + // edits in the WAL sub-system. Up the sequence number so the resulting flush id is for + // sure just beyond the last appended region edit and not associated with any edit + // (useful as marker when bulk loading, etc.). 
+ FlushResult flushResult = null; if (wal != null) { writeEntry = mvcc.begin(); long flushOpSeqId = writeEntry.getWriteNumber(); - FlushResult flushResult = new FlushResultImpl( - FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, - flushOpSeqId, - "Nothing to flush", - writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker)); - // TODO: Lets see if we hang here, if there is a scenario where an outstanding reader - // with a read point is in advance of this write point. + flushResult = new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, + flushOpSeqId, "Nothing to flush", + writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker)); mvcc.completeAndWait(writeEntry); + // Set to null so we don't complete it again down in finally block. writeEntry = null; return new PrepareFlushResult(flushResult, myseqid); } else { - return new PrepareFlushResult( - new FlushResultImpl( - FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, - "Nothing to flush", - false), - myseqid); + return new PrepareFlushResult(new FlushResultImpl( + FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush", false), myseqid); } } } finally { - this.updatesLock.writeLock().unlock(); if (writeEntry != null) { + // If writeEntry is non-null, this operation failed; the mvcc transaction failed... + // but complete it anyways so it doesn't block the mvcc queue. mvcc.complete(writeEntry); } + this.updatesLock.writeLock().unlock(); } } @@ -2245,8 +2226,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi status.setStatus("Obtaining lock to block concurrent updates"); // block waiting for the lock for internal flush this.updatesLock.writeLock().lock(); - status.setStatus("Preparing to flush by snapshotting stores in " + - getRegionInfo().getEncodedName()); + status.setStatus("Preparing flush snapshotting stores in " + getRegionInfo().getEncodedName()); long totalFlushableSizeOfFlushableStores = 0; Set flushedFamilyNames = new HashSet(); @@ -2268,101 +2248,76 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi // will be in advance of this sequence id. long flushedSeqId = HConstants.NO_SEQNUM; byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes(); - - long trxId = 0; - MultiVersionConcurrencyControl.WriteEntry writeEntry = mvcc.begin(); try { - try { - if (wal != null) { - Long earliestUnflushedSequenceIdForTheRegion = + if (wal != null) { + Long earliestUnflushedSequenceIdForTheRegion = wal.startCacheFlush(encodedRegionName, flushedFamilyNames); - if (earliestUnflushedSequenceIdForTheRegion == null) { - // This should never happen. This is how startCacheFlush signals flush cannot proceed. - String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing."; - status.setStatus(msg); - return new PrepareFlushResult( + if (earliestUnflushedSequenceIdForTheRegion == null) { + // This should never happen. This is how startCacheFlush signals flush cannot proceed. + String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing."; + status.setStatus(msg); + return new PrepareFlushResult( new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false), myseqid); - } - flushOpSeqId = getNextSequenceId(wal); - // Back up 1, minus 1 from oldest sequence id in memstore to get last 'flushed' edit - flushedSeqId = - earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM? 
- flushOpSeqId: earliestUnflushedSequenceIdForTheRegion.longValue() - 1; - } else { - // use the provided sequence Id as WAL is not being used for this flush. - flushedSeqId = flushOpSeqId = myseqid; } + flushOpSeqId = getNextSequenceId(wal); + // Back up 1, minus 1 from oldest sequence id in memstore to get last 'flushed' edit + flushedSeqId = + earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM? + flushOpSeqId: earliestUnflushedSequenceIdForTheRegion.longValue() - 1; + } else { + // use the provided sequence Id as WAL is not being used for this flush. + flushedSeqId = flushOpSeqId = myseqid; + } - for (Store s : storesToFlush) { - totalFlushableSizeOfFlushableStores += s.getFlushableSize(); - storeFlushCtxs.put(s.getFamily().getName(), s.createFlushContext(flushOpSeqId)); - committedFiles.put(s.getFamily().getName(), null); // for writing stores to WAL - storeFlushableSize.put(s.getFamily().getName(), s.getFlushableSize()); - } + for (Store s : storesToFlush) { + totalFlushableSizeOfFlushableStores += s.getFlushableSize(); + storeFlushCtxs.put(s.getFamily().getName(), s.createFlushContext(flushOpSeqId)); + committedFiles.put(s.getFamily().getName(), null); // for writing stores to WAL + storeFlushableSize.put(s.getFamily().getName(), s.getFlushableSize()); + } - // write the snapshot start to WAL - if (wal != null && !writestate.readOnly) { - FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH, + // write the snapshot start to WAL + if (wal != null && !writestate.readOnly) { + FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH, getRegionInfo(), flushOpSeqId, committedFiles); - // no sync. Sync is below where we do not hold the updates lock - trxId = WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(), - desc, false, mvcc); - } + // No sync. Sync is below where no updates lock and we do FlushAction.COMMIT_FLUSH + WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(), desc, false, mvcc); + } - // Prepare flush (take a snapshot) - for (StoreFlushContext flush : storeFlushCtxs.values()) { - flush.prepare(); - } - } catch (IOException ex) { - if (wal != null) { - if (trxId > 0) { // check whether we have already written START_FLUSH to WAL - try { - FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH, - getRegionInfo(), flushOpSeqId, committedFiles); - WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(), - desc, false, mvcc); - } catch (Throwable t) { - LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL:" + - StringUtils.stringifyException(t)); - // ignore this since we will be aborting the RS with DSE. - } - } - // we have called wal.startCacheFlush(), now we have to abort it - wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); - throw ex; // let upper layers deal with it. 
- } - } finally { - this.updatesLock.writeLock().unlock(); + // Prepare flush (take a snapshot) + for (StoreFlushContext flush : storeFlushCtxs.values()) { + flush.prepare(); } - String s = "Finished memstore snapshotting " + this + - ", syncing WAL and waiting on mvcc, flushsize=" + totalFlushableSizeOfFlushableStores; - status.setStatus(s); - if (LOG.isTraceEnabled()) LOG.trace(s); - // sync unflushed WAL changes - // see HBASE-8208 for details + } catch (IOException ex) { if (wal != null) { try { - wal.sync(); // ensure that flush marker is sync'ed - } catch (IOException ioe) { - wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); - throw ioe; + FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH, + getRegionInfo(), flushOpSeqId, committedFiles); + WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(), desc, false, + mvcc); + } catch (Throwable t) { + LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL:" + + StringUtils.stringifyException(t)); + // ignore this since we will be aborting the RS with DSE. } + // we have called wal.startCacheFlush(), now we have to abort it + wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); + throw ex; // let upper layers deal with it. } - - // wait for all in-progress transactions to commit to WAL before - // we can start the flush. This prevents - // uncommitted transactions from being written into HFiles. - // We have to block before we start the flush, otherwise keys that - // were removed via a rollbackMemstore could be written to Hfiles. - mvcc.completeAndWait(writeEntry); - // set writeEntry to null to prevent mvcc.complete from being called again inside finally - // block - writeEntry = null; } finally { - if (writeEntry != null) { - // In case of failure just mark current writeEntry as complete. - mvcc.complete(writeEntry); + this.updatesLock.writeLock().unlock(); + } + String s = "Finished memstore snapshotting " + this + ", syncing WAL and waiting on mvcc, " + + "flushsize=" + totalFlushableSizeOfFlushableStores; + status.setStatus(s); + // Sync unflushed WAL changes. See HBASE-8208 for details + if (wal != null) { + try { + wal.sync(); // ensure that flush marker is sync'ed + } catch (IOException ioe) { + wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); + throw ioe; } } return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime, @@ -2388,8 +2343,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH, getRegionInfo(), -1, new TreeMap>(Bytes.BYTES_COMPARATOR)); try { - WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(), - desc, true, mvcc); + WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(), desc, true, mvcc); return true; } catch (IOException e) { LOG.warn(getRegionInfo().getEncodedName() + " : " @@ -2459,8 +2413,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi // write flush marker to WAL. 
If fail, we should throw DroppedSnapshotException FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH, getRegionInfo(), flushOpSeqId, committedFiles); - WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(), - desc, true, mvcc); + WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(), desc, true, mvcc); } } catch (Throwable t) { // An exception here means that the snapshot was not persisted. @@ -2473,8 +2426,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi try { FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH, getRegionInfo(), flushOpSeqId, committedFiles); - WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(), - desc, false, mvcc); + WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(), desc, false, mvcc); } catch (Throwable ex) { LOG.warn(getRegionInfo().getEncodedName() + " : " + "failed writing ABORT_FLUSH marker to WAL", ex); @@ -2545,15 +2497,9 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi */ @VisibleForTesting protected long getNextSequenceId(final WAL wal) throws IOException { - // TODO: For review. Putting an empty edit in to get a sequenceid out will not work if the - // WAL is banjaxed... if it has gotten an exception and the WAL has not yet been rolled or - // aborted. In this case, we'll just get stuck here. For now, until HBASE-12751, just have - // a timeout. May happen in tests after we tightened the semantic via HBASE-14317. - // Also, the getSequenceId blocks on a latch. There is no global list of outstanding latches - // so if an abort or stop, there is no way to call them in. - WALKey key = this.appendEmptyEdit(wal); - mvcc.complete(key.getWriteEntry()); - return key.getSequenceId(this.maxWaitForSeqId); + WriteEntry we = mvcc.begin(); + mvcc.completeAndWait(we); + return we.getWriteNumber(); } ////////////////////////////////////////////////////////////////////////////// @@ -2732,6 +2678,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi try { // All edits for the given row (across all column families) must happen atomically. doBatchMutate(put); + } catch (Exception e) { + throw e; } finally { closeRegionOperation(Operation.PUT); } @@ -2742,13 +2690,13 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi * accumulating status codes and tracking the index at which processing * is proceeding. 
*/ - private abstract static class BatchOperationInProgress { + private abstract static class BatchOperation { T[] operations; int nextIndexToProcess = 0; OperationStatus[] retCodeDetails; WALEdit[] walEditsFromCoprocessors; - public BatchOperationInProgress(T[] operations) { + public BatchOperation(T[] operations) { this.operations = operations; this.retCodeDetails = new OperationStatus[operations.length]; this.walEditsFromCoprocessors = new WALEdit[operations.length]; @@ -2768,7 +2716,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } } - private static class MutationBatch extends BatchOperationInProgress { + private static class MutationBatch extends BatchOperation { private long nonceGroup; private long nonce; public MutationBatch(Mutation[] operations, long nonceGroup, long nonce) { @@ -2808,7 +2756,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } } - private static class ReplayBatch extends BatchOperationInProgress { + private static class ReplayBatch extends BatchOperation { private long replaySeqId = 0; public ReplayBatch(MutationReplay[] operations, long seqId) { super(operations); @@ -2894,7 +2842,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi * OperationStatusCode and the exceptionMessage if any. * @throws IOException */ - OperationStatus[] batchMutate(BatchOperationInProgress batchOp) throws IOException { + OperationStatus[] batchMutate(BatchOperation batchOp) throws IOException { boolean initialized = false; Operation op = batchOp.isInReplay() ? Operation.REPLAY_BATCH_MUTATE : Operation.BATCH_MUTATE; startRegionOperation(op); @@ -2908,11 +2856,11 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi if (!initialized) { this.writeRequestsCount.add(batchOp.operations.length); if (!batchOp.isInReplay()) { - doPreMutationHook(batchOp); + doPreBatchMutateHook(batchOp); } initialized = true; } - long addedSize = doMiniBatchMutation(batchOp); + long addedSize = doMiniBatchMutate(batchOp); long newSize = this.addAndGetGlobalMemstoreSize(addedSize); if (isFlushSize(newSize)) { requestFlush(); @@ -2924,8 +2872,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi return batchOp.retCodeDetails; } - - private void doPreMutationHook(BatchOperationInProgress batchOp) + private void doPreBatchMutateHook(BatchOperation batchOp) throws IOException { /* Run coprocessor pre hook outside of locks to avoid deadlock */ WALEdit walEdit = new WALEdit(); @@ -2964,103 +2911,60 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } } + /** + * Called to do a piece of the batch that came in to {@link #batchMutate(Mutation[], long, long)} + * In here we also handle replay of edits on region recover. + * @param batchOp + * @return Change in size brought about by applying batchOp + * @throws IOException + */ @SuppressWarnings("unchecked") - private long doMiniBatchMutation(BatchOperationInProgress batchOp) throws IOException { - boolean isInReplay = batchOp.isInReplay(); - // variable to note if all Put items are for the same CF -- metrics related + // TODO: This needs a rewrite. Doesn't have to be this long. St.Ack 20160120 + private long doMiniBatchMutate(BatchOperation batchOp) throws IOException { + boolean replay = batchOp.isInReplay(); + // Variable to note if all Put items are for the same CF -- metrics related boolean putsCfSetConsistent = true; - //The set of columnFamilies first seen for Put. 
- Set putsCfSet = null; - // variable to note if all Delete items are for the same CF -- metrics related + // Variable to note if all Delete items are for the same CF -- metrics related boolean deletesCfSetConsistent = true; - //The set of columnFamilies first seen for Delete. + // The set of columnFamilies first seen for Put. + Set putsCfSet = null; + // The set of columnFamilies first seen for Delete. Set deletesCfSet = null; - - long currentNonceGroup = HConstants.NO_NONCE, currentNonce = HConstants.NO_NONCE; - WALEdit walEdit = new WALEdit(isInReplay); - MultiVersionConcurrencyControl.WriteEntry writeEntry = null; - long txid = 0; - boolean doRollBackMemstore = false; + long currentNonceGroup = HConstants.NO_NONCE; + long currentNonce = HConstants.NO_NONCE; + WALEdit walEdit = new WALEdit(replay); boolean locked = false; - - /** Keep track of the locks we hold so we can release them in finally clause */ - List acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length); // reference family maps directly so coprocessors can mutate them if desired Map>[] familyMaps = new Map[batchOp.operations.length]; // We try to set up a batch in the range [firstIndex,lastIndexExclusive) int firstIndex = batchOp.nextIndexToProcess; int lastIndexExclusive = firstIndex; boolean success = false; - int noOfPuts = 0, noOfDeletes = 0; - WALKey walKey = null; - long mvccNum = 0; + int noOfPuts = 0; + int noOfDeletes = 0; + WriteEntry writeEntry = null; + /** Keep track of the locks we hold so we can release them in finally clause */ + List acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length); try { - // ------------------------------------ - // STEP 1. Try to acquire as many locks as we can, and ensure - // we acquire at least one. - // ---------------------------------- + // STEP 1. Try to acquire as many locks as we can, and ensure we acquire at least one. int numReadyToWrite = 0; long now = EnvironmentEdgeManager.currentTime(); while (lastIndexExclusive < batchOp.operations.length) { - Mutation mutation = batchOp.getMutation(lastIndexExclusive); - boolean isPutMutation = mutation instanceof Put; - - Map> familyMap = mutation.getFamilyCellMap(); - // store the family map reference to allow for mutations - familyMaps[lastIndexExclusive] = familyMap; - - // skip anything that "ran" already - if (batchOp.retCodeDetails[lastIndexExclusive].getOperationStatusCode() - != OperationStatusCode.NOT_RUN) { - lastIndexExclusive++; - continue; - } - - try { - if (isPutMutation) { - // Check the families in the put. If bad, skip this one. 
- if (isInReplay) { - removeNonExistentColumnFamilyForReplay(familyMap); - } else { - checkFamilies(familyMap.keySet()); - } - checkTimestamps(mutation.getFamilyCellMap(), now); - } else { - prepareDelete((Delete) mutation); - } - checkRow(mutation.getRow(), "doMiniBatchMutation"); - } catch (NoSuchColumnFamilyException nscf) { - LOG.warn("No such column family in batch mutation", nscf); - batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus( - OperationStatusCode.BAD_FAMILY, nscf.getMessage()); - lastIndexExclusive++; - continue; - } catch (FailedSanityCheckException fsce) { - LOG.warn("Batch Mutation did not pass sanity check", fsce); - batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus( - OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage()); - lastIndexExclusive++; - continue; - } catch (WrongRegionException we) { - LOG.warn("Batch mutation had a row that does not belong to this region", we); - batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus( - OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage()); + if (checkBatchOp(batchOp, lastIndexExclusive, familyMaps, now)) { lastIndexExclusive++; continue; } - - // If we haven't got any rows in our batch, we should block to - // get the next one. + Mutation mutation = batchOp.getMutation(lastIndexExclusive); + // If we haven't got any rows in our batch, we should block to get the next one. RowLock rowLock = null; try { rowLock = getRowLock(mutation.getRow(), true); } catch (IOException ioe) { - LOG.warn("Failed getting lock in batch put, row=" - + Bytes.toStringBinary(mutation.getRow()), ioe); + LOG.warn("Failed getting lock, row=" + Bytes.toStringBinary(mutation.getRow()), ioe); } if (rowLock == null) { // We failed to grab another lock - break; // stop acquiring more rows for this batch + break; // Stop acquiring more rows for this batch } else { acquiredRowLocks.add(rowLock); } @@ -3068,9 +2972,9 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi lastIndexExclusive++; numReadyToWrite++; - if (isPutMutation) { + if (mutation instanceof Put) { // If Column Families stay consistent through out all of the - // individual puts then metrics can be reported as a mutliput across + // individual puts then metrics can be reported as a multiput across // column families in the first put. if (putsCfSet == null) { putsCfSet = mutation.getFamilyCellMap().keySet(); @@ -3088,23 +2992,26 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } } - // we should record the timestamp only after we have acquired the rowLock, + // We've now grabbed as many mutations off the list as we can + + // STEP 2. Update any LATEST_TIMESTAMP timestamps + // We should record the timestamp only after we have acquired the rowLock, // otherwise, newer puts/deletes are not guaranteed to have a newer timestamp now = EnvironmentEdgeManager.currentTime(); byte[] byteNow = Bytes.toBytes(now); // Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily? - if (numReadyToWrite <= 0) return 0L; - - // We've now grabbed as many mutations off the list as we can + if (numReadyToWrite <= 0) { + return 0L; + } - // ------------------------------------ - // STEP 2. 
Update any LATEST_TIMESTAMP timestamps - // ---------------------------------- - for (int i = firstIndex; !isInReplay && i < lastIndexExclusive; i++) { + for (int i = firstIndex; !replay && i < lastIndexExclusive; i++) { // skip invalid if (batchOp.retCodeDetails[i].getOperationStatusCode() - != OperationStatusCode.NOT_RUN) continue; + != OperationStatusCode.NOT_RUN) { + // lastIndexExclusive was incremented above. + continue; + } Mutation mutation = batchOp.getMutation(i); if (mutation instanceof Put) { @@ -3121,16 +3028,14 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi locked = true; // calling the pre CP hook for batch mutation - if (!isInReplay && coprocessorHost != null) { + if (!replay && coprocessorHost != null) { MiniBatchOperationInProgress miniBatchOp = new MiniBatchOperationInProgress(batchOp.getMutationsForCoprocs(), batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive); if (coprocessorHost.preBatchMutate(miniBatchOp)) return 0L; } - // ------------------------------------ // STEP 3. Build WAL edit - // ---------------------------------- Durability durability = Durability.USE_DEFAULT; for (int i = firstIndex; i < lastIndexExclusive; i++) { // Skip puts that were determined to be invalid during preprocessing @@ -3148,26 +3053,15 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi continue; } - long nonceGroup = batchOp.getNonceGroup(i), nonce = batchOp.getNonce(i); + long nonceGroup = batchOp.getNonceGroup(i); + long nonce = batchOp.getNonce(i); // In replay, the batch may contain multiple nonces. If so, write WALEdit for each. // Given how nonces are originally written, these should be contiguous. // They don't have to be, it will still work, just write more WALEdits than needed. if (nonceGroup != currentNonceGroup || nonce != currentNonce) { - if (walEdit.size() > 0) { - assert isInReplay; - if (!isInReplay) { - throw new IOException("Multiple nonces per batch and not in replay"); - } - // txid should always increase, so having the one from the last call is ok. - // we use HLogKey here instead of WALKey directly to support legacy coprocessors. - walKey = new ReplayHLogKey(this.getRegionInfo().getEncodedNameAsBytes(), - this.htableDescriptor.getTableName(), now, m.getClusterIds(), - currentNonceGroup, currentNonce, mvcc); - txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(), walKey, - walEdit, true); - walEdit = new WALEdit(isInReplay); - walKey = null; - } + // Write what we have so far for nonces out to WAL + appendCurrentNonces(m, replay, walEdit, now, currentNonceGroup, currentNonce); + walEdit = new WALEdit(replay); currentNonceGroup = nonceGroup; currentNonce = nonce; } @@ -3182,11 +3076,10 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi addFamilyMapToWALEdit(familyMaps[i], walEdit); } - // ------------------------- - // STEP 4. Append the final edit to WAL. Do not sync wal. - // ------------------------- + // STEP 4. Append the final edit to WAL and sync. 
Mutation mutation = batchOp.getMutation(firstIndex); - if (isInReplay) { + WALKey walKey = null; + if (replay) { // use wal key from the original walKey = new ReplayHLogKey(this.getRegionInfo().getEncodedNameAsBytes(), this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now, @@ -3194,95 +3087,70 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi long replaySeqId = batchOp.getReplaySequenceId(); walKey.setOrigLogSeqNum(replaySeqId); } - if (walEdit.size() > 0) { - if (!isInReplay) { - // we use HLogKey here instead of WALKey directly to support legacy coprocessors. - walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(), + // Not sure what is going on here when replay is going on... does the below append get + // called for replayed edits? Am afraid to change it without test. + if (!walEdit.isEmpty()) { + if (!replay) { + // we use HLogKey here instead of WALKey directly to support legacy coprocessors. + walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(), this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now, mutation.getClusterIds(), currentNonceGroup, currentNonce, mvcc); } - txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(), walKey, walEdit, true); + // TODO: Use the doAppend methods below... complicated by the replay stuff above. + try { + long txid = + this.wal.append(this.htableDescriptor, this.getRegionInfo(), walKey, walEdit, true); + if (txid != 0) sync(txid, durability); + writeEntry = walKey.getWriteEntry(); + } catch (IOException ioe) { + if (walKey != null) mvcc.complete(walKey.getWriteEntry()); + throw ioe; + } } - // ------------------------------------ - // Acquire the latest mvcc number - // ---------------------------------- if (walKey == null) { - // If this is a skip wal operation just get the read point from mvcc - walKey = this.appendEmptyEdit(this.wal); - } - if (!isInReplay) { - writeEntry = walKey.getWriteEntry(); - mvccNum = writeEntry.getWriteNumber(); - } else { - mvccNum = batchOp.getReplaySequenceId(); + // If no walKey, then skipping WAL or some such. Being an mvcc transaction so sequenceid. + writeEntry = mvcc.begin(); } - // ------------------------------------ // STEP 5. Write back to memstore - // Write to memstore. It is ok to write to memstore - // first without syncing the WAL because we do not roll - // forward the memstore MVCC. The MVCC will be moved up when - // the complete operation is done. These changes are not yet - // visible to scanners till we update the MVCC. The MVCC is - // moved only when the sync is complete. - // ---------------------------------- long addedSize = 0; for (int i = firstIndex; i < lastIndexExclusive; i++) { - if (batchOp.retCodeDetails[i].getOperationStatusCode() - != OperationStatusCode.NOT_RUN) { + if (batchOp.retCodeDetails[i].getOperationStatusCode() != OperationStatusCode.NOT_RUN) { continue; } - doRollBackMemstore = true; // If we have a failure, we need to clean what we wrote - addedSize += applyFamilyMapToMemstore(familyMaps[i], mvccNum, isInReplay); + addedSize += applyFamilyMapToMemstore(familyMaps[i], replay, + replay? batchOp.getReplaySequenceId(): writeEntry.getWriteNumber()); } - // ------------------------------- - // STEP 6. Release row locks, etc. - // ------------------------------- + // STEP 6. Complete mvcc. + if (replay) mvcc.complete(writeEntry); + else mvcc.completeAndWait(writeEntry); + writeEntry = null; + + // STEP 7. Release row locks, etc. 
if (locked) { this.updatesLock.readLock().unlock(); locked = false; } releaseRowLocks(acquiredRowLocks); - // ------------------------- - // STEP 7. Sync wal. - // ------------------------- - if (txid != 0) { - syncOrDefer(txid, durability); - } - - doRollBackMemstore = false; // calling the post CP hook for batch mutation - if (!isInReplay && coprocessorHost != null) { + if (!replay && coprocessorHost != null) { MiniBatchOperationInProgress miniBatchOp = new MiniBatchOperationInProgress(batchOp.getMutationsForCoprocs(), batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive); coprocessorHost.postBatchMutate(miniBatchOp); } - // ------------------------------------------------------------------ - // STEP 8. Advance mvcc. This will make this put visible to scanners and getters. - // ------------------------------------------------------------------ - if (writeEntry != null) { - mvcc.completeAndWait(writeEntry); - writeEntry = null; - } else if (isInReplay) { - // ensure that the sequence id of the region is at least as big as orig log seq id - mvcc.advanceTo(mvccNum); - } - for (int i = firstIndex; i < lastIndexExclusive; i ++) { if (batchOp.retCodeDetails[i] == OperationStatus.NOT_RUN) { batchOp.retCodeDetails[i] = OperationStatus.SUCCESS; } } - // ------------------------------------ - // STEP 9. Run coprocessor post hooks. This should be done after the wal is + // STEP 8. Run coprocessor post hooks. This should be done after the wal is // synced so that the coprocessor contract is adhered to. - // ------------------------------------ - if (!isInReplay && coprocessorHost != null) { + if (!replay && coprocessorHost != null) { for (int i = firstIndex; i < lastIndexExclusive; i++) { // only for successful puts if (batchOp.retCodeDetails[i].getOperationStatusCode() @@ -3301,18 +3169,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi success = true; return addedSize; } finally { - // if the wal sync was unsuccessful, remove keys from memstore - if (doRollBackMemstore) { - for (int j = 0; j < familyMaps.length; j++) { - for(List cells:familyMaps[j].values()) { - rollbackMemstore(cells); - } - } - if (writeEntry != null) mvcc.complete(writeEntry); - } else if (writeEntry != null) { - mvcc.completeAndWait(writeEntry); - } - + // Call complete rather than completeAndWait because we probably had error if walKey != null + if (writeEntry != null) mvcc.complete(writeEntry); if (locked) { this.updatesLock.readLock().unlock(); } @@ -3357,6 +3215,88 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } } + private void appendCurrentNonces(final Mutation mutation, final boolean replay, + final WALEdit walEdit, final long now, final long currentNonceGroup, final long currentNonce) + throws IOException { + if (walEdit.isEmpty()) return; + if (!replay) throw new IOException("Multiple nonces per batch and not in replay"); + WALKey walKey = new WALKey(this.getRegionInfo().getEncodedNameAsBytes(), + this.htableDescriptor.getTableName(), now, mutation.getClusterIds(), + currentNonceGroup, currentNonce, mvcc); + this.wal.append(this.htableDescriptor, this.getRegionInfo(), walKey, walEdit, true); + // Complete the mvcc transaction started down in append else it will block others + this.mvcc.complete(walKey.getWriteEntry()); + } + + private boolean checkBatchOp(BatchOperation batchOp, final int lastIndexExclusive, + final Map>[] familyMaps, final long now) + throws IOException { + boolean skip = false; + // Skip 
anything that "ran" already + if (batchOp.retCodeDetails[lastIndexExclusive].getOperationStatusCode() + != OperationStatusCode.NOT_RUN) { + return true; + } + Mutation mutation = batchOp.getMutation(lastIndexExclusive); + Map> familyMap = mutation.getFamilyCellMap(); + // store the family map reference to allow for mutations + familyMaps[lastIndexExclusive] = familyMap; + + try { + if (mutation instanceof Put) { + // Check the families in the put. If bad, skip this one. + if (batchOp.isInReplay()) { + removeNonExistentColumnFamilyForReplay(familyMap); + } else { + checkFamilies(familyMap.keySet()); + } + checkTimestamps(mutation.getFamilyCellMap(), now); + } else { + prepareDelete((Delete)mutation); + } + checkRow(mutation.getRow(), "doMiniBatchMutation"); + } catch (NoSuchColumnFamilyException nscf) { + LOG.warn("No such column family in batch mutation", nscf); + batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus( + OperationStatusCode.BAD_FAMILY, nscf.getMessage()); + skip = true; + } catch (FailedSanityCheckException fsce) { + LOG.warn("Batch Mutation did not pass sanity check", fsce); + batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus( + OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage()); + skip = true; + } catch (WrongRegionException we) { + LOG.warn("Batch mutation had a row that does not belong to this region", we); + batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus( + OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage()); + skip = true; + } + return skip; + } + + /** + * During replay, there could exist column families which are removed between region server + * failure and replay + */ + private void removeNonExistentColumnFamilyForReplay(final Map> familyMap) { + List nonExistentList = null; + for (byte[] family : familyMap.keySet()) { + if (!this.htableDescriptor.hasFamily(family)) { + if (nonExistentList == null) { + nonExistentList = new ArrayList(); + } + nonExistentList.add(family); + } + } + if (nonExistentList != null) { + for (byte[] family : nonExistentList) { + // Perhaps schema was changed between crash and replay + LOG.info("No family for " + Bytes.toString(family) + " omit from reply."); + familyMap.remove(family); + } + } + } + /** * Returns effective durability from the passed durability and * the table descriptor. @@ -3365,93 +3305,82 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi return d == Durability.USE_DEFAULT ? this.durability : d; } - //TODO, Think that gets/puts and deletes should be refactored a bit so that - //the getting of the lock happens before, so that you would just pass it into - //the methods. 
So in the case of checkAndMutate you could just do lockRow, - //get, put, unlockRow or something - @Override public boolean checkAndMutate(byte [] row, byte [] family, byte [] qualifier, - CompareOp compareOp, ByteArrayComparable comparator, Mutation w, + CompareOp compareOp, ByteArrayComparable comparator, Mutation mutation, boolean writeToWAL) throws IOException{ + checkMutationType(mutation, row); + return doCheckAndRowMutate(row, family, qualifier, compareOp, comparator, null, + mutation, writeToWAL); + } + + @Override + public boolean checkAndRowMutate(byte [] row, byte [] family, byte [] qualifier, + CompareOp compareOp, ByteArrayComparable comparator, RowMutations rm, + boolean writeToWAL) + throws IOException { + return doCheckAndRowMutate(row, family, qualifier, compareOp, comparator, rm, null, + writeToWAL); + } + + /** + * checkAndMutate and checkAndRowMutate are 90% the same. Rather than copy/paste, below has + * switches in the few places where there is deviation. + */ + private boolean doCheckAndRowMutate(byte [] row, byte [] family, byte [] qualifier, + CompareOp compareOp, ByteArrayComparable comparator, RowMutations rowMutations, + Mutation mutation, boolean writeToWAL) + throws IOException { + // Could do the below checks but seems wacky with two callers only. Just comment out for now. + // One caller passes a Mutation, the other passes RowMutation. Presume all good so we don't + // need these commented out checks. + // if (rowMutations == null && mutation == null) throw new DoNotRetryIOException("Both null"); + // if (rowMutations != null && mutation != null) throw new DoNotRetryIOException("Both set"); checkReadOnly(); - //TODO, add check for value length or maybe even better move this to the - //client if this becomes a global setting + // TODO, add check for value length also move this check to the client checkResources(); - boolean isPut = w instanceof Put; - if (!isPut && !(w instanceof Delete)) - throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must " + - "be Put or Delete"); - if (!Bytes.equals(row, w.getRow())) { - throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's " + - "getRow must match the passed row"); - } - startRegionOperation(); try { Get get = new Get(row); checkFamily(family); get.addColumn(family, qualifier); - // Lock row - note that doBatchMutate will relock this row if called RowLock rowLock = getRowLock(get.getRow()); - // wait for all previous transactions to complete (with lock held) - mvcc.await(); try { - if (this.getCoprocessorHost() != null) { + if (mutation != null && this.getCoprocessorHost() != null) { + // Call coprocessor. Boolean processed = null; - if (w instanceof Put) { + if (mutation instanceof Put) { processed = this.getCoprocessorHost().preCheckAndPutAfterRowLock(row, family, - qualifier, compareOp, comparator, (Put) w); - } else if (w instanceof Delete) { + qualifier, compareOp, comparator, (Put)mutation); + } else if (mutation instanceof Delete) { processed = this.getCoprocessorHost().preCheckAndDeleteAfterRowLock(row, family, - qualifier, compareOp, comparator, (Delete) w); + qualifier, compareOp, comparator, (Delete)mutation); } if (processed != null) { return processed; } } + // NOTE: We used to wait here until mvcc caught up: mvcc.await(); + // Supposition is that now all changes are done under row locks, then when we go to read, + // we'll get the latest on this row. 
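For orientation, the two public entry points that now funnel into doCheckAndRowMutate map to these client calls. A rough usage sketch, assuming a table 't' with family 'f' and default configuration on the classpath (all names made up):

    import org.apache.hadoop.hbase.TableName;
    import org.apache.hadoop.hbase.client.Connection;
    import org.apache.hadoop.hbase.client.ConnectionFactory;
    import org.apache.hadoop.hbase.client.Delete;
    import org.apache.hadoop.hbase.client.Put;
    import org.apache.hadoop.hbase.client.RowMutations;
    import org.apache.hadoop.hbase.client.Table;
    import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
    import org.apache.hadoop.hbase.util.Bytes;

    public class CheckAndMutateSketch {
      public static void main(String[] args) throws Exception {
        byte[] row = Bytes.toBytes("r1"), cf = Bytes.toBytes("f"), q = Bytes.toBytes("q");
        try (Connection conn = ConnectionFactory.createConnection();
             Table table = conn.getTable(TableName.valueOf("t"))) {
          // checkAndPut: one guarded Mutation; served by checkAndMutate -> doCheckAndRowMutate.
          Put put = new Put(row).addColumn(cf, q, Bytes.toBytes("new"));
          boolean putApplied = table.checkAndPut(row, cf, q, Bytes.toBytes("old"), put);

          // checkAndMutate with RowMutations: several edits to the row applied atomically;
          // served by checkAndRowMutate -> doCheckAndRowMutate.
          RowMutations rm = new RowMutations(row);
          rm.add(new Put(row).addColumn(cf, q, Bytes.toBytes("v2")));
          rm.add(new Delete(row).addColumns(cf, Bytes.toBytes("stale")));
          boolean rmApplied = table.checkAndMutate(row, cf, q, CompareOp.EQUAL,
              Bytes.toBytes("new"), rm);
          System.out.println(putApplied + " / " + rmApplied);
        }
      }
    }

Either way the guarded read-compare-mutate runs under the row lock taken above; the only difference is whether a single Mutation or a RowMutations batch is applied once the check passes.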
List result = get(get, false); - - boolean valueIsNull = comparator.getValue() == null || - comparator.getValue().length == 0; + boolean valueIsNull = comparator.getValue() == null || comparator.getValue().length == 0; boolean matches = false; long cellTs = 0; if (result.size() == 0 && valueIsNull) { matches = true; - } else if (result.size() > 0 && result.get(0).getValueLength() == 0 && - valueIsNull) { + } else if (result.size() > 0 && result.get(0).getValueLength() == 0 && valueIsNull) { matches = true; cellTs = result.get(0).getTimestamp(); } else if (result.size() == 1 && !valueIsNull) { Cell kv = result.get(0); cellTs = kv.getTimestamp(); int compareResult = CellComparator.compareValue(kv, comparator); - switch (compareOp) { - case LESS: - matches = compareResult < 0; - break; - case LESS_OR_EQUAL: - matches = compareResult <= 0; - break; - case EQUAL: - matches = compareResult == 0; - break; - case NOT_EQUAL: - matches = compareResult != 0; - break; - case GREATER_OR_EQUAL: - matches = compareResult >= 0; - break; - case GREATER: - matches = compareResult > 0; - break; - default: - throw new RuntimeException("Unknown Compare op " + compareOp.name()); - } + matches = matches(compareOp, compareResult); } - //If matches put the new put or delete the new delete + // If matches put the new put or delete the new delete if (matches) { // We have acquired the row lock already. If the system clock is NOT monotonically // non-decreasing (see HBASE-14070) we should make sure that the mutation has a @@ -3460,16 +3389,27 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi long now = EnvironmentEdgeManager.currentTime(); long ts = Math.max(now, cellTs); // ensure write is not eclipsed byte[] byteTs = Bytes.toBytes(ts); - - if (w instanceof Put) { - updateCellTimestamps(w.getFamilyCellMap().values(), byteTs); + if (mutation != null) { + if (mutation instanceof Put) { + updateCellTimestamps(mutation.getFamilyCellMap().values(), byteTs); + } + // And else 'delete' is not needed since it already does a second get, and sets the + // timestamp from get (see prepareDeleteTimestamps). + } else { + for (Mutation m: rowMutations.getMutations()) { + if (m instanceof Put) { + updateCellTimestamps(m.getFamilyCellMap().values(), byteTs); + } + } + // And else 'delete' is not needed since it already does a second get, and sets the + // timestamp from get (see prepareDeleteTimestamps). + } + // All edits for the given row (across all column families) must happen atomically. + if (mutation != null) { + doBatchMutate(mutation); + } else { + mutateRow(rowMutations); } - // else delete is not needed since it already does a second get, and sets the timestamp - // from get (see prepareDeleteTimestamps). - - // All edits for the given row (across all column families) must - // happen atomically. - doBatchMutate(w); this.checkAndMutateChecksPassed.increment(); return true; } @@ -3483,113 +3423,54 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } } - //TODO, Think that gets/puts and deletes should be refactored a bit so that - //the getting of the lock happens before, so that you would just pass it into - //the methods. 
So in the case of checkAndMutate you could just do lockRow, - //get, put, unlockRow or something - - @Override - public boolean checkAndRowMutate(byte [] row, byte [] family, byte [] qualifier, - CompareOp compareOp, ByteArrayComparable comparator, RowMutations rm, - boolean writeToWAL) throws IOException { - checkReadOnly(); - //TODO, add check for value length or maybe even better move this to the - //client if this becomes a global setting - checkResources(); + private void checkMutationType(final Mutation mutation, final byte [] row) + throws DoNotRetryIOException { + boolean isPut = mutation instanceof Put; + if (!isPut && !(mutation instanceof Delete)) { + throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must be Put or Delete"); + } + if (!Bytes.equals(row, mutation.getRow())) { + throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's getRow must match"); + } + } - startRegionOperation(); - try { - Get get = new Get(row); - checkFamily(family); - get.addColumn(family, qualifier); + private boolean matches(final CompareOp compareOp, final int compareResult) { + boolean matches = false; + switch (compareOp) { + case LESS: + matches = compareResult < 0; + break; + case LESS_OR_EQUAL: + matches = compareResult <= 0; + break; + case EQUAL: + matches = compareResult == 0; + break; + case NOT_EQUAL: + matches = compareResult != 0; + break; + case GREATER_OR_EQUAL: + matches = compareResult >= 0; + break; + case GREATER: + matches = compareResult > 0; + break; + default: + throw new RuntimeException("Unknown Compare op " + compareOp.name()); + } + return matches; + } - // Lock row - note that doBatchMutate will relock this row if called - RowLock rowLock = getRowLock(get.getRow()); - // wait for all previous transactions to complete (with lock held) - mvcc.await(); - try { - List result = get(get, false); - boolean valueIsNull = comparator.getValue() == null || - comparator.getValue().length == 0; - boolean matches = false; - long cellTs = 0; - if (result.size() == 0 && valueIsNull) { - matches = true; - } else if (result.size() > 0 && result.get(0).getValueLength() == 0 && - valueIsNull) { - matches = true; - cellTs = result.get(0).getTimestamp(); - } else if (result.size() == 1 && !valueIsNull) { - Cell kv = result.get(0); - cellTs = kv.getTimestamp(); - int compareResult = CellComparator.compareValue(kv, comparator); - switch (compareOp) { - case LESS: - matches = compareResult < 0; - break; - case LESS_OR_EQUAL: - matches = compareResult <= 0; - break; - case EQUAL: - matches = compareResult == 0; - break; - case NOT_EQUAL: - matches = compareResult != 0; - break; - case GREATER_OR_EQUAL: - matches = compareResult >= 0; - break; - case GREATER: - matches = compareResult > 0; - break; - default: - throw new RuntimeException("Unknown Compare op " + compareOp.name()); - } - } - //If matches put the new put or delete the new delete - if (matches) { - // We have acquired the row lock already. If the system clock is NOT monotonically - // non-decreasing (see HBASE-14070) we should make sure that the mutation has a - // larger timestamp than what was observed via Get. doBatchMutate already does this, but - // there is no way to pass the cellTs. See HBASE-14054. 
- long now = EnvironmentEdgeManager.currentTime(); - long ts = Math.max(now, cellTs); // ensure write is not eclipsed - byte[] byteTs = Bytes.toBytes(ts); - - for (Mutation w : rm.getMutations()) { - if (w instanceof Put) { - updateCellTimestamps(w.getFamilyCellMap().values(), byteTs); - } - // else delete is not needed since it already does a second get, and sets the timestamp - // from get (see prepareDeleteTimestamps). - } - - // All edits for the given row (across all column families) must - // happen atomically. - mutateRow(rm); - this.checkAndMutateChecksPassed.increment(); - return true; - } - this.checkAndMutateChecksFailed.increment(); - return false; - } finally { - rowLock.release(); - } - } finally { - closeRegionOperation(); - } - } - - private void doBatchMutate(Mutation mutation) throws IOException { - // Currently this is only called for puts and deletes, so no nonces. - OperationStatus[] batchMutate = this.batchMutate(new Mutation[]{mutation}); - if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) { - throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg()); - } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) { - throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg()); - } - } + private void doBatchMutate(Mutation mutation) throws IOException { + // Currently this is only called for puts and deletes, so no nonces. + OperationStatus[] batchMutate = this.batchMutate(new Mutation[]{mutation}); + if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) { + throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg()); + } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) { + throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg()); + } + } /** * Complete taking the snapshot on the region. Writes the region info and adds references to the @@ -3651,40 +3532,19 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi void rewriteCellTags(Map> familyMap, final Mutation m) { // Check if we have any work to do and early out otherwise // Update these checks as more logic is added here - if (m.getTTL() == Long.MAX_VALUE) { return; } // From this point we know we have some work to do - for (Map.Entry> e: familyMap.entrySet()) { List cells = e.getValue(); assert cells instanceof RandomAccess; int listSize = cells.size(); for (int i = 0; i < listSize; i++) { Cell cell = cells.get(i); - List newTags = new ArrayList(); - Iterator tagIterator = CellUtil.tagsIterator(cell); - - // Carry forward existing tags - - while (tagIterator.hasNext()) { - - // Add any filters or tag specific rewrites here - - newTags.add(tagIterator.next()); - } - - // Cell TTL handling - - // Check again if we need to add a cell TTL because early out logic - // above may change when there are more tag based features in core. - if (m.getTTL() != Long.MAX_VALUE) { - // Add a cell TTL tag - newTags.add(new ArrayBackedTag(TagType.TTL_TAG_TYPE, Bytes.toBytes(m.getTTL()))); - } - + List newTags = TagUtil.carryForwardTags(null, cell); + newTags = TagUtil.carryForwardTTLTag(newTags, m.getTTL()); // Rewrite the cell with the updated set of tags cells.set(i, new TagRewriteCell(cell, TagUtil.fromList(newTags))); } @@ -3760,49 +3620,63 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi * should already have locked updatesLock.readLock(). 
This also does * not check the families for validity. * - * @param familyMap Map of kvs per family - * @param mvccNum The MVCC for this transaction. - * @param isInReplay true when adding replayed KVs into memstore - * @return the additional memory usage of the memstore caused by the - * new entries. + * @param familyMap Map of Cells by family + * @return the additional memory usage of the memstore caused by the new entries. */ - private long applyFamilyMapToMemstore(Map> familyMap, - long mvccNum, boolean isInReplay) throws IOException { + private long applyFamilyMapToMemstore(Map> familyMap, boolean replay, + long sequenceId) + throws IOException { long size = 0; - for (Map.Entry> e : familyMap.entrySet()) { byte[] family = e.getKey(); List cells = e.getValue(); assert cells instanceof RandomAccess; - Store store = getStore(family); - int listSize = cells.size(); - for (int i=0; i < listSize; i++) { + size += applyToMemstore(getStore(family), cells, false, replay, sequenceId); + } + return size; + } + + /** + * @param delta If we are doing delta changes -- e.g. increment/append -- then this flag will be + * set; when set we will run operations that make sense in the increment/append scenario but + * that do not make sense otherwise. + * @return Memstore change in size on insert of these Cells. + * @throws IOException + * @see #applyToMemStore(Store, Cell, long) + */ + private long applyToMemstore(final Store store, final List cells, + final boolean delta, boolean replay, long sequenceId) + throws IOException { + // Any change in how we update Store/MemStore needs to also be done in other applyToMemstore!!!! + long size = 0; + boolean upsert = delta && store.getFamily().getMaxVersions() == 1; + int count = cells.size(); + if (upsert) { + size += store.upsert(cells, getSmallestReadPoint()); + } else { + for (int i = 0; i < count; i++) { Cell cell = cells.get(i); - if (cell.getSequenceId() == 0 || isInReplay) { - CellUtil.setSequenceId(cell, mvccNum); + if (cell.getSequenceId() == 0 || replay) { + CellUtil.setSequenceId(cell, sequenceId); } size += store.add(cell); } } - - return size; - } + return size; + } /** - * Remove all the keys listed in the map from the memstore. This method is - * called when a Put/Delete has updated memstore but subsequently fails to update - * the wal. This method is then invoked to rollback the memstore. + * @return Memstore change in size on insert of these Cells. + * @see #applyToMemstore(Store, List, boolean, boolean, long) */ - private void rollbackMemstore(List memstoreCells) { - int kvsRolledback = 0; - - for (Cell cell : memstoreCells) { - byte[] family = CellUtil.cloneFamily(cell); - Store store = getStore(family); - store.rollback(cell); - kvsRolledback++; + private long applyToMemstore(final Store store, final Cell cell, long sequenceId) + throws IOException { + // Any change in how we update Store/MemStore needs to also be done in other applyToMemstore!!!! 
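The delta flavour of applyToMemstore above switches on the family's VERSIONS setting: a single-version family takes Store.upsert (the cell is replaced in place), anything else stamps the cells with the sequence id and adds them as new versions. That knob lives in the table schema; a small sketch of the two shapes (hypothetical table and family names):

    import org.apache.hadoop.hbase.HColumnDescriptor;
    import org.apache.hadoop.hbase.HTableDescriptor;
    import org.apache.hadoop.hbase.TableName;
    import org.apache.hadoop.hbase.util.Bytes;

    public class CounterSchemaSketch {
      public static void main(String[] args) {
        // Counter-style family: one version only, so increments/appends go through upsert.
        HColumnDescriptor counters = new HColumnDescriptor(Bytes.toBytes("c")).setMaxVersions(1);
        // History-keeping family: multiple versions, so deltas are added as new cells.
        HColumnDescriptor history = new HColumnDescriptor(Bytes.toBytes("h")).setMaxVersions(5);
        HTableDescriptor table = new HTableDescriptor(TableName.valueOf("counters"));
        table.addFamily(counters);
        table.addFamily(history);
        System.out.println(table);
      }
    }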
+ if (store == null) { + checkFamily(CellUtil.cloneFamily(cell)); + // Unreachable because checkFamily will throw exception } - LOG.debug("rollbackMemstore rolled back " + kvsRolledback); + return store.add(cell); } @Override @@ -3812,30 +3686,6 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } } - /** - * During replay, there could exist column families which are removed between region server - * failure and replay - */ - private void removeNonExistentColumnFamilyForReplay( - final Map> familyMap) { - List nonExistentList = null; - for (byte[] family : familyMap.keySet()) { - if (!this.htableDescriptor.hasFamily(family)) { - if (nonExistentList == null) { - nonExistentList = new ArrayList(); - } - nonExistentList.add(family); - } - } - if (nonExistentList != null) { - for (byte[] family : nonExistentList) { - // Perhaps schema was changed between crash and replay - LOG.info("No family for " + Bytes.toString(family) + " omit from reply."); - familyMap.remove(family); - } - } - } - @Override public void checkTimestamps(final Map> familyMap, long now) throws FailedSanityCheckException { @@ -5477,12 +5327,12 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi return true; } finally { if (wal != null && !storeFiles.isEmpty()) { - // write a bulk load event when not all hfiles are loaded + // @rite a bulk load event when not all hfiles are loaded try { WALProtos.BulkLoadDescriptor loadDescriptor = ProtobufUtil.toBulkLoadDescriptor( this.getRegionInfo().getTable(), ByteStringer.wrap(this.getRegionInfo().getEncodedNameAsBytes()), storeFiles, seqId); - WALUtil.writeBulkLoadMarkerAndSync(wal, this.htableDescriptor, getRegionInfo(), + WALUtil.writeBulkLoadMarkerAndSync(this.wal, getTableDesc(), getRegionInfo(), loadDescriptor, mvcc); } catch (IOException ioe) { if (this.rsServices != null) { @@ -5580,7 +5430,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi // getSmallestReadPoint, before scannerReadPoints is updated. IsolationLevel isolationLevel = scan.getIsolationLevel(); synchronized(scannerReadPoints) { - this.readPt = getReadpoint(isolationLevel); + this.readPt = getReadPoint(isolationLevel); scannerReadPoints.put(this, this.readPt); } @@ -5744,7 +5594,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } // As the data is obtained from two independent heaps, we need to // ensure that result list is sorted, because Result relies on that. - Collections.sort(results, comparator); + sort(results, comparator); return moreValues; } @@ -6857,7 +6707,6 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi @Override public void processRowsWithLocks(RowProcessor processor, long timeout, long nonceGroup, long nonce) throws IOException { - for (byte[] row : processor.getRowsToLock()) { checkRow(row, "processRowsWithLocks"); } @@ -6865,23 +6714,16 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi checkReadOnly(); } checkResources(); - startRegionOperation(); WALEdit walEdit = new WALEdit(); - // 1. Run pre-process hook - try { - processor.preProcess(this, walEdit); - } catch (IOException e) { - closeRegionOperation(); - throw e; - } + // STEP 1. 
Run pre-process hook + preProcess(processor, walEdit); // Short circuit the read only case if (processor.readOnly()) { try { long now = EnvironmentEdgeManager.currentTime(); - doProcessRowWithTimeout( - processor, now, this, null, null, timeout); + doProcessRowWithTimeout(processor, now, this, null, null, timeout); processor.postProcess(this, walEdit, true); } finally { closeRegionOperation(); @@ -6889,118 +6731,81 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi return; } - MultiVersionConcurrencyControl.WriteEntry writeEntry = null; boolean locked; - boolean walSyncSuccessful = false; List acquiredRowLocks; long addedSize = 0; List mutations = new ArrayList(); Collection rowsToLock = processor.getRowsToLock(); - long mvccNum = 0; - WALKey walKey = null; + // This is assigned by mvcc either explicity in the below or in the guts of the WAL append + // when it assigns the edit a sequencedid (A.K.A the mvcc write number). + WriteEntry writeEntry = null; try { - // 2. Acquire the row lock(s) + // STEP 2. Acquire the row lock(s) acquiredRowLocks = new ArrayList(rowsToLock.size()); for (byte[] row : rowsToLock) { // Attempt to lock all involved rows, throw if any lock times out // use a writer lock for mixed reads and writes acquiredRowLocks.add(getRowLock(row)); } - // 3. Region lock + // STEP 3. Region lock lock(this.updatesLock.readLock(), acquiredRowLocks.size() == 0 ? 1 : acquiredRowLocks.size()); locked = true; - + boolean success = false; long now = EnvironmentEdgeManager.currentTime(); try { - // 4. Let the processor scan the rows, generate mutations and add - // waledits - doProcessRowWithTimeout( - processor, now, this, mutations, walEdit, timeout); - + // STEP 4. Let the processor scan the rows, generate mutations and add waledits + doProcessRowWithTimeout(processor, now, this, mutations, walEdit, timeout); if (!mutations.isEmpty()) { - - // 5. Call the preBatchMutate hook + // STEP 5. Call the preBatchMutate hook processor.preBatchMutate(this, walEdit); - long txid = 0; - // 6. Append no sync + // STEP 6. Append and sync if walEdit has data to write out. if (!walEdit.isEmpty()) { - // we use HLogKey here instead of WALKey directly to support legacy coprocessors. - walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(), - this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now, - processor.getClusterIds(), nonceGroup, nonce, mvcc); - txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(), - walKey, walEdit, false); - } - if(walKey == null){ - // since we use wal sequence Id as mvcc, for SKIP_WAL changes we need a "faked" WALEdit - // to get a sequence id assigned which is done by FSWALEntry#stampRegionSequenceId - walKey = this.appendEmptyEdit(this.wal); + writeEntry = doAppend(walEdit, getEffectiveDurability(processor.useDurability()), + processor.getClusterIds(), now, nonceGroup, nonce); + } else { + // We are here if WAL is being skipped. + writeEntry = this.mvcc.begin(); } - // 7. Start mvcc transaction - writeEntry = walKey.getWriteEntry(); - mvccNum = walKey.getSequenceId(); - - - - // 8. Apply to memstore + // STEP 7. Apply to memstore + long sequenceId = writeEntry.getWriteNumber(); for (Mutation m : mutations) { - // Handle any tag based cell features + // Handle any tag based cell features. + // TODO: Do we need to call rewriteCellTags down in applyToMemstore()? Why not before + // so tags go into WAL? 
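As the early-out in rewriteCellTags implies, there is tag work to do only when the mutation carries a per-mutation TTL, which then rides along as a TTL_TAG_TYPE tag on every cell it touches. On the client side that TTL is set like this (a sketch with made-up names; the value is in milliseconds):

    import org.apache.hadoop.hbase.client.Put;
    import org.apache.hadoop.hbase.util.Bytes;

    public class CellTtlSketch {
      public static void main(String[] args) {
        Put put = new Put(Bytes.toBytes("r1"));
        put.addColumn(Bytes.toBytes("f"), Bytes.toBytes("q"), Bytes.toBytes("v"));
        // One hour. Server side this becomes a TTL_TAG_TYPE tag on each cell (rewriteCellTags),
        // so the cell expires independently of the column family TTL.
        put.setTTL(60 * 60 * 1000L);
        System.out.println(put.getTTL());
      }
    }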
rewriteCellTags(m.getFamilyCellMap(), m); - for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) { Cell cell = cellScanner.current(); - CellUtil.setSequenceId(cell, mvccNum); - Store store = getStore(cell); - if (store == null) { - checkFamily(CellUtil.cloneFamily(cell)); - // unreachable + if (walEdit.isEmpty()) { + // If walEdit is empty, we put nothing in WAL. WAL stamps Cells with sequence id. + // If no WAL, need to stamp it here. + CellUtil.setSequenceId(cell, sequenceId); } - addedSize += store.add(cell); + Store store = getStore(cell); + addedSize += applyToMemstore(store, cell, sequenceId); } } + // STEP 8. Complete mvcc. + mvcc.completeAndWait(writeEntry); + writeEntry = null; - // 9. Release region lock + // STEP 9. Release region lock if (locked) { this.updatesLock.readLock().unlock(); locked = false; } - // 10. Release row lock(s) + // STEP 10. Release row lock(s) releaseRowLocks(acquiredRowLocks); - // 11. Sync edit log - if (txid != 0) { - syncOrDefer(txid, getEffectiveDurability(processor.useDurability())); - } - walSyncSuccessful = true; - // 12. call postBatchMutate hook + // STEP 11. call postBatchMutate hook processor.postBatchMutate(this); } + success = true; } finally { - // TODO: Make this method look like all other methods that are doing append/sync and - // memstore rollback such as append and doMiniBatchMutation. Currently it is a little - // different. Make them all share same code! - if (!mutations.isEmpty() && !walSyncSuccessful) { - LOG.warn("Wal sync failed. Roll back " + mutations.size() + - " memstore keyvalues for row(s):" + StringUtils.byteToHexString( - processor.getRowsToLock().iterator().next()) + "..."); - for (Mutation m : mutations) { - for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) { - Cell cell = cellScanner.current(); - getStore(cell).rollback(cell); - } - } - if (writeEntry != null) { - mvcc.complete(writeEntry); - writeEntry = null; - } - } - // 13. Roll mvcc forward - if (writeEntry != null) { - mvcc.completeAndWait(writeEntry); - } + // Call complete rather than completeAndWait because we probably had error if walKey != null + if (writeEntry != null) mvcc.complete(writeEntry); if (locked) { this.updatesLock.readLock().unlock(); } @@ -7008,18 +6813,26 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi releaseRowLocks(acquiredRowLocks); } - // 14. Run post-process hook - processor.postProcess(this, walEdit, walSyncSuccessful); - + // 12. Run post-process hook + processor.postProcess(this, walEdit, success); } finally { closeRegionOperation(); - if (!mutations.isEmpty() && - isFlushSize(this.addAndGetGlobalMemstoreSize(addedSize))) { + if (!mutations.isEmpty() && isFlushSize(this.addAndGetGlobalMemstoreSize(addedSize))) { requestFlush(); } } } + private void preProcess(final RowProcessor processor, final WALEdit walEdit) + throws IOException { + try { + processor.preProcess(this, walEdit); + } catch (IOException e) { + closeRegionOperation(); + throw e; + } + } + private void doProcessRowWithTimeout(final RowProcessor processor, final long now, final HRegion region, @@ -7070,500 +6883,393 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } } - /** - * @return The passed-in {@code tags} but with the tags from {@code cell} added. - */ - private static List carryForwardTags(final Cell cell, final List tags) { - if (cell.getTagsLength() <= 0) return tags; - List newTags = tags == null? 
new ArrayList(): /*Append Tags*/tags; - Iterator i = CellUtil.tagsIterator(cell); - while (i.hasNext()) newTags.add(i.next()); - return newTags; + public Result append(Append append) throws IOException { + return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE); } - /** - * Run a Get against passed in store on passed row, etc. - * @return Get result. - */ - private List doGet(final Store store, final byte [] row, - final Map.Entry> family, final TimeRange tr) - throws IOException { - // Sort the cells so that they match the order that they - // appear in the Get results. Otherwise, we won't be able to - // find the existing values if the cells are not specified - // in order by the client since cells are in an array list. - Collections.sort(family.getValue(), store.getComparator()); - // Get previous values for all columns in this family - Get get = new Get(row); - for (Cell cell : family.getValue()) { - get.addColumn(family.getKey(), CellUtil.cloneQualifier(cell)); - } - if (tr != null) get.setTimeRange(tr.getMin(), tr.getMax()); - return get(get, false); + @Override + public Result append(Append mutation, long nonceGroup, long nonce) throws IOException { + return doDelta(Operation.APPEND, mutation, nonceGroup, nonce, mutation.isReturnResults()); } - public Result append(Append append) throws IOException { - return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE); + public Result increment(Increment increment) throws IOException { + return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE); } - // TODO: There's a lot of boiler plate code identical to increment. - // We should refactor append and increment as local get-mutate-put - // transactions, so all stores only go through one code path for puts. - @Override - public Result append(Append mutate, long nonceGroup, long nonce) throws IOException { - Operation op = Operation.APPEND; - byte[] row = mutate.getRow(); - checkRow(row, op.toString()); - checkFamilies(mutate.getFamilyCellMap().keySet()); - boolean flush = false; - Durability durability = getEffectiveDurability(mutate.getDurability()); - boolean writeToWAL = durability != Durability.SKIP_WAL; - WALEdit walEdits = null; - List allKVs = new ArrayList(mutate.size()); - Map> tempMemstore = new HashMap>(); - long size = 0; - long txid = 0; + public Result increment(Increment mutation, long nonceGroup, long nonce) + throws IOException { + return doDelta(Operation.INCREMENT, mutation, nonceGroup, nonce, mutation.isReturnResults()); + } + + /** + * Add "deltas" to Cells. Deltas are increments or appends. Switch on op. + * + *

If increment, add deltas to current values or if an append, then + * append the deltas to the current Cell values. + * + *

Append and Increment code paths are mostly the same. They differ in just a few places. + * This method does the code path for increment and append and then in key spots, switches + * on the passed in op to do increment or append specific paths. + */ + private Result doDelta(Operation op, Mutation mutation, long nonceGroup, long nonce, + boolean returnResults) + throws IOException { checkReadOnly(); checkResources(); - // Lock row - startRegionOperation(op); + checkRow(mutation.getRow(), op.toString()); + checkFamilies(mutation.getFamilyCellMap().keySet()); this.writeRequestsCount.increment(); - RowLock rowLock = null; - WALKey walKey = null; - MultiVersionConcurrencyControl.WriteEntry writeEntry = null; - boolean doRollBackMemstore = false; + WriteEntry writeEntry = null; + startRegionOperation(op); + long accumulatedResultSize = 0; + List results = returnResults? new ArrayList(mutation.size()): null; + RowLock rowLock = getRowLock(mutation.getRow()); try { - rowLock = getRowLock(row); - assert rowLock != null; + lock(this.updatesLock.readLock()); try { - lock(this.updatesLock.readLock()); - try { - // Wait for all prior MVCC transactions to finish - while we hold the row lock - // (so that we are guaranteed to see the latest state when we do our Get) - mvcc.await(); - if (this.coprocessorHost != null) { - Result r = this.coprocessorHost.preAppendAfterRowLock(mutate); - if (r!= null) { - return r; - } - } - long now = EnvironmentEdgeManager.currentTime(); - // Process each family - for (Map.Entry> family : mutate.getFamilyCellMap().entrySet()) { - Store store = stores.get(family.getKey()); - List kvs = new ArrayList(family.getValue().size()); - - List results = doGet(store, row, family, null); - - // Iterate the input columns and update existing values if they were - // found, otherwise add new column initialized to the append value - - // Avoid as much copying as possible. We may need to rewrite and - // consolidate tags. Bytes are only copied once. 
- // Would be nice if KeyValue had scatter/gather logic - int idx = 0; - for (Cell cell : family.getValue()) { - Cell newCell; - Cell oldCell = null; - if (idx < results.size() - && CellUtil.matchingQualifier(results.get(idx), cell)) { - oldCell = results.get(idx); - long ts = Math.max(now, oldCell.getTimestamp()); - - // Process cell tags - // Make a union of the set of tags in the old and new KVs - List newTags = carryForwardTags(oldCell, new ArrayList()); - newTags = carryForwardTags(cell, newTags); - - // Cell TTL handling - - if (mutate.getTTL() != Long.MAX_VALUE) { - // Add the new TTL tag - newTags.add( - new ArrayBackedTag(TagType.TTL_TAG_TYPE, Bytes.toBytes(mutate.getTTL()))); - } - - // Rebuild tags - byte[] tagBytes = TagUtil.fromList(newTags); - - // allocate an empty cell once - newCell = new KeyValue(row.length, cell.getFamilyLength(), - cell.getQualifierLength(), ts, KeyValue.Type.Put, - oldCell.getValueLength() + cell.getValueLength(), - tagBytes.length); - // copy in row, family, and qualifier - System.arraycopy(cell.getRowArray(), cell.getRowOffset(), - newCell.getRowArray(), newCell.getRowOffset(), cell.getRowLength()); - System.arraycopy(cell.getFamilyArray(), cell.getFamilyOffset(), - newCell.getFamilyArray(), newCell.getFamilyOffset(), - cell.getFamilyLength()); - System.arraycopy(cell.getQualifierArray(), cell.getQualifierOffset(), - newCell.getQualifierArray(), newCell.getQualifierOffset(), - cell.getQualifierLength()); - // copy in the value - CellUtil.copyValueTo(oldCell, newCell.getValueArray(), newCell.getValueOffset()); - System.arraycopy(cell.getValueArray(), cell.getValueOffset(), - newCell.getValueArray(), - newCell.getValueOffset() + oldCell.getValueLength(), - cell.getValueLength()); - // Copy in tag data - System.arraycopy(tagBytes, 0, newCell.getTagsArray(), newCell.getTagsOffset(), - tagBytes.length); - idx++; - } else { - // Append's KeyValue.Type==Put and ts==HConstants.LATEST_TIMESTAMP - CellUtil.updateLatestStamp(cell, now); - - // Cell TTL handling - - if (mutate.getTTL() != Long.MAX_VALUE) { - List newTags = new ArrayList(1); - newTags.add( - new ArrayBackedTag(TagType.TTL_TAG_TYPE, Bytes.toBytes(mutate.getTTL()))); - // Add the new TTL tag - newCell = new TagRewriteCell(cell, TagUtil.fromList(newTags)); - } else { - newCell = cell; - } - } - - // Give coprocessors a chance to update the new cell - if (coprocessorHost != null) { - newCell = coprocessorHost.postMutationBeforeWAL(RegionObserver.MutationType.APPEND, - mutate, oldCell, newCell); - } - kvs.add(newCell); - - // Append update to WAL - if (writeToWAL) { - if (walEdits == null) { - walEdits = new WALEdit(); - } - walEdits.add(newCell); - } - } - - //store the kvs to the temporary memstore before writing WAL - tempMemstore.put(store, kvs); - } - - // Actually write to WAL now - if (walEdits != null && !walEdits.isEmpty()) { - if (writeToWAL) { - // Using default cluster id, as this can only happen in the originating - // cluster. A slave cluster receives the final value (not the delta) - // as a Put. - // we use HLogKey here instead of WALKey directly to support legacy coprocessors. 
- walKey = new HLogKey( - getRegionInfo().getEncodedNameAsBytes(), - this.htableDescriptor.getTableName(), - WALKey.NO_SEQUENCE_ID, - nonceGroup, - nonce, - mvcc); - txid = - this.wal.append(this.htableDescriptor, getRegionInfo(), walKey, walEdits, true); - } else { - recordMutationWithoutWal(mutate.getFamilyCellMap()); - } - } - if (walKey == null) { - // Append a faked WALEdit in order for SKIP_WAL updates to get mvcc assigned - walKey = this.appendEmptyEdit(this.wal); - } - - // now start my own transaction - writeEntry = walKey.getWriteEntry(); - - - // Actually write to Memstore now - if (!tempMemstore.isEmpty()) { - for (Map.Entry> entry : tempMemstore.entrySet()) { - Store store = entry.getKey(); - if (store.getFamily().getMaxVersions() == 1) { - // upsert if VERSIONS for this CF == 1 - // Is this right? It immediately becomes visible? St.Ack 20150907 - size += store.upsert(entry.getValue(), getSmallestReadPoint()); - } else { - // otherwise keep older versions around - for (Cell cell: entry.getValue()) { - CellUtil.setSequenceId(cell, writeEntry.getWriteNumber()); - size += store.add(cell); - doRollBackMemstore = true; - } - } - // We add to all KVs here whereas when doing increment, we do it - // earlier... why? - allKVs.addAll(entry.getValue()); - } - - size = this.addAndGetGlobalMemstoreSize(size); - flush = isFlushSize(size); - } - } finally { - this.updatesLock.readLock().unlock(); + Result cpResult = doCoprocessorPreCall(op, mutation); + if (cpResult != null) return cpResult; + Durability effectiveDurability = getEffectiveDurability(mutation.getDurability()); + Map> forMemStore = + new HashMap>(mutation.getFamilyCellMap().size()); + // Reckon Cells to apply to WAL -- in returned walEdit -- and what to add to memstore and + // what to return back to the client (in 'forMemStore' and 'results' respectively). + WALEdit walEdit = reckonDeltas(op, mutation, effectiveDurability, forMemStore, results); + // Actually write to WAL now if a walEdit to apply. + if (walEdit != null && !walEdit.isEmpty()) { + writeEntry = doAppend(walEdit, durability, nonceGroup, nonce); + } else { + // If walEdits is empty, it means we skipped the WAL; update counters and start an mvcc + // transaction. + recordMutationWithoutWal(mutation.getFamilyCellMap()); + writeEntry = mvcc.begin(); } - + // Now write to MemStore. Do it a column family at a time. + long sequenceId = writeEntry.getWriteNumber(); + for (Map.Entry> e: forMemStore.entrySet()) { + accumulatedResultSize += + applyToMemstore(e.getKey(), e.getValue(), true, false, sequenceId); + } + mvcc.completeAndWait(writeEntry); + // Clear walKey so don't complete for second time in finally below. + writeEntry = null; } finally { - rowLock.release(); - rowLock = null; - } - // sync the transaction log outside the rowlock - if(txid != 0){ - syncOrDefer(txid, durability); + this.updatesLock.readLock().unlock(); } - doRollBackMemstore = false; + // If results is null, then client asked that we not return the calculated results. + return results != null? 
Result.create(results): null; } finally { - if (rowLock != null) { - rowLock.release(); - } - // if the wal sync was unsuccessful, remove keys from memstore - if (doRollBackMemstore) { - rollbackMemstore(allKVs); - if (writeEntry != null) mvcc.complete(writeEntry); - } else if (writeEntry != null) { - mvcc.completeAndWait(writeEntry); - } - + // Call complete rather than completeAndWait because we probably had error if walKey != null + if (writeEntry != null) mvcc.complete(writeEntry); + rowLock.release(); + // Request a cache flush if over the limit. Do it outside update lock. + if (isFlushSize(this.addAndGetGlobalMemstoreSize(accumulatedResultSize))) requestFlush(); closeRegionOperation(op); + if (this.metricsRegion != null) { + switch (op) { + case INCREMENT: + this.metricsRegion.updateIncrement(); + break; + case APPEND: + this.metricsRegion.updateAppend(); + break; + default: + break; + } + } } + } - if (this.metricsRegion != null) { - this.metricsRegion.updateAppend(); - } + private WriteEntry doAppend(WALEdit walEdit, Durability durability, long nonceGroup, long nonce) + throws IOException { + return doAppend(walEdit, durability, WALKey.EMPTY_UUIDS, System.currentTimeMillis(), + nonceGroup, nonce); + } - if (flush) { - // Request a cache flush. Do it outside update lock. - requestFlush(); + /** + * @return writeEntry associated with this append + */ + private WriteEntry doAppend(WALEdit walEdit, Durability durability, List clusterIds, + long now, long nonceGroup, long nonce) + throws IOException { + WriteEntry writeEntry = null; + // Using default cluster id, as this can only happen in the originating cluster. + // A slave cluster receives the final value (not the delta) as a Put. We use HLogKey + // here instead of WALKey directly to support legacy coprocessors. + WALKey walKey = new WALKey(this.getRegionInfo().getEncodedNameAsBytes(), + this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now, clusterIds, + nonceGroup, nonce, mvcc); + try { + long txid = + this.wal.append(this.htableDescriptor, this.getRegionInfo(), walKey, walEdit, true); + // Call sync on our edit. + if (txid != 0) sync(txid, durability); + writeEntry = walKey.getWriteEntry(); + } catch (IOException ioe) { + if (walKey != null) mvcc.complete(walKey.getWriteEntry()); + throw ioe; } + return writeEntry; + } - return mutate.isReturnResults() ? Result.create(allKVs) : null; + /** + * Do coprocessor pre-increment or pre-append call. + * @return Result returned out of the coprocessor, which means bypass all further processing and + * return the proffered Result instead, or null which means proceed. + */ + private Result doCoprocessorPreCall(final Operation op, final Mutation mutation) + throws IOException { + Result result = null; + if (this.coprocessorHost != null) { + switch(op) { + case INCREMENT: + result = this.coprocessorHost.preIncrementAfterRowLock((Increment)mutation); + break; + case APPEND: + result = this.coprocessorHost.preAppendAfterRowLock((Append)mutation); + break; + default: throw new UnsupportedOperationException(op.toString()); + } + } + return result; } - public Result increment(Increment increment) throws IOException { - return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE); + /** + * Reckon the Cells to apply to WAL, memstore, and to return to the Client; these Sets are not + * always the same dependent on whether to write WAL or if the amount to increment is zero (in + * this case we write back nothing, just return latest Cell value to the client). 
+ * + * @param results Fill in here what goes back to the Client if it is non-null (if null, client + * doesn't want results). + * @param forMemStore Fill in here what to apply to the MemStore (by Store). + * @return A WALEdit to apply to WAL or null if we are to skip the WAL. + * @throws IOException + */ + private WALEdit reckonDeltas(final Operation op, final Mutation mutation, + final Durability effectiveDurability, final Map> forMemStore, + final List results) + throws IOException { + WALEdit walEdit = null; + long now = EnvironmentEdgeManager.currentTime(); + final boolean writeToWAL = effectiveDurability != Durability.SKIP_WAL; + // Process a Store/family at a time. + for (Map.Entry> entry: mutation.getFamilyCellMap().entrySet()) { + final byte [] columnFamilyName = entry.getKey(); + List deltas = entry.getValue(); + Store store = this.stores.get(columnFamilyName); + // Reckon for the Store what to apply to WAL and MemStore. + List toApply = + reckonDeltasByStore(store, op, mutation, effectiveDurability, now, deltas, results); + if (!toApply.isEmpty()) { + forMemStore.put(store, toApply); + if (writeToWAL) { + if (walEdit == null) walEdit = new WALEdit(); + walEdit.getCells().addAll(toApply); + } + } + } + return walEdit; } - // TODO: There's a lot of boiler plate code identical to append. - // We should refactor append and increment as local get-mutate-put - // transactions, so all stores only go through one code path for puts. + /** + * Reckon the Cells to apply to WAL, memstore, and to return to the Client in passed + * column family/Store. + * + * Does Get of current value and then adds passed in deltas for this Store returning the result. + * + * @param op Whether Increment or Append + * @param mutation The encompassing Mutation object + * @param deltas Changes to apply to this Store; either increment amount or data to append + * @param results In here we accumulate all the Cells we are to return to the client; this List + * can be larger than what we return in case where delta is zero; i.e. don't write + * out new values, just return current value. If null, client doesn't want results returned. + * @return Resulting Cells after deltas have been applied to current + * values. Side effect is our filling out of the results List. + * @throws IOException + */ + private List reckonDeltasByStore(final Store store, final Operation op, + final Mutation mutation, final Durability effectiveDurability, final long now, + final List deltas, final List results) + throws IOException { + byte [] columnFamily = store.getFamily().getName(); + List toApply = new ArrayList(deltas.size()); + // Get previous values for all columns in this family. + List currentValues = get(mutation, store, deltas, null/*DEFAULT*/, + op == Operation.INCREMENT? ((Increment)mutation).getTimeRange(): null); + // Iterate the input columns and update existing values if they were found, otherwise + // add new column initialized to the delta amount + int currentValuesIndex = 0; + for (int i = 0; i < deltas.size(); i++) { + Cell delta = deltas.get(i); + Cell currentValue = null; + if (currentValuesIndex < currentValues.size() && + CellUtil.matchingQualifier(currentValues.get(currentValuesIndex), delta)) { + currentValue = currentValues.get(currentValuesIndex); + if (i < (deltas.size() - 1) && !CellUtil.matchingQualifier(delta, deltas.get(i + 1))) { + currentValuesIndex++; + } + } + // Switch on whether this an increment or an append building the new Cell to apply. 
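One consequence of the apply flag reckoned just below: an increment whose delta is zero is effectively a read, the current value still comes back in the Result but nothing is written to the WAL or the memstore. From the client that looks like this (a sketch, made-up names):

    import org.apache.hadoop.hbase.TableName;
    import org.apache.hadoop.hbase.client.Connection;
    import org.apache.hadoop.hbase.client.ConnectionFactory;
    import org.apache.hadoop.hbase.client.Increment;
    import org.apache.hadoop.hbase.client.Result;
    import org.apache.hadoop.hbase.client.Table;
    import org.apache.hadoop.hbase.util.Bytes;

    public class IncrementSketch {
      public static void main(String[] args) throws Exception {
        byte[] row = Bytes.toBytes("r1"), cf = Bytes.toBytes("f"), q = Bytes.toBytes("hits");
        try (Connection conn = ConnectionFactory.createConnection();
             Table table = conn.getTable(TableName.valueOf("counters"))) {
          // Delta of 1: server reads the current value, adds the delta, writes the sum back.
          long afterBump = table.incrementColumnValue(row, cf, q, 1L);
          // Delta of 0: the latest value is returned, but no new cell is persisted.
          Increment peek = new Increment(row);
          peek.addColumn(cf, q, 0L);
          Result current = table.increment(peek);
          System.out.println(afterBump + " == " + Bytes.toLong(current.getValue(cf, q)));
        }
      }
    }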
+ Cell newCell = null; + MutationType mutationType = null; + boolean apply = true; + switch (op) { + case INCREMENT: + mutationType = MutationType.INCREMENT; + // If delta amount to apply is 0, don't write WAL or MemStore. + long deltaAmount = getLongValue(delta); + apply = deltaAmount != 0; + newCell = reckonIncrement(delta, deltaAmount, currentValue, columnFamily, now, + (Increment)mutation); + break; + case APPEND: + mutationType = MutationType.APPEND; + // Always apply Append. TODO: Does empty delta value mean reset Cell? It seems to. + newCell = reckonAppend(delta, currentValue, now, (Append)mutation); + break; + default: throw new UnsupportedOperationException(op.toString()); + } - // They are subtley different in quiet a few ways. This came out only - // after study. I am not sure that many of the differences are intentional. - // TODO: St.Ack 20150907 + // Give coprocessors a chance to update the new cell + if (coprocessorHost != null) { + newCell = + coprocessorHost.postMutationBeforeWAL(mutationType, mutation, currentValue, newCell); + } + // If apply, we need to update memstore/WAL with new value; add it toApply. + if (apply) toApply.add(newCell); + // Add to results to get returned to the Client. If null, cilent does not want results. + if (results != null) { + results.add(newCell); + } + } + return toApply; + } - @Override - public Result increment(Increment mutation, long nonceGroup, long nonce) + /** + * Calculate new Increment Cell. + * @return New Increment Cell with delta applied to currentValue if currentValue is not null; + * otherwise, a new Cell with the delta set as its value. + */ + private Cell reckonIncrement(final Cell delta, final long deltaAmount, final Cell currentValue, + byte [] columnFamily, final long now, Mutation mutation) throws IOException { - Operation op = Operation.INCREMENT; + // Forward any tags found on the delta. + List tags = TagUtil.carryForwardTags(delta); + long newValue = deltaAmount; + long ts = now; + if (currentValue != null) { + tags = TagUtil.carryForwardTags(tags, currentValue); + ts = Math.max(now, currentValue.getTimestamp()); + newValue += getLongValue(currentValue); + } + // Now make up the new Cell. TODO: FIX. This is carnel knowledge of how KeyValues are made... + // doesn't work well with offheaping or if we are doing a different Cell type. 
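Also worth keeping in mind while reading reckonIncrement: both the delta and the current value pass through getLongValue (defined further down), so increments only apply to 8-byte values; a column holding anything else fails with DoNotRetryIOException instead of being retried. A client-side sketch of that failure mode (made-up names):

    import java.io.IOException;

    import org.apache.hadoop.hbase.TableName;
    import org.apache.hadoop.hbase.client.Connection;
    import org.apache.hadoop.hbase.client.ConnectionFactory;
    import org.apache.hadoop.hbase.client.Put;
    import org.apache.hadoop.hbase.client.Table;
    import org.apache.hadoop.hbase.util.Bytes;

    public class IncrementWidthSketch {
      public static void main(String[] args) throws Exception {
        byte[] row = Bytes.toBytes("r1"), cf = Bytes.toBytes("f"), q = Bytes.toBytes("q");
        try (Connection conn = ConnectionFactory.createConnection();
             Table table = conn.getTable(TableName.valueOf("t"))) {
          // An 8-byte, long-encoded value increments fine.
          table.put(new Put(row).addColumn(cf, q, Bytes.toBytes(41L)));
          table.incrementColumnValue(row, cf, q, 1L);
          // Any other width is rejected server side ("Field is not a long, it's N bytes wide").
          table.put(new Put(row).addColumn(cf, q, Bytes.toBytes("not a long")));
          try {
            table.incrementColumnValue(row, cf, q, 1L);
          } catch (IOException expected) {
            System.out.println("increment rejected: " + expected.getMessage());
          }
        }
      }
    }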
+ byte [] incrementAmountInBytes = Bytes.toBytes(newValue); + tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL()); byte [] row = mutation.getRow(); - checkRow(row, op.toString()); - checkFamilies(mutation.getFamilyCellMap().keySet()); - boolean flush = false; - Durability durability = getEffectiveDurability(mutation.getDurability()); - boolean writeToWAL = durability != Durability.SKIP_WAL; - WALEdit walEdits = null; - List allKVs = new ArrayList(mutation.size()); - - Map> tempMemstore = new HashMap>(); - long size = 0; - long txid = 0; - checkReadOnly(); - checkResources(); - // Lock row - startRegionOperation(op); - this.writeRequestsCount.increment(); - RowLock rowLock = null; - WALKey walKey = null; - MultiVersionConcurrencyControl.WriteEntry writeEntry = null; - boolean doRollBackMemstore = false; - TimeRange tr = mutation.getTimeRange(); - try { - rowLock = getRowLock(row); - assert rowLock != null; - try { - lock(this.updatesLock.readLock()); - try { - // wait for all prior MVCC transactions to finish - while we hold the row lock - // (so that we are guaranteed to see the latest state) - mvcc.await(); - if (this.coprocessorHost != null) { - Result r = this.coprocessorHost.preIncrementAfterRowLock(mutation); - if (r != null) { - return r; - } - } - long now = EnvironmentEdgeManager.currentTime(); - // Process each family - for (Map.Entry> family: mutation.getFamilyCellMap().entrySet()) { - Store store = stores.get(family.getKey()); - List kvs = new ArrayList(family.getValue().size()); - - List results = doGet(store, row, family, tr); - - // Iterate the input columns and update existing values if they were - // found, otherwise add new column initialized to the increment amount - - // Avoid as much copying as possible. We may need to rewrite and - // consolidate tags. Bytes are only copied once. 
- // Would be nice if KeyValue had scatter/gather logic - int idx = 0; - // HERE WE DIVERGE FROM APPEND - List edits = family.getValue(); - for (int i = 0; i < edits.size(); i++) { - Cell cell = edits.get(i); - long amount = Bytes.toLong(CellUtil.cloneValue(cell)); - boolean noWriteBack = (amount == 0); - - List newTags = carryForwardTags(cell, new ArrayList()); - - Cell c = null; - long ts = now; - if (idx < results.size() && CellUtil.matchingQualifier(results.get(idx), cell)) { - c = results.get(idx); - ts = Math.max(now, c.getTimestamp()); - if(c.getValueLength() == Bytes.SIZEOF_LONG) { - amount += CellUtil.getValueAsLong(c); - } else { - // throw DoNotRetryIOException instead of IllegalArgumentException - throw new org.apache.hadoop.hbase.DoNotRetryIOException( - "Attempted to increment field that isn't 64 bits wide"); - } - // Carry tags forward from previous version - newTags = carryForwardTags(c, newTags); - if (i < (edits.size() - 1) && !CellUtil.matchingQualifier(cell, edits.get(i + 1))) { - idx++; - } - } - - // Append new incremented KeyValue to list - byte[] q = CellUtil.cloneQualifier(cell); - byte[] val = Bytes.toBytes(amount); - - // Add the TTL tag if the mutation carried one - if (mutation.getTTL() != Long.MAX_VALUE) { - newTags.add( - new ArrayBackedTag(TagType.TTL_TAG_TYPE, Bytes.toBytes(mutation.getTTL()))); - } - - Cell newKV = new KeyValue(row, 0, row.length, - family.getKey(), 0, family.getKey().length, - q, 0, q.length, - ts, - KeyValue.Type.Put, - val, 0, val.length, - newTags); - - // Give coprocessors a chance to update the new cell - if (coprocessorHost != null) { - newKV = coprocessorHost.postMutationBeforeWAL( - RegionObserver.MutationType.INCREMENT, mutation, c, newKV); - } - allKVs.add(newKV); - - if (!noWriteBack) { - kvs.add(newKV); - - // Prepare WAL updates - if (writeToWAL) { - if (walEdits == null) { - walEdits = new WALEdit(); - } - walEdits.add(newKV); - } - } - } - - //store the kvs to the temporary memstore before writing WAL - if (!kvs.isEmpty()) { - tempMemstore.put(store, kvs); - } - } - - // Actually write to WAL now - if (walEdits != null && !walEdits.isEmpty()) { - if (writeToWAL) { - // Using default cluster id, as this can only happen in the originating - // cluster. A slave cluster receives the final value (not the delta) - // as a Put. - // we use HLogKey here instead of WALKey directly to support legacy coprocessors. - walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(), - this.htableDescriptor.getTableName(), - WALKey.NO_SEQUENCE_ID, - nonceGroup, - nonce, - mvcc); - txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(), - walKey, walEdits, true); - } else { - recordMutationWithoutWal(mutation.getFamilyCellMap()); - } - } - if (walKey == null) { - // Append a faked WALEdit in order for SKIP_WAL updates to get mvccNum assigned - walKey = this.appendEmptyEdit(this.wal); - } - - // now start my own transaction - writeEntry = walKey.getWriteEntry(); + return new KeyValue(row, 0, row.length, + columnFamily, 0, columnFamily.length, + delta.getQualifierArray(), delta.getQualifierOffset(), delta.getQualifierLength(), + ts, KeyValue.Type.Put, + incrementAmountInBytes, 0, incrementAmountInBytes.length, + tags); + } - // Actually write to Memstore now - if (!tempMemstore.isEmpty()) { - for (Map.Entry> entry : tempMemstore.entrySet()) { - Store store = entry.getKey(); - if (store.getFamily().getMaxVersions() == 1) { - // upsert if VERSIONS for this CF == 1 - // Is this right? It immediately becomes visible? 
St.Ack 20150907 - size += store.upsert(entry.getValue(), getSmallestReadPoint()); - } else { - // otherwise keep older versions around - for (Cell cell : entry.getValue()) { - CellUtil.setSequenceId(cell, writeEntry.getWriteNumber()); - size += store.add(cell); - doRollBackMemstore = true; - } - } - } - size = this.addAndGetGlobalMemstoreSize(size); - flush = isFlushSize(size); - } - } finally { - this.updatesLock.readLock().unlock(); - } - } finally { - rowLock.release(); - rowLock = null; - } - // sync the transaction log outside the rowlock - if(txid != 0){ - syncOrDefer(txid, durability); - } - doRollBackMemstore = false; - } finally { - if (rowLock != null) { - rowLock.release(); - } - // if the wal sync was unsuccessful, remove keys from memstore - if (doRollBackMemstore) { - for(List cells: tempMemstore.values()) { - rollbackMemstore(cells); - } - if (writeEntry != null) mvcc.complete(writeEntry); - } else if (writeEntry != null) { - mvcc.completeAndWait(writeEntry); + private Cell reckonAppend(final Cell delta, final Cell currentValue, final long now, + Append mutation) + throws IOException { + // Forward any tags found on the delta. + List tags = TagUtil.carryForwardTags(delta); + long ts = now; + Cell newCell = null; + byte [] row = mutation.getRow(); + if (currentValue != null) { + tags = TagUtil.carryForwardTags(tags, currentValue); + ts = Math.max(now, currentValue.getTimestamp()); + tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL()); + byte[] tagBytes = TagUtil.fromList(tags); + // Allocate an empty cell and copy in all parts. + // TODO: This is intimate knowledge of how a KeyValue is made. Undo!!! Prevents our doing + // other Cell types. Copying on-heap too if an off-heap Cell. + newCell = new KeyValue(row.length, delta.getFamilyLength(), + delta.getQualifierLength(), ts, KeyValue.Type.Put, + delta.getValueLength() + currentValue.getValueLength(), + tagBytes == null? 0: tagBytes.length); + // Copy in row, family, and qualifier + System.arraycopy(row, 0, newCell.getRowArray(), newCell.getRowOffset(), row.length); + System.arraycopy(delta.getFamilyArray(), delta.getFamilyOffset(), + newCell.getFamilyArray(), newCell.getFamilyOffset(), delta.getFamilyLength()); + System.arraycopy(delta.getQualifierArray(), delta.getQualifierOffset(), + newCell.getQualifierArray(), newCell.getQualifierOffset(), delta.getQualifierLength()); + // Copy in the value + CellUtil.copyValueTo(currentValue, newCell.getValueArray(), newCell.getValueOffset()); + System.arraycopy(delta.getValueArray(), delta.getValueOffset(), + newCell.getValueArray(), newCell.getValueOffset() + currentValue.getValueLength(), + delta.getValueLength()); + // Copy in tag data + if (tagBytes != null) { + System.arraycopy(tagBytes, 0, + newCell.getTagsArray(), newCell.getTagsOffset(), tagBytes.length); } - closeRegionOperation(Operation.INCREMENT); - if (this.metricsRegion != null) { - this.metricsRegion.updateIncrement(); + } else { + // Append's KeyValue.Type==Put and ts==HConstants.LATEST_TIMESTAMP + CellUtil.updateLatestStamp(delta, now); + newCell = delta; + tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL()); + if (tags != null) { + newCell = new TagRewriteCell(delta, TagUtil.fromList(tags)); } } + return newCell; + } - if (flush) { - // Request a cache flush. Do it outside update lock. 
- requestFlush(); + /** + * @return Get the long out of the passed in Cell + * @throws DoNotRetryIOException + */ + private static long getLongValue(final Cell cell) throws DoNotRetryIOException { + int len = cell.getValueLength(); + if (len != Bytes.SIZEOF_LONG) { + // throw DoNotRetryIOException instead of IllegalArgumentException + throw new DoNotRetryIOException("Field is not a long, it's " + len + " bytes wide"); } - return mutation.isReturnResults() ? Result.create(allKVs) : null; + return Bytes.toLong(cell.getValueArray(), cell.getValueOffset(), len); + } + + /** + * Do a specific Get on passed columnFamily and column qualifiers. + * @param mutation Mutation we are doing this Get for. + * @param columnFamily Which column family on row (TODO: Go all Gets in one go) + * @param coordinates Cells from mutation used as coordinates applied to Get. + * @return Return list of Cells found. + */ + private List get(final Mutation mutation, final Store store, + final List coordinates, final IsolationLevel isolation, final TimeRange tr) + throws IOException { + // Sort the cells so that they match the order that they appear in the Get results. Otherwise, + // we won't be able to find the existing values if the cells are not specified in order by the + // client since cells are in an array list. + // TODO: I don't get why we are sorting. St.Ack 20150107 + sort(coordinates, store.getComparator()); + Get get = new Get(mutation.getRow()); + if (isolation != null) { + get.setIsolationLevel(isolation); + } + for (Cell cell: coordinates) { + get.addColumn(store.getFamily().getName(), CellUtil.cloneQualifier(cell)); + } + // Increments carry time range. If an Increment instance, put it on the Get. + if (tr != null) get.setTimeRange(tr.getMin(), tr.getMax()); + return get(get, false); + } + + /** + * @return Sorted list of cells using comparator + */ + private static List sort(List cells, final Comparator comparator) { + Collections.sort(cells, comparator); + return cells; } // @@ -7582,7 +7288,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi public static final long FIXED_OVERHEAD = ClassSize.align( ClassSize.OBJECT + ClassSize.ARRAY + - 44 * ClassSize.REFERENCE + 3 * Bytes.SIZEOF_INT + + 44 * ClassSize.REFERENCE + 2 * Bytes.SIZEOF_INT + (14 * Bytes.SIZEOF_LONG) + 5 * Bytes.SIZEOF_BOOLEAN); @@ -7619,20 +7325,6 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi return heapSize; } - /* - * This method calls System.exit. - * @param message Message to print out. May be null. - */ - private static void printUsageAndExit(final String message) { - if (message != null && message.length() > 0) System.out.println(message); - System.out.println("Usage: HRegion CATALOG_TABLE_DIR [major_compact]"); - System.out.println("Options:"); - System.out.println(" major_compact Pass this option to major compact " + - "passed region."); - System.out.println("Default outputs scan of passed region."); - System.exit(1); - } - @Override public boolean registerService(Service instance) { /* @@ -7706,53 +7398,6 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi return responseBuilder.build(); } - /* - * Process table. - * Do major compaction or list content. 
- * @throws IOException - */ - private static void processTable(final FileSystem fs, final Path p, - final WALFactory walFactory, final Configuration c, - final boolean majorCompact) - throws IOException { - HRegion region; - FSTableDescriptors fst = new FSTableDescriptors(c); - // Currently expects tables have one region only. - if (FSUtils.getTableName(p).equals(TableName.META_TABLE_NAME)) { - final WAL wal = walFactory.getMetaWAL( - HRegionInfo.FIRST_META_REGIONINFO.getEncodedNameAsBytes()); - region = HRegion.newHRegion(p, wal, fs, c, - HRegionInfo.FIRST_META_REGIONINFO, - fst.get(TableName.META_TABLE_NAME), null); - } else { - throw new IOException("Not a known catalog table: " + p.toString()); - } - try { - region.mvcc.advanceTo(region.initialize(null)); - if (majorCompact) { - region.compact(true); - } else { - // Default behavior - Scan scan = new Scan(); - // scan.addFamily(HConstants.CATALOG_FAMILY); - RegionScanner scanner = region.getScanner(scan); - try { - List kvs = new ArrayList(); - boolean done; - do { - kvs.clear(); - done = scanner.next(kvs); - if (kvs.size() > 0) LOG.info(kvs); - } while (done); - } finally { - scanner.close(); - } - } - } finally { - region.close(); - } - } - boolean shouldForceSplit() { return this.splitRequest; } @@ -8005,12 +7650,11 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } /** - * Calls sync with the given transaction ID if the region's table is not - * deferring it. + * Calls sync with the given transaction ID * @param txid should sync up to which transaction * @throws IOException If anything goes wrong with DFS */ - private void syncOrDefer(long txid, Durability durability) throws IOException { + private void sync(long txid, Durability durability) throws IOException { if (this.getRegionInfo().isMetaRegion()) { this.wal.sync(txid); } else { @@ -8071,45 +7715,6 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi } }; - /** - * Facility for dumping and compacting catalog tables. - * Only does catalog tables since these are only tables we for sure know - * schema on. For usage run: - *
-   * <pre>
-   *   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion
-   * </pre>
- * @throws IOException - */ - public static void main(String[] args) throws IOException { - if (args.length < 1) { - printUsageAndExit(null); - } - boolean majorCompact = false; - if (args.length > 1) { - if (!args[1].toLowerCase().startsWith("major")) { - printUsageAndExit("ERROR: Unrecognized option <" + args[1] + ">"); - } - majorCompact = true; - } - final Path tableDir = new Path(args[0]); - final Configuration c = HBaseConfiguration.create(); - final FileSystem fs = FileSystem.get(c); - final Path logdir = new Path(c.get("hbase.tmp.dir")); - final String logname = "wal" + FSUtils.getTableName(tableDir) + System.currentTimeMillis(); - - final Configuration walConf = new Configuration(c); - FSUtils.setRootDir(walConf, logdir); - final WALFactory wals = new WALFactory(walConf, null, logname); - try { - processTable(fs, tableDir, wals, c, majorCompact); - } finally { - wals.close(); - // TODO: is this still right? - BlockCache bc = new CacheConfig(c).getBlockCache(); - if (bc != null) bc.shutdown(); - } - } - @Override public long getOpenSeqNum() { return this.openSeqNum; @@ -8147,39 +7752,9 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi assert newValue >= 0; } - /** - * Do not change this sequence id. - * @return sequenceId - */ @VisibleForTesting - public long getSequenceId() { - return this.mvcc.getReadPoint(); - } - - - /** - * Append a faked WALEdit in order to get a long sequence number and wal syncer will just ignore - * the WALEdit append later. - * @param wal - * @return Return the key used appending with no sync and no append. - * @throws IOException - */ - private WALKey appendEmptyEdit(final WAL wal) throws IOException { - // we use HLogKey here instead of WALKey directly to support legacy coprocessors. - @SuppressWarnings("deprecation") - WALKey key = new HLogKey(getRegionInfo().getEncodedNameAsBytes(), - getRegionInfo().getTable(), WALKey.NO_SEQUENCE_ID, 0, null, - HConstants.NO_NONCE, HConstants.NO_NONCE, getMVCC()); - - // Call append but with an empty WALEdit. The returned sequence id will not be associated - // with any edit and we can be sure it went in after all outstanding appends. - try { - wal.append(getTableDesc(), getRegionInfo(), key, WALEdit.EMPTY_WALEDIT, false); - } catch (Throwable t) { - // If exception, our mvcc won't get cleaned up by client, so do it here. - getMVCC().complete(key.getWriteEntry()); - } - return key; + public long getReadPoint() { + return getReadPoint(IsolationLevel.READ_COMMITTED); } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java index 9ebdaee..813a00e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java @@ -1302,8 +1302,11 @@ public class HStore implements Store { HRegionInfo info = this.region.getRegionInfo(); CompactionDescriptor compactionDescriptor = ProtobufUtil.toCompactionDescriptor(info, family.getName(), inputPaths, outputPaths, fs.getStoreDir(getFamily().getNameAsString())); - WALUtil.writeCompactionMarker(region.getWAL(), this.region.getTableDesc(), - this.region.getRegionInfo(), compactionDescriptor, region.getMVCC()); + // Fix reaching into Region to get the maxWaitForSeqId. + // Does this method belong in Region altogether given it is making so many references up there? 
+ // Could be Region#writeCompactionMarker(compactionDescriptor); + WALUtil.writeCompactionMarker(this.region.getWAL(), this.region.getTableDesc(), + this.region.getRegionInfo(), compactionDescriptor, this.region.getMVCC()); } @VisibleForTesting diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueScanner.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueScanner.java index a9322e3..eae713f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueScanner.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueScanner.java @@ -22,6 +22,7 @@ import java.io.IOException; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.Cell; +import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.client.Scan; /** @@ -30,6 +31,12 @@ import org.apache.hadoop.hbase.client.Scan; @InterfaceAudience.Private public interface KeyValueScanner extends Shipper { /** + * The byte array represents for NO_NEXT_INDEXED_KEY; + * The actual value is irrelevant because this is always compared by reference. + */ + public static final Cell NO_NEXT_INDEXED_KEY = new KeyValue(); + + /** * Look at the next Cell in this scanner, but do not iterate scanner. * @return the next Cell */ diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MultiVersionConcurrencyControl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MultiVersionConcurrencyControl.java index eba99e0..b5651be 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MultiVersionConcurrencyControl.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MultiVersionConcurrencyControl.java @@ -166,7 +166,6 @@ public class MultiVersionConcurrencyControl { public boolean complete(WriteEntry writeEntry) { synchronized (writeQueue) { writeEntry.markCompleted(); - long nextReadValue = NONE; boolean ranOnce = false; while (!writeQueue.isEmpty()) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/Region.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/Region.java index 5da8bcb..c0bc8fe 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/Region.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/Region.java @@ -157,6 +157,13 @@ public interface Region extends ConfigurationObserver { boolean isLoadingCfsOnDemandDefault(); /** @return readpoint considering given IsolationLevel */ + long getReadPoint(IsolationLevel isolationLevel); + + /** + * @return readpoint considering given IsolationLevel + * @deprecated Since 1.2.0. Use {@link #getReadPoint(IsolationLevel)} instead. + */ + @Deprecated long getReadpoint(IsolationLevel isolationLevel); /** @@ -217,8 +224,8 @@ public interface Region extends ConfigurationObserver { // Region read locks /** - * Operation enum is used in {@link Region#startRegionOperation} to provide context for - * various checks before any region operation begins. + * Operation enum is used in {@link Region#startRegionOperation} and elsewhere to provide + * context for various checks. 
*/ enum Operation { ANY, GET, PUT, DELETE, SCAN, APPEND, INCREMENT, SPLIT_REGION, MERGE_REGION, BATCH_MUTATE, @@ -323,9 +330,10 @@ public interface Region extends ConfigurationObserver { OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId) throws IOException; /** - * Atomically checks if a row/family/qualifier value matches the expected val - * If it does, it performs the row mutations. If the passed value is null, t - * is for the lack of column (ie: non-existence) + * Atomically checks if a row/family/qualifier value matches the expected value and if it does, + * it performs the mutation. If the passed value is null, the lack of column value + * (ie: non-existence) is used. See checkAndRowMutate to do many checkAndPuts at a time on a + * single row. * @param row to check * @param family column family to check * @param qualifier column qualifier to check @@ -340,9 +348,10 @@ public interface Region extends ConfigurationObserver { ByteArrayComparable comparator, Mutation mutation, boolean writeToWAL) throws IOException; /** - * Atomically checks if a row/family/qualifier value matches the expected val - * If it does, it performs the row mutations. If the passed value is null, t - * is for the lack of column (ie: non-existence) + * Atomically checks if a row/family/qualifier value matches the expected values and if it does, + * it performs the row mutations. If the passed value is null, the lack of column value + * (ie: non-existence) is used. Use to do many mutations on a single row. Use checkAndMutate + * to do one checkAndMutate at a time. * @param row to check * @param family column family to check * @param qualifier column qualifier to check @@ -350,7 +359,7 @@ public interface Region extends ConfigurationObserver { * @param comparator * @param mutations * @param writeToWAL - * @return true if mutation was applied, false otherwise + * @return true if mutations were applied, false otherwise * @throws IOException */ boolean checkAndRowMutate(byte [] row, byte [] family, byte [] qualifier, CompareOp compareOp, diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RowProcessor.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RowProcessor.java index cfe42e4..34901b7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RowProcessor.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RowProcessor.java @@ -35,7 +35,7 @@ import com.google.protobuf.Message; * Defines the procedure to atomically perform multiple scans and mutations * on a HRegion. * - * This is invoked by HRegion#processRowsWithLocks(). + * This is invoked by {@link Region#processRowsWithLocks(RowProcessor)}. * This class performs scans and generates mutations and WAL edits. * The locks and MVCC will be handled by HRegion. * @@ -98,10 +98,8 @@ public interface RowProcessor { /** * The hook to be executed after the process() but before applying the Mutations to region. Also - * by the time this hook is been called, mvcc transaction is started. - * @param region + * by the time this hook is called, mvcc transaction have started. * @param walEdit the output WAL edits to apply to write ahead log - * @throws IOException */ void preBatchMutate(HRegion region, WALEdit walEdit) throws IOException; @@ -109,8 +107,6 @@ public interface RowProcessor { * The hook to be executed after the process() and applying the Mutations to region. 
The * difference of this one with {@link #postProcess(HRegion, WALEdit, boolean)} is this hook will * be executed before the mvcc transaction completion. - * @param region - * @throws IOException */ void postBatchMutate(HRegion region) throws IOException; @@ -156,4 +152,4 @@ public interface RowProcessor { * @return The {@link Durability} to use */ Durability useDurability(); -} +} \ No newline at end of file diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java index 9ab68e4..fbf9743 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java @@ -260,7 +260,7 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner List scanners, ScanType scanType, long smallestReadPoint, long earliestPutTs, byte[] dropDeletesFromRow, byte[] dropDeletesToRow) throws IOException { this(store, scan, scanInfo, null, - ((HStore)store).getHRegion().getReadpoint(IsolationLevel.READ_COMMITTED), false); + ((HStore)store).getHRegion().getReadPoint(IsolationLevel.READ_COMMITTED), false); if (dropDeletesFromRow == null) { matcher = new ScanQueryMatcher(scan, scanInfo, null, scanType, smallestReadPoint, earliestPutTs, oldestUnexpiredTS, now, store.getCoprocessorHost()); @@ -659,7 +659,7 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner case SEEK_NEXT_COL: { Cell nextIndexedKey = getNextIndexedKey(); - if (nextIndexedKey != null && nextIndexedKey != HConstants.NO_NEXT_INDEXED_KEY + if (nextIndexedKey != null && nextIndexedKey != KeyValueScanner.NO_NEXT_INDEXED_KEY && matcher.compareKeyForNextColumn(nextIndexedKey, cell) >= 0) { return qcode == MatchCode.SEEK_NEXT_COL ? MatchCode.SKIP : MatchCode.INCLUDE; } @@ -669,7 +669,7 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner case SEEK_NEXT_ROW: { Cell nextIndexedKey = getNextIndexedKey(); - if (nextIndexedKey != null && nextIndexedKey != HConstants.NO_NEXT_INDEXED_KEY + if (nextIndexedKey != null && nextIndexedKey != KeyValueScanner.NO_NEXT_INDEXED_KEY && matcher.compareKeyForNextRow(nextIndexedKey, cell) >= 0) { return qcode == MatchCode.SEEK_NEXT_ROW ? MatchCode.SKIP : MatchCode.INCLUDE; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java index 9ae72e6..feb7cd2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java @@ -1,4 +1,5 @@ /** + * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -281,8 +282,6 @@ public class FSHLog implements WAL { private final int slowSyncNs; - private final static Object [] NO_ARGS = new Object []{}; - // If live datanode count is lower than the default replicas value, // RollWriter will be triggered in each sync(So the RollWriter will be // triggered one by one in a short time). Using it as a workaround to slow @@ -1069,6 +1068,19 @@ public class FSHLog implements WAL { } } + /** + * NOTE: This append, at a time that is usually after this call returns, starts an + * mvcc transaction by calling 'begin' wherein which we assign this update a sequenceid. 
At + * assignment time, we stamp all the passed in Cells inside WALEdit with their sequenceId. + * You must 'complete' the transaction this mvcc transaction by calling + * MultiVersionConcurrencyControl#complete(...) or a variant otherwise mvcc will get stuck. Do it + * in the finally of a try/finally + * block within which this append lives and any subsequent operations like sync or + * update of memstore, etc. Get the WriteEntry to pass mvcc out of the passed in WALKey + * walKey parameter. Be warned that the WriteEntry is not immediately available + * on return from this method. It WILL be available subsequent to a sync of this append; + * otherwise, you will just have to wait on the WriteEntry to get filled in. + */ @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH_EXCEPTION", justification="Will never be null") @Override @@ -1863,13 +1875,8 @@ public class FSHLog implements WAL { } // Coprocessor hook. - if (!coprocessorHost.preWALWrite(entry.getHRegionInfo(), entry.getKey(), - entry.getEdit())) { - if (entry.getEdit().isReplay()) { - // Set replication scope null so that this won't be replicated - entry.getKey().setScopes(null); - } - } + coprocessorHost.preWALWrite(entry.getHRegionInfo(), entry.getKey(), + entry.getEdit()); if (!listeners.isEmpty()) { for (WALActionsListener i: listeners) { // TODO: Why does listener take a table description and CPs take a regioninfo? Fix. diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSWALEntry.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSWALEntry.java index 7f3eb61..695e2b2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSWALEntry.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSWALEntry.java @@ -106,6 +106,8 @@ class FSWALEntry extends Entry { /** * Here is where a WAL edit gets its sequenceid. + * SIDE-EFFECT is our stamping the sequenceid into every Cell AND setting the sequenceid into the + * MVCC WriteEntry!!!! * @return The sequenceid we stamped on this edit. * @throws IOException */ @@ -119,16 +121,13 @@ class FSWALEntry extends Entry { regionSequenceId = we.getWriteNumber(); } - if (!this.getEdit().isReplay() && inMemstore) { - for (Cell c:getEdit().getCells()) { + if (inMemstore) { + for (Cell c: getEdit().getCells()) { CellUtil.setSequenceId(c, regionSequenceId); } } - // This has to stay in this order - WALKey key = getKey(); - key.setLogSeqNum(regionSequenceId); - key.setWriteEntry(we); + getKey().setWriteEntry(we); return regionSequenceId; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLogKey.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLogKey.java index 3e548ad..7c40323 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLogKey.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLogKey.java @@ -49,7 +49,7 @@ import com.google.common.annotations.VisibleForTesting; * *
Some Transactional edits (START, COMMIT, ABORT) will not have an * associated row. - * @deprecated use WALKey. as of 2.0. Remove in 3.0 + * @deprecated use WALKey. Deprecated as of 1.0 (HBASE-12522). Remove in 2.0 */ @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.REPLICATION) @Deprecated @@ -166,7 +166,7 @@ public class HLogKey extends WALKey implements Writable { this.tablename.getName().length, out, compressionContext.tableDict); } - out.writeLong(this.logSeqNum); + out.writeLong(getSequenceId()); out.writeLong(this.writeTime); // Don't need to write the clusters information as we are using protobufs from 0.95 // Writing only the first clusterId for testing the legacy read @@ -213,7 +213,7 @@ public class HLogKey extends WALKey implements Writable { tablenameBytes = Compressor.readCompressed(in, compressionContext.tableDict); } - this.logSeqNum = in.readLong(); + setSequenceId(in.readLong()); this.writeTime = in.readLong(); this.clusterIds.clear(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/ReplayHLogKey.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/ReplayHLogKey.java index f7ae208..d5a1561 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/ReplayHLogKey.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/ReplayHLogKey.java @@ -18,7 +18,6 @@ package org.apache.hadoop.hbase.regionserver.wal; -import java.io.IOException; import java.util.List; import java.util.UUID; @@ -49,7 +48,7 @@ public class ReplayHLogKey extends HLogKey { * @return long the new assigned sequence number */ @Override - public long getSequenceId() throws IOException { + public long getSequenceId() { return this.getOrigLogSeqNum(); } -} +} \ No newline at end of file diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/WALUtil.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/WALUtil.java index c89a466..f268422 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/WALUtil.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/WALUtil.java @@ -37,9 +37,9 @@ import org.apache.hadoop.hbase.wal.WALKey; import com.google.protobuf.TextFormat; /** - * Helper methods to ease Region Server integration with the write ahead log. + * Helper methods to ease Region Server integration with the Write Ahead Log (WAL). * Note that methods in this class specifically should not require access to anything - * other than the API found in {@link WAL}. + * other than the API found in {@link WAL}. For internal use only. */ @InterfaceAudience.Private public class WALUtil { @@ -51,86 +51,108 @@ public class WALUtil { /** * Write the marker that a compaction has succeeded and is about to be committed. - * This provides info to the HMaster to allow it to recover the compaction if - * this regionserver dies in the middle (This part is not yet implemented). It also prevents - * the compaction from finishing if this regionserver has already lost its lease on the log. + * This provides info to the HMaster to allow it to recover the compaction if this regionserver + * dies in the middle. It also prevents the compaction from finishing if this regionserver has + * already lost its lease on the log. + * + *
This write is for internal use only. Not for external client consumption. * @param mvcc Used by WAL to get sequence Id for the waledit. */ - public static long writeCompactionMarker(WAL wal, HTableDescriptor htd, HRegionInfo hri, + public static WALKey writeCompactionMarker(WAL wal, HTableDescriptor htd, HRegionInfo hri, final CompactionDescriptor c, MultiVersionConcurrencyControl mvcc) throws IOException { - long trx = writeMarker(wal, htd, hri, WALEdit.createCompaction(hri, c), mvcc, true); + WALKey walKey = writeMarker(wal, htd, hri, WALEdit.createCompaction(hri, c), mvcc); if (LOG.isTraceEnabled()) { LOG.trace("Appended compaction marker " + TextFormat.shortDebugString(c)); } - return trx; + return walKey; } /** * Write a flush marker indicating a start / abort or a complete of a region flush + * + *
This write is for internal use only. Not for external client consumption. */ - public static long writeFlushMarker(WAL wal, HTableDescriptor htd, HRegionInfo hri, + public static WALKey writeFlushMarker(WAL wal, HTableDescriptor htd, HRegionInfo hri, final FlushDescriptor f, boolean sync, MultiVersionConcurrencyControl mvcc) throws IOException { - long trx = writeMarker(wal, htd, hri, WALEdit.createFlushWALEdit(hri, f), mvcc, sync); + WALKey walKey = + doFullAppendTransaction(wal, htd, hri, WALEdit.createFlushWALEdit(hri, f), mvcc, sync); if (LOG.isTraceEnabled()) { LOG.trace("Appended flush marker " + TextFormat.shortDebugString(f)); } - return trx; + return walKey; } /** - * Write a region open marker indicating that the region is opened + * Write a region open marker indicating that the region is opened. + * This write is for internal use only. Not for external client consumption. */ - public static long writeRegionEventMarker(WAL wal, HTableDescriptor htd, HRegionInfo hri, + public static WALKey writeRegionEventMarker(WAL wal, HTableDescriptor htd, HRegionInfo hri, final RegionEventDescriptor r, final MultiVersionConcurrencyControl mvcc) throws IOException { - long trx = writeMarker(wal, htd, hri, WALEdit.createRegionEventWALEdit(hri, r), mvcc, true); + WALKey walKey = writeMarker(wal, htd, hri, WALEdit.createRegionEventWALEdit(hri, r), mvcc); if (LOG.isTraceEnabled()) { LOG.trace("Appended region event marker " + TextFormat.shortDebugString(r)); } - return trx; + return walKey; } /** * Write a log marker that a bulk load has succeeded and is about to be committed. - * - * @param wal The log to write into. - * @param htd A description of the table that we are bulk loading into. - * @param hri A description of the region in the table that we are bulk loading into. + * This write is for internal use only. Not for external client consumption. + * @param wal The log to write into. + * @param htd A description of the table that we are bulk loading into. + * @param hri A description of the region in the table that we are bulk loading into. * @param desc A protocol buffers based description of the client's bulk loading request - * @return txid of this transaction or if nothing to do, the last txid + * @return walKey with sequenceid filled out for this bulk load marker * @throws IOException We will throw an IOException if we can not append to the HLog. 
*/ - public static long writeBulkLoadMarkerAndSync(final WAL wal, final HTableDescriptor htd, + public static WALKey writeBulkLoadMarkerAndSync(final WAL wal, final HTableDescriptor htd, final HRegionInfo hri, final WALProtos.BulkLoadDescriptor desc, final MultiVersionConcurrencyControl mvcc) throws IOException { - long trx = writeMarker(wal, htd, hri, WALEdit.createBulkLoadEvent(hri, desc), mvcc, true); + WALKey walKey = writeMarker(wal, htd, hri, WALEdit.createBulkLoadEvent(hri, desc), mvcc); if (LOG.isTraceEnabled()) { LOG.trace("Appended Bulk Load marker " + TextFormat.shortDebugString(desc)); } - return trx; + return walKey; } - private static long writeMarker(final WAL wal, final HTableDescriptor htd, final HRegionInfo hri, - final WALEdit edit, final MultiVersionConcurrencyControl mvcc, final boolean sync) + private static WALKey writeMarker(final WAL wal, final HTableDescriptor htd, + final HRegionInfo hri, final WALEdit edit, final MultiVersionConcurrencyControl mvcc) + throws IOException { + // If sync == true in below, then timeout is not used; safe to pass UNSPECIFIED_TIMEOUT + return doFullAppendTransaction(wal, htd, hri, edit, mvcc, true); + } + + /** + * A 'full' WAL transaction involves starting an mvcc transaction followed by an append, + * an optional sync, and then a call to complete the mvcc transaction. This method does it all. + * Good for case of adding a single edit or marker to the WAL. + * + *
This write is for internal use only. Not for external client consumption. + * @return WALKey that was added to the WAL. + */ + public static WALKey doFullAppendTransaction(final WAL wal, final HTableDescriptor htd, + final HRegionInfo hri, final WALEdit edit, final MultiVersionConcurrencyControl mvcc, + final boolean sync) throws IOException { // TODO: Pass in current time to use? - WALKey key = - new HLogKey(hri.getEncodedNameAsBytes(), hri.getTable(), System.currentTimeMillis(), mvcc); - // Add it to the log but the false specifies that we don't need to add it to the memstore + WALKey walKey = + new WALKey(hri.getEncodedNameAsBytes(), hri.getTable(), System.currentTimeMillis(), mvcc); long trx = MultiVersionConcurrencyControl.NONE; try { - trx = wal.append(htd, hri, key, edit, false); - if (sync) wal.sync(trx); - } finally { - // If you get hung here, is it a real WAL or a mocked WAL? If the latter, you need to - // trip the latch that is inside in getWriteEntry up in your mock. See down in the append - // called from onEvent in FSHLog. - MultiVersionConcurrencyControl.WriteEntry we = key.getWriteEntry(); - if (mvcc != null && we != null) mvcc.complete(we); + trx = wal.append(htd, hri, walKey, edit, false); + if (sync) { + wal.sync(trx); + } + // Call complete only here because these are markers only. They are not for clients to read. + mvcc.complete(walKey.getWriteEntry()); + } catch (IOException ioe) { + mvcc.complete(walKey.getWriteEntry()); + throw ioe; } - return trx; + return walKey; } } \ No newline at end of file diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java index c066803..394a419 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java @@ -1453,7 +1453,7 @@ public class HBaseFsck extends Configured implements Closeable { "You may need to restore the previously sidelined hbase:meta"); return false; } - meta.batchMutate(puts.toArray(new Put[puts.size()])); + meta.batchMutate(puts.toArray(new Put[puts.size()]), HConstants.NO_NONCE, HConstants.NO_NONCE); meta.close(); if (meta.getWAL() != null) { meta.getWAL().close(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALKey.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALKey.java index 4091a82..0d58cd7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALKey.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALKey.java @@ -30,52 +30,50 @@ import java.util.NavigableMap; import java.util.TreeMap; import java.util.UUID; import java.util.concurrent.CountDownLatch; -import java.util.concurrent.TimeUnit; -import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl; -import org.apache.hadoop.hbase.util.ByteStringer; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hbase.classification.InterfaceAudience; -import org.apache.hadoop.hbase.exceptions.TimeoutIOException; import org.apache.hadoop.hbase.HBaseInterfaceAudience; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos; import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FamilyScope; import org.apache.hadoop.hbase.protobuf.generated.WALProtos.ScopeType; +import 
org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl; import org.apache.hadoop.hbase.regionserver.SequenceId; +// imports for things that haven't moved from regionserver.wal yet. +import org.apache.hadoop.hbase.regionserver.wal.CompressionContext; +import org.apache.hadoop.hbase.regionserver.wal.WALCellCodec; +import org.apache.hadoop.hbase.util.ByteStringer; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import com.google.common.annotations.VisibleForTesting; import com.google.protobuf.ByteString; -// imports for things that haven't moved from regionserver.wal yet. -import org.apache.hadoop.hbase.regionserver.wal.CompressionContext; -import org.apache.hadoop.hbase.regionserver.wal.WALCellCodec; - /** - * A Key for an entry in the change log. + * A Key for an entry in the WAL. * * The log intermingles edits to many tables and rows, so each log entry * identifies the appropriate table and row. Within a table and row, they're * also sorted. * - *
Some Transactional edits (START, COMMIT, ABORT) will not have an - * associated row. + *
Some Transactional edits (START, COMMIT, ABORT) will not have an associated row. * * Note that protected members marked @InterfaceAudience.Private are only protected * to support the legacy HLogKey class, which is in a different package. - * - *
*/ // TODO: Key and WALEdit are never used separately, or in one-to-many relation, for practical // purposes. They need to be merged into WALEntry. -// TODO: Cleanup. We have logSeqNum and then WriteEntry, both are sequence id'ing. Fix. @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.REPLICATION) public class WALKey implements SequenceId, Comparable { private static final Log LOG = LogFactory.getLog(WALKey.class); + private final CountDownLatch sequenceIdAssignedLatch = new CountDownLatch(1); + /** + * Used to represent when a particular wal key doesn't know/care about the sequence ordering. + */ + public static final long NO_SEQUENCE_ID = -1; @InterfaceAudience.Private // For internal use only. public MultiVersionConcurrencyControl getMvcc() { @@ -83,25 +81,22 @@ public class WALKey implements SequenceId, Comparable { } /** - * Will block until a write entry has been assigned by they WAL subsystem. - * @return A WriteEntry gotten from local WAL subsystem. Must be completed by calling - * {@link MultiVersionConcurrencyControl#complete(MultiVersionConcurrencyControl.WriteEntry)} - * or - * {@link MultiVersionConcurrencyControl#complete(MultiVersionConcurrencyControl.WriteEntry)} + * @return A WriteEntry gotten from local WAL subsystem. Use it to complete the + * mvcc transaction this WALKey was part of (the transaction is started when you call append; + * see the comment on FSHLog#append). To complete call + * {@link MultiVersionConcurrencyControl#complete(MultiVersionConcurrencyControl.WriteEntry)} + * or + * {@link MultiVersionConcurrencyControl#complete(MultiVersionConcurrencyControl.WriteEntry)} * @see #setWriteEntry(MultiVersionConcurrencyControl.WriteEntry) */ @InterfaceAudience.Private // For internal use only. public MultiVersionConcurrencyControl.WriteEntry getWriteEntry() throws InterruptedIOException { try { - this.seqNumAssignedLatch.await(); + this.sequenceIdAssignedLatch.await(); } catch (InterruptedException ie) { - // If interrupted... clear out our entry else we can block up mvcc. MultiVersionConcurrencyControl mvcc = getMvcc(); - LOG.debug("mvcc=" + mvcc + ", writeEntry=" + this.writeEntry); - if (mvcc != null) { - if (this.writeEntry != null) { - mvcc.complete(this.writeEntry); - } + if (LOG.isDebugEnabled()) { + LOG.debug("mvcc=" + mvcc + ", writeEntry=" + this.writeEntry); } InterruptedIOException iie = new InterruptedIOException(); iie.initCause(ie); @@ -112,11 +107,19 @@ public class WALKey implements SequenceId, Comparable { @InterfaceAudience.Private // For internal use only. public void setWriteEntry(MultiVersionConcurrencyControl.WriteEntry writeEntry) { + if (this.writeEntry != null) { + throw new RuntimeException("Non-null!!!"); + } this.writeEntry = writeEntry; - this.seqNumAssignedLatch.countDown(); + // Set our sequenceid now using WriteEntry. + if (this.writeEntry != null) { + this.sequenceId = this.writeEntry.getWriteNumber(); + } + this.sequenceIdAssignedLatch.countDown(); } - // should be < 0 (@see HLogKey#readFields(DataInput)) + // REMOVE!!!! No more Writables!!!! + // Should be < 0 (@see HLogKey#readFields(DataInput)) // version 2 supports WAL compression // public members here are only public because of HLogKey @InterfaceAudience.Private @@ -163,21 +166,23 @@ public class WALKey implements SequenceId, Comparable { @InterfaceAudience.Private protected static final Version VERSION = Version.COMPRESSED; - /** Used to represent when a particular wal key doesn't know/care about the sequence ordering. 
*/ - public static final long NO_SEQUENCE_ID = -1; - - // visible for deprecated HLogKey @InterfaceAudience.Private protected byte [] encodedRegionName; // visible for deprecated HLogKey @InterfaceAudience.Private protected TableName tablename; - // visible for deprecated HLogKey - @InterfaceAudience.Private - protected long logSeqNum; + /** + * SequenceId for this edit. Set post-construction at write-to-WAL time. Until then it is + * NO_SEQUENCE_ID. Change it so multiple threads can read it -- e.g. access is synchronized. + */ + private long sequenceId; + + /** + * Used during WAL replay; the sequenceId of the edit when it came into the system. + */ private long origLogSeqNum = 0; - private CountDownLatch seqNumAssignedLatch = new CountDownLatch(1); + // Time at which this edit was written. // visible for deprecated HLogKey @InterfaceAudience.Private @@ -193,6 +198,9 @@ public class WALKey implements SequenceId, Comparable { private long nonceGroup = HConstants.NO_NONCE; private long nonce = HConstants.NO_NONCE; private MultiVersionConcurrencyControl mvcc; + /** + * Set in a way visible to multiple threads; e.g. synchronized getter/setters. + */ private MultiVersionConcurrencyControl.WriteEntry writeEntry; public static final List EMPTY_UUIDS = Collections.unmodifiableList(new ArrayList()); @@ -215,10 +223,15 @@ public class WALKey implements SequenceId, Comparable { HConstants.NO_NONCE, HConstants.NO_NONCE, null); } + /** + * @deprecated Remove. Useless. + */ + @Deprecated // REMOVE public WALKey(final byte[] encodedRegionName, final TableName tablename) { this(encodedRegionName, tablename, System.currentTimeMillis()); } + // TODO: Fix being able to pass in sequenceid. public WALKey(final byte[] encodedRegionName, final TableName tablename, final long now) { init(encodedRegionName, tablename, @@ -257,6 +270,7 @@ public class WALKey implements SequenceId, Comparable { * @param now Time at which this edit was written. * @param clusterIds the clusters that have consumed the change(used in Replication) */ + // TODO: Fix being able to pass in sequenceid. public WALKey(final byte[] encodedRegionName, final TableName tablename, long logSeqNum, @@ -300,6 +314,7 @@ public class WALKey implements SequenceId, Comparable { * @param nonceGroup * @param nonce */ + // TODO: Fix being able to pass in sequenceid. public WALKey(final byte[] encodedRegionName, final TableName tablename, long logSeqNum, @@ -325,7 +340,7 @@ public class WALKey implements SequenceId, Comparable { long nonceGroup, long nonce, MultiVersionConcurrencyControl mvcc) { - this.logSeqNum = logSeqNum; + this.sequenceId = logSeqNum; this.writeTime = now; this.clusterIds = clusterIds; this.encodedRegionName = encodedRegionName; @@ -333,6 +348,15 @@ public class WALKey implements SequenceId, Comparable { this.nonceGroup = nonceGroup; this.nonce = nonce; this.mvcc = mvcc; + if (logSeqNum != NO_SEQUENCE_ID) { + setSequenceId(logSeqNum); + } + } + + // For HLogKey and deserialization. DO NOT USE. See setWriteEntry below. 
+ @InterfaceAudience.Private + protected void setSequenceId(long sequenceId) { + this.sequenceId = sequenceId; } /** @@ -352,32 +376,24 @@ public class WALKey implements SequenceId, Comparable { return tablename; } - /** @return log sequence number */ - public long getLogSeqNum() { - return this.logSeqNum; - } - - /** - * Allow that the log sequence id to be set post-construction - * Only public for org.apache.hadoop.hbase.regionserver.wal.FSWALEntry - * @param sequence + /** @return log sequence number + * @deprecated Use {@link #getSequenceId()} */ - @InterfaceAudience.Private - public void setLogSeqNum(final long sequence) { - this.logSeqNum = sequence; - + @Deprecated + public long getLogSeqNum() { + return getSequenceId(); } /** - * Used to set original seq Id for WALKey during wal replay - * @param seqId + * Used to set original sequenceId for WALKey during WAL replay */ - public void setOrigLogSeqNum(final long seqId) { - this.origLogSeqNum = seqId; + public void setOrigLogSeqNum(final long sequenceId) { + this.origLogSeqNum = sequenceId; } /** - * Return a positive long if current WALKey is created from a replay edit + * Return a positive long if current WALKey is created from a replay edit; a replay edit is an + * edit that came in when replaying WALs of a crashed server. * @return original sequence number of the WALEdit */ public long getOrigLogSeqNum() { @@ -385,43 +401,14 @@ public class WALKey implements SequenceId, Comparable { } /** - * Wait for sequence number to be assigned & return the assigned value + * SequenceId is only available post WAL-assign. Calls before this will get you a + * {@link #NO_SEQUENCE_ID}. See the comment on FSHLog#append and #getWriteNumber in this method + * for more on when this sequenceId comes available. * @return long the new assigned sequence number - * @throws IOException */ @Override - public long getSequenceId() throws IOException { - return getSequenceId(-1); - } - - /** - * Wait for sequence number to be assigned & return the assigned value. - * @param maxWaitForSeqId maximum time to wait in milliseconds for sequenceid - * @return long the new assigned sequence number - * @throws IOException - */ - public long getSequenceId(final long maxWaitForSeqId) throws IOException { - // TODO: This implementation waiting on a latch is problematic because if a higher level - // determines we should stop or abort, there is no global list of all these blocked WALKeys - // waiting on a sequence id; they can't be cancelled... interrupted. See getNextSequenceId. - // - // UPDATE: I think we can remove the timeout now we are stamping all walkeys with sequenceid, - // even those that have failed (previously we were not... so they would just hang out...). 
- // St.Ack 20150910 - try { - if (maxWaitForSeqId < 0) { - this.seqNumAssignedLatch.await(); - } else if (!this.seqNumAssignedLatch.await(maxWaitForSeqId, TimeUnit.MILLISECONDS)) { - throw new TimeoutIOException("Failed to get sequenceid after " + maxWaitForSeqId + - "ms; WAL system stuck or has gone away?"); - } - } catch (InterruptedException ie) { - LOG.warn("Thread interrupted waiting for next log sequence number"); - InterruptedIOException iie = new InterruptedIOException(); - iie.initCause(ie); - throw iie; - } - return this.logSeqNum; + public long getSequenceId() { + return this.sequenceId; } /** @@ -495,7 +482,7 @@ public class WALKey implements SequenceId, Comparable { @Override public String toString() { return tablename + "/" + Bytes.toString(encodedRegionName) + "/" + - logSeqNum; + sequenceId; } /** @@ -509,7 +496,7 @@ public class WALKey implements SequenceId, Comparable { Map stringMap = new HashMap(); stringMap.put("table", tablename); stringMap.put("region", Bytes.toStringBinary(encodedRegionName)); - stringMap.put("sequence", logSeqNum); + stringMap.put("sequence", getSequenceId()); return stringMap; } @@ -527,7 +514,7 @@ public class WALKey implements SequenceId, Comparable { @Override public int hashCode() { int result = Bytes.hashCode(this.encodedRegionName); - result ^= this.logSeqNum; + result ^= getSequenceId(); result ^= this.writeTime; return result; } @@ -536,9 +523,11 @@ public class WALKey implements SequenceId, Comparable { public int compareTo(WALKey o) { int result = Bytes.compareTo(this.encodedRegionName, o.encodedRegionName); if (result == 0) { - if (this.logSeqNum < o.logSeqNum) { + long sid = getSequenceId(); + long otherSid = o.getSequenceId(); + if (sid < otherSid) { result = -1; - } else if (this.logSeqNum > o.logSeqNum) { + } else if (sid > otherSid) { result = 1; } if (result == 0) { @@ -592,7 +581,7 @@ public class WALKey implements SequenceId, Comparable { builder.setTableName(compressor.compress(this.tablename.getName(), compressionContext.tableDict)); } - builder.setLogSequenceNumber(this.logSeqNum); + builder.setLogSequenceNumber(getSequenceId()); builder.setWriteTime(writeTime); if (this.origLogSeqNum > 0) { builder.setOrigSequenceNumber(this.origLogSeqNum); @@ -658,10 +647,10 @@ public class WALKey implements SequenceId, Comparable { this.scopes.put(family, scope.getScopeType().getNumber()); } } - this.logSeqNum = walKey.getLogSequenceNumber(); + setSequenceId(walKey.getLogSequenceNumber()); this.writeTime = walKey.getWriteTime(); if(walKey.hasOrigSequenceNumber()) { this.origLogSeqNum = walKey.getOrigSequenceNumber(); } } -} +} \ No newline at end of file diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/IncrementPerformanceTest.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/IncrementPerformanceTest.java new file mode 100644 index 0000000..aed3d0a --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/IncrementPerformanceTest.java @@ -0,0 +1,128 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.ConnectionFactory; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +import com.codahale.metrics.MetricRegistry; +import com.codahale.metrics.Snapshot; +import com.codahale.metrics.Timer; + + +/** + * Simple Increments Performance Test. Run this from main. It is to go against a cluster. + * Presumption is the table exists already. Defaults are a zk ensemble of localhost:2181, + * a tableName of 'tableName', a column famly name of 'columnFamilyName', with 80 threads by + * default and 10000 increments per thread. To change any of these configs, pass -DNAME=VALUE as + * in -DtableName="newTableName". It prints out configuration it is running with at the start and + * on the end it prints out percentiles. + */ +public class IncrementPerformanceTest implements Tool { + private static final Log LOG = LogFactory.getLog(IncrementPerformanceTest.class); + private static final byte [] QUALIFIER = new byte [] {'q'}; + private Configuration conf; + private final MetricRegistry metrics = new MetricRegistry(); + private static final String TABLENAME = "tableName"; + private static final String COLUMN_FAMILY = "columnFamilyName"; + private static final String THREAD_COUNT = "threadCount"; + private static final int DEFAULT_THREAD_COUNT = 80; + private static final String INCREMENT_COUNT = "incrementCount"; + private static final int DEFAULT_INCREMENT_COUNT = 10000; + + IncrementPerformanceTest() {} + + public int run(final String [] args) throws Exception { + Configuration conf = getConf(); + final TableName tableName = TableName.valueOf(conf.get(TABLENAME), TABLENAME); + final byte [] columnFamilyName = Bytes.toBytes(conf.get(COLUMN_FAMILY, COLUMN_FAMILY)); + int threadCount = conf.getInt(THREAD_COUNT, DEFAULT_THREAD_COUNT); + final int incrementCount = conf.getInt(INCREMENT_COUNT, DEFAULT_INCREMENT_COUNT); + LOG.info("Running test with " + HConstants.ZOOKEEPER_QUORUM + "=" + + getConf().get(HConstants.ZOOKEEPER_QUORUM) + ", tableName=" + tableName + + ", columnFamilyName=" + columnFamilyName + ", threadCount=" + threadCount + + ", incrementCount=" + incrementCount); + + ExecutorService service = Executors.newFixedThreadPool(threadCount); + Set> futures = new HashSet>(); + final AtomicInteger integer = new AtomicInteger(0); // needed a simple "final" counter + while (integer.incrementAndGet() <= threadCount) { + futures.add(service.submit(new Runnable() { + @Override + public void run() { + try { + try (Connection connection = 
ConnectionFactory.createConnection(getConf())) { + try (Table table = connection.getTable(tableName)) { + Timer timer = metrics.timer("increments"); + for (int i = 0; i < incrementCount; i++) { + byte[] row = Bytes.toBytes(i); + Timer.Context context = timer.time(); + try { + table.incrementColumnValue(row, columnFamilyName, QUALIFIER, 1l); + } catch (IOException e) { + // swallow..it's a test. + } finally { + context.stop(); + } + } + } + } + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + })); + } + + for(Future future : futures) future.get(); + service.shutdown(); + Snapshot s = metrics.timer("increments").getSnapshot(); + LOG.info(String.format("75th=%s, 95th=%s, 99th=%s", s.get75thPercentile(), + s.get95thPercentile(), s.get99thPercentile())); + return 0; + } + + @Override + public Configuration getConf() { + return this.conf; + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + } + + public static void main(String[] args) throws Exception { + System.exit(ToolRunner.run(HBaseConfiguration.create(), new IncrementPerformanceTest(), args)); + } +} \ No newline at end of file diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestFromClientSide.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestFromClientSide.java index 8734aea..63d9cd0 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestFromClientSide.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestFromClientSide.java @@ -48,7 +48,6 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.CellUtil; -import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.HColumnDescriptor; import org.apache.hadoop.hbase.HConstants; @@ -119,6 +118,7 @@ import org.junit.experimental.categories.Category; @Category({LargeTests.class, ClientTests.class}) @SuppressWarnings ("deprecation") public class TestFromClientSide { + // NOTE: Increment tests were moved to their own class, TestIncrementsFromClientSide. 
private static final Log LOG = LogFactory.getLog(TestFromClientSide.class); protected final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); private static byte [] ROW = Bytes.toBytes("testRow"); @@ -3046,7 +3046,7 @@ public class TestFromClientSide { equals(value, CellUtil.cloneValue(key))); } - private void assertIncrementKey(Cell key, byte [] row, byte [] family, + static void assertIncrementKey(Cell key, byte [] row, byte [] family, byte [] qualifier, long value) throws Exception { assertTrue("Expected row [" + Bytes.toString(row) + "] " + @@ -3270,7 +3270,7 @@ public class TestFromClientSide { return stamps; } - private boolean equals(byte [] left, byte [] right) { + static boolean equals(byte [] left, byte [] right) { if (left == null && right == null) return true; if (left == null && right.length == 0) return true; if (right == null && left.length == 0) return true; @@ -4399,264 +4399,6 @@ public class TestFromClientSide { } @Test - public void testIncrementWithDeletes() throws Exception { - LOG.info("Starting testIncrementWithDeletes"); - final TableName TABLENAME = - TableName.valueOf("testIncrementWithDeletes"); - Table ht = TEST_UTIL.createTable(TABLENAME, FAMILY); - final byte[] COLUMN = Bytes.toBytes("column"); - - ht.incrementColumnValue(ROW, FAMILY, COLUMN, 5); - TEST_UTIL.flush(TABLENAME); - - Delete del = new Delete(ROW); - ht.delete(del); - - ht.incrementColumnValue(ROW, FAMILY, COLUMN, 5); - - Get get = new Get(ROW); - Result r = ht.get(get); - assertEquals(1, r.size()); - assertEquals(5, Bytes.toLong(r.getValue(FAMILY, COLUMN))); - } - - @Test - public void testIncrementingInvalidValue() throws Exception { - LOG.info("Starting testIncrementingInvalidValue"); - final TableName TABLENAME = TableName.valueOf("testIncrementingInvalidValue"); - Table ht = TEST_UTIL.createTable(TABLENAME, FAMILY); - final byte[] COLUMN = Bytes.toBytes("column"); - Put p = new Put(ROW); - // write an integer here (not a Long) - p.addColumn(FAMILY, COLUMN, Bytes.toBytes(5)); - ht.put(p); - try { - ht.incrementColumnValue(ROW, FAMILY, COLUMN, 5); - fail("Should have thrown DoNotRetryIOException"); - } catch (DoNotRetryIOException iox) { - // success - } - Increment inc = new Increment(ROW); - inc.addColumn(FAMILY, COLUMN, 5); - try { - ht.increment(inc); - fail("Should have thrown DoNotRetryIOException"); - } catch (DoNotRetryIOException iox) { - // success - } - } - - @Test - public void testIncrementInvalidArguments() throws Exception { - LOG.info("Starting testIncrementInvalidArguments"); - final TableName TABLENAME = TableName.valueOf("testIncrementInvalidArguments"); - Table ht = TEST_UTIL.createTable(TABLENAME, FAMILY); - final byte[] COLUMN = Bytes.toBytes("column"); - try { - // try null row - ht.incrementColumnValue(null, FAMILY, COLUMN, 5); - fail("Should have thrown IOException"); - } catch (IOException iox) { - // success - } - try { - // try null family - ht.incrementColumnValue(ROW, null, COLUMN, 5); - fail("Should have thrown IOException"); - } catch (IOException iox) { - // success - } - try { - // try null qualifier - ht.incrementColumnValue(ROW, FAMILY, null, 5); - fail("Should have thrown IOException"); - } catch (IOException iox) { - // success - } - // try null row - try { - Increment incNoRow = new Increment((byte [])null); - incNoRow.addColumn(FAMILY, COLUMN, 5); - fail("Should have thrown IllegalArgumentException"); - } catch (IllegalArgumentException iax) { - // success - } catch (NullPointerException npe) { - // success - } - // try null family - try { 
-      Increment incNoFamily = new Increment(ROW);
-      incNoFamily.addColumn(null, COLUMN, 5);
-      fail("Should have thrown IllegalArgumentException");
-    } catch (IllegalArgumentException iax) {
-      // success
-    }
-    // try null qualifier
-    try {
-      Increment incNoQualifier = new Increment(ROW);
-      incNoQualifier.addColumn(FAMILY, null, 5);
-      fail("Should have thrown IllegalArgumentException");
-    } catch (IllegalArgumentException iax) {
-      // success
-    }
-  }
-
-  @Test
-  public void testIncrementOutOfOrder() throws Exception {
-    LOG.info("Starting testIncrementOutOfOrder");
-    final TableName TABLENAME = TableName.valueOf("testIncrementOutOfOrder");
-    Table ht = TEST_UTIL.createTable(TABLENAME, FAMILY);
-
-    byte [][] QUALIFIERS = new byte [][] {
-      Bytes.toBytes("B"), Bytes.toBytes("A"), Bytes.toBytes("C")
-    };
-
-    Increment inc = new Increment(ROW);
-    for (int i=0; i<
+ * <p>There is similar test up in TestAtomicOperation. It does a test where it has 100 threads
+ * doing increments across two column families all on one row and the increments are connected to
+ * prove atomicity on row.
+ */
+@Category(MediumTests.class)
+public class TestRegionIncrement {
+  private static final Log LOG = LogFactory.getLog(TestRegionIncrement.class);
+  @Rule public TestName name = new TestName();
+  @Rule public final TestRule timeout =
+      CategoryBasedTimeout.builder().withTimeout(this.getClass()).
+          withLookingForStuckThread(true).build();
+  private static HBaseTestingUtility TEST_UTIL;
+  private final static byte [] INCREMENT_BYTES = Bytes.toBytes("increment");
+  private static final int THREAD_COUNT = 10;
+  private static final int INCREMENT_COUNT = 10000;
+
+  @Before
+  public void setUp() throws Exception {
+    TEST_UTIL = HBaseTestingUtility.createLocalHTU();
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    TEST_UTIL.cleanupTestDir();
+  }
+
+  private HRegion getRegion(final Configuration conf, final String tableName) throws IOException {
+    WAL wal = new FSHLog(FileSystem.get(conf), TEST_UTIL.getDataTestDir(),
+      TEST_UTIL.getDataTestDir().toString(), conf);
+    return (HRegion)TEST_UTIL.createLocalHRegion(Bytes.toBytes(tableName),
+      HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY, tableName, conf,
+      false, Durability.SKIP_WAL, wal, INCREMENT_BYTES);
+  }
+
+  private void closeRegion(final HRegion region) throws IOException {
+    region.close();
+    region.getWAL().close();
+  }
+
+  @Test
+  public void testMVCCCausingMisRead() throws IOException {
+    final HRegion region = getRegion(TEST_UTIL.getConfiguration(), this.name.getMethodName());
+    try {
+      // ADD TEST HERE!!
+    } finally {
+      closeRegion(region);
+    }
+  }
+
+  /**
+   * Increments a single cell a bunch of times.
+   */
+  private static class SingleCellIncrementer extends Thread {
+    private final int count;
+    private final HRegion region;
+    private final Increment increment;
+
+    SingleCellIncrementer(final int i, final int count, final HRegion region,
+        final Increment increment) {
+      super("" + i);
+      setDaemon(true);
+      this.count = count;
+      this.region = region;
+      this.increment = increment;
+    }
+
+    @Override
+    public void run() {
+      for (int i = 0; i < this.count; i++) {
+        try {
+          this.region.increment(this.increment);
+          // LOG.info(getName() + " " + i);
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      }
+    }
+  }
+
+  /**
+   * Increments a random row's Cell count times.
+   */
+  private static class CrossRowCellIncrementer extends Thread {
+    private final int count;
+    private final HRegion region;
+    private final Increment [] increments;
+
+    CrossRowCellIncrementer(final int i, final int count, final HRegion region, final int range) {
+      super("" + i);
+      setDaemon(true);
+      this.count = count;
+      this.region = region;
+      this.increments = new Increment[range];
+      for (int ii = 0; ii < range; ii++) {
+        this.increments[ii] = new Increment(Bytes.toBytes(ii));
+        this.increments[ii].addColumn(INCREMENT_BYTES, INCREMENT_BYTES, 1);
+      }
+    }
+
+    @Override
+    public void run() {
+      for (int i = 0; i < this.count; i++) {
+        try {
+          int index = ThreadLocalRandom.current().nextInt(0, this.increments.length);
+          this.region.increment(this.increments[index]);
+          // LOG.info(getName() + " " + index);
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      }
+    }
+  }
+
+  /**
+   * Have each thread update its own Cell. Avoid contention with another thread.
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  @Test
+  public void testUnContendedSingleCellIncrement()
+      throws IOException, InterruptedException {
+    final HRegion region = getRegion(TEST_UTIL.getConfiguration(),
+        TestIncrementsFromClientSide.filterStringSoTableNameSafe(this.name.getMethodName()));
+    long startTime = System.currentTimeMillis();
+    try {
+      SingleCellIncrementer [] threads = new SingleCellIncrementer[THREAD_COUNT];
+      for (int i = 0; i < threads.length; i++) {
+        byte [] rowBytes = Bytes.toBytes(i);
+        Increment increment = new Increment(rowBytes);
+        increment.addColumn(INCREMENT_BYTES, INCREMENT_BYTES, 1);
+        threads[i] = new SingleCellIncrementer(i, INCREMENT_COUNT, region, increment);
+      }
+      for (int i = 0; i < threads.length; i++) {
+        threads[i].start();
+      }
+      for (int i = 0; i < threads.length; i++) {
+        threads[i].join();
+      }
+      RegionScanner regionScanner = region.getScanner(new Scan());
+      List<Cell> cells = new ArrayList<Cell>(THREAD_COUNT);
+      while(regionScanner.next(cells)) continue;
+      assertEquals(THREAD_COUNT, cells.size());
+      long total = 0;
+      for (Cell cell: cells) total +=
+        Bytes.toLong(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
+      assertEquals(INCREMENT_COUNT * THREAD_COUNT, total);
+    } finally {
+      closeRegion(region);
+      LOG.info(this.name.getMethodName() + " " + (System.currentTimeMillis() - startTime) + "ms");
+    }
+  }
+
+  /**
+   * Have each thread update a Cell picked at random from a set of rows shared by all threads,
+   * so the threads contend with one another on the same cells.
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  @Test
+  public void testContendedAcrossCellsIncrement()
+      throws IOException, InterruptedException {
+    final HRegion region = getRegion(TEST_UTIL.getConfiguration(),
+        TestIncrementsFromClientSide.filterStringSoTableNameSafe(this.name.getMethodName()));
+    long startTime = System.currentTimeMillis();
+    try {
+      CrossRowCellIncrementer [] threads = new CrossRowCellIncrementer[THREAD_COUNT];
+      for (int i = 0; i < threads.length; i++) {
+        threads[i] = new CrossRowCellIncrementer(i, INCREMENT_COUNT, region, THREAD_COUNT);
+      }
+      for (int i = 0; i < threads.length; i++) {
+        threads[i].start();
+      }
+      for (int i = 0; i < threads.length; i++) {
+        threads[i].join();
+      }
+      RegionScanner regionScanner = region.getScanner(new Scan());
+      List<Cell> cells = new ArrayList<Cell>(100);
+      while(regionScanner.next(cells)) continue;
+      assertEquals(THREAD_COUNT, cells.size());
+      long total = 0;
+      for (Cell cell: cells) total +=
+        Bytes.toLong(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
+      assertEquals(INCREMENT_COUNT * THREAD_COUNT, total);
+    } finally {
+      closeRegion(region);
+      LOG.info(this.name.getMethodName() + " " + (System.currentTimeMillis() - startTime) + "ms");
+    }
+  }
+}
\ No newline at end of file
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestTags.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestTags.java
index 0f7f23a..d99643d 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestTags.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestTags.java
@@ -551,7 +551,7 @@ public class TestTags {
 
   public static class TestCoprocessorForTags extends BaseRegionObserver {
 
-    public static boolean checkTagPresence = false;
+    public static volatile boolean checkTagPresence = false;
     public static List<Tag> tags = null;
 
     @Override
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java
index af47465..f7994f3 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java
@@ -408,13 +408,13 @@ public class TestFSHLog {
       }
       region.flush(true);
       // FlushResult.flushSequenceId is not visible here so go get the current sequence id.
-      long currentSequenceId = region.getSequenceId();
+      long currentSequenceId = region.getReadPoint();
       // Now release the appends
       goslow.setValue(false);
       synchronized (goslow) {
         goslow.notifyAll();
       }
-      assertTrue(currentSequenceId >= region.getSequenceId());
+      assertTrue(currentSequenceId >= region.getReadPoint());
     } finally {
       region.close(true);
       wal.close();
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestWALReplay.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestWALReplay.java
index 549a018..51e2340 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestWALReplay.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestWALReplay.java
@@ -882,7 +882,7 @@ public class TestWALReplay {
     for (HColumnDescriptor hcd : htd.getFamilies()) {
       addRegionEdits(rowName, hcd.getName(), 5, this.ee, region, "x");
     }
-    long lastestSeqNumber = region.getSequenceId();
+    long lastestSeqNumber = region.getReadPoint();
     // get the current seq no
     wal.doCompleteCacheFlush = true;
     // allow complete cache flush with the previous seq number got after first
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestRegionReplicaReplicationEndpointNoMaster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestRegionReplicaReplicationEndpointNoMaster.java
index a870ed8..84d3bfc 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestRegionReplicaReplicationEndpointNoMaster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestRegionReplicaReplicationEndpointNoMaster.java
@@ -318,5 +318,4 @@ public class TestRegionReplicaReplicationEndpointNoMaster {
     closeRegion(HTU, rs0, hriSecondary);
     connection.close();
   }
-
 }
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestCoprocessorScanPolicy.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestCoprocessorScanPolicy.java
index c988761..b4eb798 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestCoprocessorScanPolicy.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestCoprocessorScanPolicy.java
@@ -298,7 +298,7 @@ public class TestCoprocessorScanPolicy {
           newTtl == null ? oldSI.getTtl() : newTtl, family.getKeepDeletedCells(),
           oldSI.getTimeToPurgeDeletes(), oldSI.getComparator());
       return new StoreScanner(store, scanInfo, scan, targetCols,
-        ((HStore) store).getHRegion().getReadpoint(IsolationLevel.READ_COMMITTED));
+        ((HStore) store).getHRegion().getReadPoint(IsolationLevel.READ_COMMITTED));
     } else {
       return s;
     }
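
For reference only, and not part of the patch: one possible shape for the empty testMVCCCausingMisRead() placeholder in the new TestRegionIncrement class above. This sketch reuses that class's getRegion()/closeRegion() helpers, the TEST_UTIL and name members, and the INCREMENT_BYTES family constant; the row and qualifier names ("row", "a", "b") are made up for illustration, and org.apache.hadoop.hbase.client.Get and Result would additionally need to be imported. The idea: a single Increment updates two qualifiers of one row, so a concurrent reader should never observe the two cells out of step; a prematurely advanced MVCC read point is exactly what would allow such a mis-read.

  // Illustrative sketch only; assumes the TestRegionIncrement fixtures described above.
  @Test
  public void testMVCCCausingMisRead() throws IOException, InterruptedException {
    final HRegion region = getRegion(TEST_UTIL.getConfiguration(), this.name.getMethodName());
    final byte [] row = Bytes.toBytes("row");
    final byte [] qualifierA = Bytes.toBytes("a");
    final byte [] qualifierB = Bytes.toBytes("b");
    try {
      // Writer: each Increment bumps both qualifiers of the same row by one.
      Thread incrementer = new Thread("incrementer") {
        @Override
        public void run() {
          try {
            for (int i = 0; i < 1000; i++) {
              Increment increment = new Increment(row);
              increment.addColumn(INCREMENT_BYTES, qualifierA, 1);
              increment.addColumn(INCREMENT_BYTES, qualifierB, 1);
              region.increment(increment);
            }
          } catch (IOException e) {
            throw new RuntimeException(e);
          }
        }
      };
      incrementer.setDaemon(true);
      incrementer.start();
      // Reader: while increments are in flight, the two cells must always move in lock step.
      while (incrementer.isAlive()) {
        Result result = region.get(new Get(row));
        if (result.isEmpty()) continue; // nothing applied yet
        byte [] aBytes = result.getValue(INCREMENT_BYTES, qualifierA);
        byte [] bBytes = result.getValue(INCREMENT_BYTES, qualifierB);
        assertTrue("Saw one qualifier without the other", aBytes != null && bBytes != null);
        assertEquals(Bytes.toLong(aBytes), Bytes.toLong(bBytes));
      }
      incrementer.join();
    } finally {
      closeRegion(region);
    }
  }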