diff --git llap-server/src/java/org/apache/hadoop/hive/llap/cache/LowLevelCacheMemoryManager.java llap-server/src/java/org/apache/hadoop/hive/llap/cache/LowLevelCacheMemoryManager.java index 2132574fb5..25d6e558de 100644 --- llap-server/src/java/org/apache/hadoop/hive/llap/cache/LowLevelCacheMemoryManager.java +++ llap-server/src/java/org/apache/hadoop/hive/llap/cache/LowLevelCacheMemoryManager.java @@ -36,6 +36,11 @@ private final LowLevelCachePolicy evictor; private final LlapDaemonCacheMetrics metrics; private long maxSize; + private LlapOomDebugDump memoryDumpRoot; + + private static final long LOCKING_DEBUG_DUMP_PERIOD_NS = 30 * 1000000000L; // 30 sec. // TODO# use + private static final int LOCKING_DEBUG_DUMP_THRESHOLD = 5; + private static final AtomicLong lastCacheDumpNs = new AtomicLong(0); public LowLevelCacheMemoryManager( long maxSize, LowLevelCachePolicy evictor, LlapDaemonCacheMetrics metrics) { @@ -63,9 +68,10 @@ public void reserveMemory(final long memoryToReserve) { public boolean reserveMemory(final long memoryToReserve, boolean waitForEviction) { // TODO: if this cannot evict enough, it will spin infinitely. Terminate at some point? int badCallCount = 0; - int nextLog = 4; long evictedTotalMetric = 0, reservedTotalMetric = 0, remainingToReserve = memoryToReserve; boolean result = true; + int waitTimeMs = 4; + boolean didDumpIoState = false; while (remainingToReserve > 0) { long usedMem = usedMemory.get(), newUsedMem = usedMem + remainingToReserve; if (newUsedMem <= maxSize) { @@ -75,28 +81,29 @@ public boolean reserveMemory(final long memoryToReserve, boolean waitForEviction } continue; } - if (evictor == null) return false; - // TODO: for one-block case, we could move notification for the last block out of the loop. + if (evictor == null) { + result = false; + break; + } long evicted = evictor.evictSomeBlocks(remainingToReserve); if (evicted == 0) { if (!waitForEviction) { result = false; - break; + break; // Test code path where we don't do more than one attempt. } - ++badCallCount; - if (badCallCount == nextLog) { - LlapIoImpl.LOG.warn("Cannot evict blocks for " + badCallCount + " calls; cache full?"); - nextLog <<= 1; - try { - Thread.sleep(Math.min(1000, nextLog)); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - result = false; - break; - } + didDumpIoState = logEvictionIssue(++badCallCount, didDumpIoState); + waitTimeMs = Math.min(1000, waitTimeMs << 1); + assert waitTimeMs > 0; + try { + Thread.sleep(waitTimeMs); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + result = false; + break; } continue; } + evictedTotalMetric += evicted; badCallCount = 0; // Adjust the memory - we have to account for what we have just evicted. @@ -110,6 +117,7 @@ public boolean reserveMemory(final long memoryToReserve, boolean waitForEviction } usedMem = usedMemory.get(); } + } if (!result) { releaseMemory(reservedTotalMetric); @@ -120,6 +128,39 @@ public boolean reserveMemory(final long memoryToReserve, boolean waitForEviction } + private boolean logEvictionIssue(int badCallCount, boolean didDumpIoState) { + if (badCallCount <= LOCKING_DEBUG_DUMP_THRESHOLD) return didDumpIoState; + String ioStateDump = maybeDumpIoState(didDumpIoState); + if (ioStateDump == null) { + LlapIoImpl.LOG.warn("Cannot evict blocks for " + badCallCount + " calls; cache full?"); + return didDumpIoState; + } else { + LlapIoImpl.LOG.warn("Cannot evict blocks; IO state:\n " + ioStateDump); + return true; + } + } + + private String maybeDumpIoState(boolean didDumpIoState) { + if (didDumpIoState) return null; // No more than once per reader. + long now = System.nanoTime(), last = lastCacheDumpNs.get(); + while (true) { + if (last != 0 && (now - last) < LOCKING_DEBUG_DUMP_PERIOD_NS) { + return null; // We have recently dumped IO state into log. + } + if (lastCacheDumpNs.compareAndSet(last, now)) break; + now = System.nanoTime(); + last = lastCacheDumpNs.get(); + } + try { + StringBuilder sb = new StringBuilder(); + memoryDumpRoot.debugDumpShort(sb); + return sb.toString(); + } catch (Throwable t) { + return "Failed to dump cache state: " + t.getClass() + " " + t.getMessage(); + } + } + + @Override public long forceReservedMemory(int allocationSize, int count) { if (evictor == null) return 0; @@ -152,4 +193,9 @@ public void debugDumpShort(StringBuilder sb) { public void updateMaxSize(long maxSize) { this.maxSize = maxSize; } + + + public void setMemoryDumpRoot(LlapOomDebugDump memoryDumpRoot) { + this.memoryDumpRoot = memoryDumpRoot; + } } diff --git llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapIoImpl.java llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapIoImpl.java index 58535d7646..253532a9bc 100644 --- llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapIoImpl.java +++ llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapIoImpl.java @@ -84,7 +84,7 @@ private final LlapDaemonIOMetrics ioMetrics; private ObjectName buddyAllocatorMXBean; private final Allocator allocator; - private final LlapOomDebugDump memoryDump; + private final LlapOomDebugDump memoryDumpRoot; private LlapIoImpl(Configuration conf) throws IOException { String ioMode = HiveConf.getVar(conf, HiveConf.ConfVars.LLAP_IO_MEMORY_MODE); @@ -150,7 +150,8 @@ private LlapIoImpl(Configuration conf) throws IOException { // Cache uses allocator to allocate and deallocate, create allocator and then caches. BuddyAllocator allocator = new BuddyAllocator(conf, memManager, cacheMetrics); this.allocator = allocator; - this.memoryDump = allocator; + this.memoryDumpRoot = allocator; + memManager.setMemoryDumpRoot(this.memoryDumpRoot); // TODO: This should be refactored... LowLevelCacheImpl cacheImpl = new LowLevelCacheImpl( cacheMetrics, cachePolicy, allocator, true); cache = cacheImpl; @@ -174,7 +175,7 @@ private LlapIoImpl(Configuration conf) throws IOException { bufferManager = cacheImpl; // Cache also serves as buffer manager. } else { this.allocator = new SimpleAllocator(conf); - memoryDump = null; + memoryDumpRoot = null; SimpleBufferManager sbm = new SimpleBufferManager(allocator, cacheMetrics); bufferManager = sbm; cache = sbm; @@ -202,9 +203,9 @@ private void registerMXBeans() { @Override public String getMemoryInfo() { - if (memoryDump == null) return "\nNot using the allocator"; + if (memoryDumpRoot == null) return "\nNot using the allocator"; StringBuilder sb = new StringBuilder(); - memoryDump.debugDumpShort(sb); + memoryDumpRoot.debugDumpShort(sb); return sb.toString(); }