Details
-
Bug
-
Status: Open
-
Critical
-
Resolution: Unresolved
-
0.11.0.0
-
None
-
None
Description
Hi,
We have a cluster with six nodes from time to time on one particular node we see the connections to the broker turn into CLOSE_WAIT and when we took thread dumps of the broker and analyzed them we found the threads are causing circular deadlock.
- Threads causing circular deadlock: executor-Heartbeat -
> kafka-request-handler-7> kafka-request-handler-1-> kafka-request-handler-7
executor-Heartbeat
priority:5 - threadId:0x00007fa04c076000 - nativeId:0x3277b - nativeId (decimal):206715 - state:BLOCKED
stackTrace:
java.lang.Thread.State: BLOCKED (on object monitor)
at kafka.coordinator.group.GroupCoordinator.onExpireHeartbeat(GroupCoordinator.scala:777)
- waiting to lock <0x00000006d4d81288> (a kafka.coordinator.group.GroupMetadata)
at kafka.coordinator.group.DelayedHeartbeat.onExpiration(DelayedHeartbeat.scala:38)
at kafka.server.DelayedOperation.run(DelayedOperation.scala:113)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Locked ownable synchronizers:
- <0x0000000727100b98> (a java.util.concurrent.ThreadPoolExecutor$Worker)
kafka-request-handler-7
priority:5 - threadId:0x00007fa0d580e000 - nativeId:0x1873f - nativeId (decimal):100159 - state:BLOCKED
stackTrace:
java.lang.Thread.State: BLOCKED (on object monitor)
at kafka.server.DelayedProduce.safeTryComplete(DelayedProduce.scala:75)
- waiting to lock <0x00000006d4d7a8e0> (a kafka.coordinator.group.GroupMetadata)
at kafka.server.DelayedOperationPurgatory$Watchers.tryCompleteWatched(DelayedOperation.scala:338)
at kafka.server.DelayedOperationPurgatory.checkAndComplete(DelayedOperation.scala:244)
at kafka.server.ReplicaManager.tryCompleteDelayedProduce(ReplicaManager.scala:250)
at kafka.cluster.Partition.tryCompleteDelayedRequests(Partition.scala:418)
at kafka.cluster.Partition.appendRecordsToLeader(Partition.scala:500)
at kafka.server.ReplicaManager$$anonfun$appendToLocalLog$2.apply(ReplicaManager.scala:546)
at kafka.server.ReplicaManager$$anonfun$appendToLocalLog$2.apply(ReplicaManager.scala:532)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.Map$Map1.foreach(Map.scala:116)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at kafka.server.ReplicaManager.appendToLocalLog(ReplicaManager.scala:532)
at kafka.server.ReplicaManager.appendRecords(ReplicaManager.scala:373)
at kafka.coordinator.group.GroupMetadataManager.appendForGroup(GroupMetadataManager.scala:239)
at kafka.coordinator.group.GroupMetadataManager.storeOffsets(GroupMetadataManager.scala:381)
at kafka.coordinator.group.GroupCoordinator.doCommitOffsets(GroupCoordinator.scala:465) - locked <0x00000006d4d81288> (a kafka.coordinator.group.GroupMetadata)
at kafka.coordinator.group.GroupCoordinator.handleCommitOffsets(GroupCoordinator.scala:428)
at kafka.server.KafkaApis.handleOffsetCommitRequest(KafkaApis.scala:356)
at kafka.server.KafkaApis.handle(KafkaApis.scala:105)
at kafka.server.KafkaRequestHandler.run(KafkaRequestHandler.scala:66)
at java.lang.Thread.run(Thread.java:745)
Locked ownable synchronizers: - None
kafka-request-handler-1
priority:5 - threadId:0x00007fa0d5803000 - nativeId:0x18739 - nativeId (decimal):100153 - state:BLOCKED
stackTrace:
java.lang.Thread.State: BLOCKED (on object monitor)
at kafka.server.DelayedProduce.safeTryComplete(DelayedProduce.scala:75)
- waiting to lock <0x00000006d4d81288> (a kafka.coordinator.group.GroupMetadata)
at kafka.server.DelayedOperationPurgatory$Watchers.tryCompleteWatched(DelayedOperation.scala:338)
at kafka.server.DelayedOperationPurgatory.checkAndComplete(DelayedOperation.scala:244)
at kafka.server.ReplicaManager.tryCompleteDelayedProduce(ReplicaManager.scala:250)
at kafka.cluster.Partition.tryCompleteDelayedRequests(Partition.scala:418)
at kafka.cluster.Partition.appendRecordsToLeader(Partition.scala:500)
at kafka.server.ReplicaManager$$anonfun$appendToLocalLog$2.apply(ReplicaManager.scala:546)
at kafka.server.ReplicaManager$$anonfun$appendToLocalLog$2.apply(ReplicaManager.scala:532)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.Map$Map1.foreach(Map.scala:116)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at kafka.server.ReplicaManager.appendToLocalLog(ReplicaManager.scala:532)
at kafka.server.ReplicaManager.appendRecords(ReplicaManager.scala:373)
at kafka.coordinator.group.GroupMetadataManager.appendForGroup(GroupMetadataManager.scala:239)
at kafka.coordinator.group.GroupMetadataManager.storeOffsets(GroupMetadataManager.scala:381)
at kafka.coordinator.group.GroupCoordinator.doCommitOffsets(GroupCoordinator.scala:465) - locked <0x00000006d4d7a8e0> (a kafka.coordinator.group.GroupMetadata)
at kafka.coordinator.group.GroupCoordinator.handleCommitOffsets(GroupCoordinator.scala:428)
at kafka.server.KafkaApis.handleOffsetCommitRequest(KafkaApis.scala:356)
at kafka.server.KafkaApis.handle(KafkaApis.scala:105)
at kafka.server.KafkaRequestHandler.run(KafkaRequestHandler.scala:66)
at java.lang.Thread.run(Thread.java:745)
Locked ownable synchronizers: - None
kafka-request-handler-7
priority:5 - threadId:0x00007fa0d580e000 - nativeId:0x1873f - nativeId (decimal):100159 - state:BLOCKED
stackTrace:
java.lang.Thread.State: BLOCKED (on object monitor)
at kafka.server.DelayedProduce.safeTryComplete(DelayedProduce.scala:75)
- waiting to lock <0x00000006d4d7a8e0> (a kafka.coordinator.group.GroupMetadata)
at kafka.server.DelayedOperationPurgatory$Watchers.tryCompleteWatched(DelayedOperation.scala:338)
at kafka.server.DelayedOperationPurgatory.checkAndComplete(DelayedOperation.scala:244)
at kafka.server.ReplicaManager.tryCompleteDelayedProduce(ReplicaManager.scala:250)
at kafka.cluster.Partition.tryCompleteDelayedRequests(Partition.scala:418)
at kafka.cluster.Partition.appendRecordsToLeader(Partition.scala:500)
at kafka.server.ReplicaManager$$anonfun$appendToLocalLog$2.apply(ReplicaManager.scala:546)
at kafka.server.ReplicaManager$$anonfun$appendToLocalLog$2.apply(ReplicaManager.scala:532)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.Map$Map1.foreach(Map.scala:116)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at kafka.server.ReplicaManager.appendToLocalLog(ReplicaManager.scala:532)
at kafka.server.ReplicaManager.appendRecords(ReplicaManager.scala:373)
at kafka.coordinator.group.GroupMetadataManager.appendForGroup(GroupMetadataManager.scala:239)
at kafka.coordinator.group.GroupMetadataManager.storeOffsets(GroupMetadataManager.scala:381)
at kafka.coordinator.group.GroupCoordinator.doCommitOffsets(GroupCoordinator.scala:465) - locked <0x00000006d4d81288> (a kafka.coordinator.group.GroupMetadata)
at kafka.coordinator.group.GroupCoordinator.handleCommitOffsets(GroupCoordinator.scala:428)
at kafka.server.KafkaApis.handleOffsetCommitRequest(KafkaApis.scala:356)
at kafka.server.KafkaApis.handle(KafkaApis.scala:105)
at kafka.server.KafkaRequestHandler.run(KafkaRequestHandler.scala:66)
at java.lang.Thread.run(Thread.java:745)
Locked ownable synchronizers: - None
- Threads causing circular deadlock: group-metadata-manager-0 -
> kafka-request-handler-7> kafka-request-handler-1-> kafka-request-handler-7