Details
-
Bug
-
Status: Open
-
Critical
-
Resolution: Unresolved
-
1.17.2
-
None
-
None
-
- Flink 1.17.2
- Native Kubernetes session cluster with HA (3 JobManager replicas)
Description
I submit batch jobs to my session cluster via the REST API. The JobManager pod restarts whenever the error below occurs.
This seems to happen when a new leader is elected while a job is running at the same time: the job is then sent to the new leader and fails with the error.
2024-10-18 03:07:22,107 INFO org.apache.flink.client.deployment.application.executors.EmbeddedExecutor [] - Submitting Job with JobId=a9d339b6ba26ab51746514cc7aea0537. 2024-10-18 03:07:22,546 INFO org.apache.flink.kubernetes.kubeclient.resources.KubernetesLeaderElector [] - New leader elected 57c4be1d-58f0-4c2c-89d8-11aefe1ec273 for flink-cluster-cluster-config-map. 2024-10-18 03:07:22,549 ERROR org.apache.flink.runtime.entrypoint.ClusterEntrypoint [] - Fatal error occurred in the cluster entrypoint.org.apache.flink.util.FlinkException: JobMaster for job 99c02051a54c77499f53f09cd4b7a0d9 failed. at org.apache.flink.runtime.dispatcher.Dispatcher.jobMasterFailed(Dispatcher.java:1360) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.dispatcher.Dispatcher.jobManagerRunnerFailed(Dispatcher.java:772) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.dispatcher.Dispatcher.lambda$runJob$6(Dispatcher.java:694) ~[flink-dist-1.17.2.jar:1.17.2] at java.util.concurrent.CompletableFuture.uniHandle(Unknown Source) ~[?:?] at java.util.concurrent.CompletableFuture$UniHandle.tryFire(Unknown Source) ~[?:?] at java.util.concurrent.CompletableFuture$Completion.run(Unknown Source) ~[?:?] 
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.lambda$handleRunAsync$4(AkkaRpcActor.java:453) ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68) ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:453) ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:218) ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:84) ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:168) ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at scala.PartialFunction.applyOrElse(PartialFunction.scala:127) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at scala.PartialFunction.applyOrElse$(PartialFunction.scala:126) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at akka.actor.Actor.aroundReceive(Actor.scala:537) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at akka.actor.Actor.aroundReceive$(Actor.scala:535) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at akka.actor.ActorCell.receiveMessage(ActorCell.scala:579) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at akka.actor.ActorCell.invoke(ActorCell.scala:547) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at akka.dispatch.Mailbox.run(Mailbox.scala:231) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at akka.dispatch.Mailbox.exec(Mailbox.scala:243) [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2] at java.util.concurrent.ForkJoinTask.doExec(Unknown Source) [?:?] at java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(Unknown Source) [?:?] at java.util.concurrent.ForkJoinPool.scan(Unknown Source) [?:?] at java.util.concurrent.ForkJoinPool.runWorker(Unknown Source) [?:?] at java.util.concurrent.ForkJoinWorkerThread.run(Unknown Source) [?:?] caused by: org.apache.flink.util.FlinkException: Could not suspend the job manager. 
at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$null$13(JobMasterServiceLeadershipRunner.java:438) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$handleAsyncOperationError$14(JobMasterServiceLeadershipRunner.java:436) ~[flink-dist-1.17.2.jar:1.17.2] at java.util.concurrent.CompletableFuture.uniWhenComplete(Unknown Source) ~[?:?] at java.util.concurrent.CompletableFuture.uniWhenCompleteStage(Unknown Source) ~[?:?] at java.util.concurrent.CompletableFuture.whenComplete(Unknown Source) ~[?:?] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.handleAsyncOperationError(JobMasterServiceLeadershipRunner.java:433) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:405) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.revokeLeadership(JobMasterServiceLeadershipRunner.java:390) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.onRevokeLeadership(DefaultLeaderElectionService.java:236) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.leaderelection.DefaultMultipleComponentLeaderElectionService.lambda$forEachLeaderElectionEventHandler$2(DefaultMultipleComponentLeaderElectionService.java:225) ~[flink-dist-1.17.2.jar:1.17.2] at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) ~[?:?] at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) ~[?:?] 
at java.lang.Thread.run(Unknown Source) ~[?:?] caused by: java.util.concurrent.CompletionException: java.lang.UnsupportedOperationException: Still waiting for the leadership. at java.util.concurrent.CompletableFuture.encodeThrowable(Unknown Source) ~[?:?] at java.util.concurrent.CompletableFuture.uniComposeStage(Unknown Source) ~[?:?] at java.util.concurrent.CompletableFuture.thenCompose(Unknown Source) ~[?:?] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:398) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.revokeLeadership(JobMasterServiceLeadershipRunner.java:390) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.onRevokeLeadership(DefaultLeaderElectionService.java:236) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.leaderelection.DefaultMultipleComponentLeaderElectionService.lambda$forEachLeaderElectionEventHandler$2(DefaultMultipleComponentLeaderElectionService.java:225) ~[flink-dist-1.17.2.jar:1.17.2] at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) ~[?:?] at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) ~[?:?] at java.lang.Thread.run(Unknown Source) ~[?:?] caused by: java.lang.UnsupportedOperationException: Still waiting for the leadership. 
at org.apache.flink.runtime.jobmaster.JobMasterServiceProcess$WaitingForLeadership.getLeaderSessionId(JobMasterServiceProcess.java:71) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcess(JobMasterServiceLeadershipRunner.java:414) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.callIfRunning(JobMasterServiceLeadershipRunner.java:469) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$stopJobMasterServiceProcessAsync$12(JobMasterServiceLeadershipRunner.java:400) ~[flink-dist-1.17.2.jar:1.17.2] at java.util.concurrent.CompletableFuture.uniComposeStage(Unknown Source) ~[?:?] at java.util.concurrent.CompletableFuture.thenCompose(Unknown Source) ~[?:?] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:398) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.revokeLeadership(JobMasterServiceLeadershipRunner.java:390) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.onRevokeLeadership(DefaultLeaderElectionService.java:236) ~[flink-dist-1.17.2.jar:1.17.2] at org.apache.flink.runtime.leaderelection.DefaultMultipleComponentLeaderElectionService.lambda$forEachLeaderElectionEventHandler$2(DefaultMultipleComponentLeaderElectionService.java:225) ~[flink-dist-1.17.2.jar:1.17.2] at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) ~[?:?] at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) ~[?:?] at java.lang.Thread.run(Unknown Source) ~[?:?] 
INFO org.apache.flink.runtime.entrypoint.ClusterEntrypoint [] - Shutting KubernetesSessionClusterEntrypoint down with application status UNKNOWN. Diagnostics Cluster entrypoint has been closed externally..INFO org.apache.flink.runtime.blob.BlobServer [] - Stopped BLOB server at 0.0.0.0:6124