Details
-
Sub-task
-
Status: Closed
-
Major
-
Resolution: Fixed
-
0.10
Description
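- After a restart, YK tries to recover the completed apps and schedules placeholder pods (even though the real pods are already in the Completed state), which may not be needed. This leads to resource mismanagement, because the placeholders occupy resources on behalf of apps that have already finished.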
gang-app-timeout-1006-5jqqk               0/1   Completed   0   69m
gang-app-timeout-1007-tw44t               0/1   Completed   0   66m
gang-app-timeout-1008-dmzc4               0/1   Completed   0   64m
gang-app-timeout-1008-dwxgq               0/1   Completed   0   64m
gang-app-timeout-1008-sl2x9               0/1   Completed   0   64m
tg-timeout-1006-gang-app-timeout-1006-0   1/1   Running     0   60s
tg-timeout-1006-gang-app-timeout-1006-1   1/1   Running     0   60s
tg-timeout-1006-gang-app-timeout-1006-2   1/1   Running     0   60s
tg-timeout-1007-gang-app-timeout-1007-0   1/1   Running     0   60s
tg-timeout-1007-gang-app-timeout-1007-1   1/1   Running     0   60s
tg-timeout-1007-gang-app-timeout-1007-2   0/1   Pending     0   60s
tg-timeout-1008-gang-app-timeout-1008-0   1/1   Running     0   60s
tg-timeout-1008-gang-app-timeout-1008-1   1/1   Running     0   60s
tg-timeout-1008-gang-app-timeout-1008-2   1/1   Running     0   60s
- After a restart, all the completed apps are marked as failed and their allocations are not released. This could be a resource leak.
[ { "allocations": null, "applicationID": "gang-app-timeout-1009", "applicationState": "Accepted", "partition": "[mycluster]default", "queueName": "root.fifo", "submissionTime": 1615868052062417676, "usedResource": "[]" }, { "allocations": null, "applicationID": "gang-app-timeout-1011", "applicationState": "Accepted", "partition": "[mycluster]default", "queueName": "root.fifo", "submissionTime": 1615868052062788287, "usedResource": "[]" }, { "allocations": null, "applicationID": "gang-app-timeout-1010", "applicationState": "Accepted", "partition": "[mycluster]default", "queueName": "root.fifo", "submissionTime": 1615868052057156621, "usedResource": "[]" }, { "allocations": null, "applicationID": "gang-app-timeout-1003", "applicationState": "Accepted", "partition": "[mycluster]default", "queueName": "root.fifo", "submissionTime": 1615868052062023562, "usedResource": "[]" }, { "allocations": [ { "allocationKey": "0a761a05-4b00-4e34-a54d-22411007553a", "allocationTags": { "kubernetes.io/label/applicationId": "gang-app-timeout-1008", "kubernetes.io/label/queue": "fifo", "kubernetes.io/meta/namespace": "fifo", "kubernetes.io/meta/podName": "tg-timeout-1008-gang-app-timeout-1008-0" }, "applicationId": "gang-app-timeout-1008", "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal", "partition": "default", "priority": "0", "queueName": "root.fifo", "resource": "[memory:300 vcore:300]", "uuid": "9704811c-422d-4efa-bb42-ab565fb5f16b" }, { "allocationKey": "2505258b-3358-4143-b2a2-9084ffa0977b", "allocationTags": { "kubernetes.io/label/applicationId": "gang-app-timeout-1008", "kubernetes.io/label/queue": "fifo", "kubernetes.io/meta/namespace": "fifo", "kubernetes.io/meta/podName": "tg-timeout-1008-gang-app-timeout-1008-1" }, "applicationId": "gang-app-timeout-1008", "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal", "partition": "default", "priority": "0", "queueName": "root.fifo", "resource": "[memory:300 vcore:300]", "uuid": 
"e0ff467d-ec18-4d5b-b981-861835f1604a" }, { "allocationKey": "29dbfaec-7632-4bff-b4ea-e313521497f1", "allocationTags": { "kubernetes.io/label/applicationId": "gang-app-timeout-1008", "kubernetes.io/label/queue": "fifo", "kubernetes.io/meta/namespace": "fifo", "kubernetes.io/meta/podName": "tg-timeout-1008-gang-app-timeout-1008-2" }, "applicationId": "gang-app-timeout-1008", "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal", "partition": "default", "priority": "0", "queueName": "root.fifo", "resource": "[memory:300 vcore:300]", "uuid": "6723d3ac-c7c8-4935-bb23-3b443909a252" } ], "applicationID": "gang-app-timeout-1008", "applicationState": "Failed", "partition": "[mycluster]default", "queueName": "root.fifo", "submissionTime": 1615868050004448061, "usedResource": "[]" }, { "allocations": [ { "allocationKey": "05d87d17-a6dc-4bc0-b495-c76f1cd0a3cb", "allocationTags": { "kubernetes.io/label/applicationId": "gang-app-timeout-1007", "kubernetes.io/label/queue": "fifo", "kubernetes.io/meta/namespace": "fifo", "kubernetes.io/meta/podName": "tg-timeout-1007-gang-app-timeout-1007-0" }, "applicationId": "gang-app-timeout-1007", "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal", "partition": "default", "priority": "0", "queueName": "root.fifo", "resource": "[memory:300 vcore:300]", "uuid": "67401008-61b0-4957-8361-6d0e8917c21f" }, { "allocationKey": "1af95692-0186-44fe-b712-30edb51b85c2", "allocationTags": { "kubernetes.io/label/applicationId": "gang-app-timeout-1007", "kubernetes.io/label/queue": "fifo", "kubernetes.io/meta/namespace": "fifo", "kubernetes.io/meta/podName": "tg-timeout-1007-gang-app-timeout-1007-1" }, "applicationId": "gang-app-timeout-1007", "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal", "partition": "default", "priority": "0", "queueName": "root.fifo", "resource": "[memory:300 vcore:300]", "uuid": "5d1f129e-3e40-4103-b2e6-53daf408465f" } ], "applicationID": "gang-app-timeout-1007", "applicationState": "Failed", "partition": 
"[mycluster]default", "queueName": "root.fifo", "submissionTime": 1615868050004840460, "usedResource": "[]" }, { "allocations": [ { "allocationKey": "8524d2ab-a591-4fca-8a5f-3847e8d173ab", "allocationTags": { "kubernetes.io/label/applicationId": "gang-app-timeout-1006", "kubernetes.io/label/queue": "fifo", "kubernetes.io/meta/namespace": "fifo", "kubernetes.io/meta/podName": "tg-timeout-1006-gang-app-timeout-1006-1" }, "applicationId": "gang-app-timeout-1006", "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal", "partition": "default", "priority": "0", "queueName": "root.fifo", "resource": "[memory:300 vcore:300]", "uuid": "909735f0-607b-4799-bf4c-8b45f59c174b" }, { "allocationKey": "b33078a1-aac6-4217-afd5-3c80248782dd", "allocationTags": { "kubernetes.io/label/applicationId": "gang-app-timeout-1006", "kubernetes.io/label/queue": "fifo", "kubernetes.io/meta/namespace": "fifo", "kubernetes.io/meta/podName": "tg-timeout-1006-gang-app-timeout-1006-2" }, "applicationId": "gang-app-timeout-1006", "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal", "partition": "default", "priority": "0", "queueName": "root.fifo", "resource": "[memory:300 vcore:300]", "uuid": "80f04647-ada2-4851-9361-d6bcb5c18c65" }, { "allocationKey": "e7aa1b09-fac8-43bf-aae9-48215086ae36", "allocationTags": { "kubernetes.io/label/applicationId": "gang-app-timeout-1006", "kubernetes.io/label/queue": "fifo", "kubernetes.io/meta/namespace": "fifo", "kubernetes.io/meta/podName": "tg-timeout-1006-gang-app-timeout-1006-0" }, "applicationId": "gang-app-timeout-1006", "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal", "partition": "default", "priority": "0", "queueName": "root.fifo", "resource": "[memory:300 vcore:300]", "uuid": "f6172318-7e4a-4252-8bf5-8346de4a4d48" } ], "applicationID": "gang-app-timeout-1006", "applicationState": "Failed", "partition": "[mycluster]default", "queueName": "root.fifo", "submissionTime": 1615868050003595376, "usedResource": "[]" } ]
YK UI snapshot showing apps marked as failed.
Attached log: yk_recover.log
Attachments
Attachments
Issue Links
- links to