Uploaded image for project: 'Apache YuniKorn'
  1. Apache YuniKorn
  2. YUNIKORN-2 Support Gang Scheduling
  3. YUNIKORN-575

Skip creating placeholders for the completed apps post restart

    XMLWordPrintableJSON

    Details

      Description

      • After a restart, YK tries to recover the completed apps and schedules placeholder pods (even though the real pods are already in the Completed state), which may not be needed. This leads to resource mismanagement.
        gang-app-timeout-1006-5jqqk               0/1     Completed   0          69m
        gang-app-timeout-1007-tw44t               0/1     Completed   0          66m
        gang-app-timeout-1008-dmzc4               0/1     Completed   0          64m
        gang-app-timeout-1008-dwxgq               0/1     Completed   0          64m
        gang-app-timeout-1008-sl2x9               0/1     Completed   0          64m
        tg-timeout-1006-gang-app-timeout-1006-0   1/1     Running     0          60s
        tg-timeout-1006-gang-app-timeout-1006-1   1/1     Running     0          60s
        tg-timeout-1006-gang-app-timeout-1006-2   1/1     Running     0          60s
        tg-timeout-1007-gang-app-timeout-1007-0   1/1     Running     0          60s
        tg-timeout-1007-gang-app-timeout-1007-1   1/1     Running     0          60s
        tg-timeout-1007-gang-app-timeout-1007-2   0/1     Pending     0          60s
        tg-timeout-1008-gang-app-timeout-1008-0   1/1     Running     0          60s
        tg-timeout-1008-gang-app-timeout-1008-1   1/1     Running     0          60s
        tg-timeout-1008-gang-app-timeout-1008-2   1/1     Running     0          60s
        
      • After a restart, all the completed apps are marked as failed and the allocations are not released. This could be a resource leak.
        [
            {
                "allocations": null,
                "applicationID": "gang-app-timeout-1009",
                "applicationState": "Accepted",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868052062417676,
                "usedResource": "[]"
            },
            {
                "allocations": null,
                "applicationID": "gang-app-timeout-1011",
                "applicationState": "Accepted",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868052062788287,
                "usedResource": "[]"
            },
            {
                "allocations": null,
                "applicationID": "gang-app-timeout-1010",
                "applicationState": "Accepted",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868052057156621,
                "usedResource": "[]"
            },
            {
                "allocations": null,
                "applicationID": "gang-app-timeout-1003",
                "applicationState": "Accepted",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868052062023562,
                "usedResource": "[]"
            },
            {
                "allocations": [
                    {
                        "allocationKey": "0a761a05-4b00-4e34-a54d-22411007553a",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1008",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1008-gang-app-timeout-1008-0"
                        },
                        "applicationId": "gang-app-timeout-1008",
                        "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "9704811c-422d-4efa-bb42-ab565fb5f16b"
                    },
                    {
                        "allocationKey": "2505258b-3358-4143-b2a2-9084ffa0977b",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1008",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1008-gang-app-timeout-1008-1"
                        },
                        "applicationId": "gang-app-timeout-1008",
                        "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "e0ff467d-ec18-4d5b-b981-861835f1604a"
                    },
                    {
                        "allocationKey": "29dbfaec-7632-4bff-b4ea-e313521497f1",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1008",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1008-gang-app-timeout-1008-2"
                        },
                        "applicationId": "gang-app-timeout-1008",
                        "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "6723d3ac-c7c8-4935-bb23-3b443909a252"
                    }
                ],
                "applicationID": "gang-app-timeout-1008",
                "applicationState": "Failed",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868050004448061,
                "usedResource": "[]"
            },
            {
                "allocations": [
                    {
                        "allocationKey": "05d87d17-a6dc-4bc0-b495-c76f1cd0a3cb",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1007",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1007-gang-app-timeout-1007-0"
                        },
                        "applicationId": "gang-app-timeout-1007",
                        "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "67401008-61b0-4957-8361-6d0e8917c21f"
                    },
                    {
                        "allocationKey": "1af95692-0186-44fe-b712-30edb51b85c2",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1007",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1007-gang-app-timeout-1007-1"
                        },
                        "applicationId": "gang-app-timeout-1007",
                        "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "5d1f129e-3e40-4103-b2e6-53daf408465f"
                    }
                ],
                "applicationID": "gang-app-timeout-1007",
                "applicationState": "Failed",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868050004840460,
                "usedResource": "[]"
            },
            {
                "allocations": [
                    {
                        "allocationKey": "8524d2ab-a591-4fca-8a5f-3847e8d173ab",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1006",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1006-gang-app-timeout-1006-1"
                        },
                        "applicationId": "gang-app-timeout-1006",
                        "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "909735f0-607b-4799-bf4c-8b45f59c174b"
                    },
                    {
                        "allocationKey": "b33078a1-aac6-4217-afd5-3c80248782dd",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1006",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1006-gang-app-timeout-1006-2"
                        },
                        "applicationId": "gang-app-timeout-1006",
                        "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "80f04647-ada2-4851-9361-d6bcb5c18c65"
                    },
                    {
                        "allocationKey": "e7aa1b09-fac8-43bf-aae9-48215086ae36",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1006",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1006-gang-app-timeout-1006-0"
                        },
                        "applicationId": "gang-app-timeout-1006",
                        "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "f6172318-7e4a-4252-8bf5-8346de4a4d48"
                    }
                ],
                "applicationID": "gang-app-timeout-1006",
                "applicationState": "Failed",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868050003595376,
                "usedResource": "[]"
            }
        ]
        

      YK UI snapshot showing apps marked as failed.

      Log attached: yk_recover.log

        Attachments

        1. Screen Shot 2021-03-15 at 9.27.10 PM.png
          284 kB
          Ayub Pathan
        2. yk_recover.log
          68 kB
          Ayub Pathan

          Issue Links

            Activity

              People

              • Assignee:
                wwei Weiwei Yang
                Reporter:
                ayubpathan Ayub Pathan
              • Votes:
                0 Vote for this issue
                Watchers:
                2 Start watching this issue

                Dates

                • Created:
                  Updated:
                  Resolved: