Uploaded image for project: 'Apache YuniKorn'
  1. Apache YuniKorn
  2. YUNIKORN-2 Support Gang Scheduling
  3. YUNIKORN-575

Skip creating placeholders for the completed apps post restart

    XML | Word | Printable | JSON

Details

    Description

      • After a restart, YK tries to recover the completed apps and schedules placeholder pods (even though the real pods are already in the Completed state), which may not be needed. This leads to resource mismanagement.
        gang-app-timeout-1006-5jqqk               0/1     Completed   0          69m
        gang-app-timeout-1007-tw44t               0/1     Completed   0          66m
        gang-app-timeout-1008-dmzc4               0/1     Completed   0          64m
        gang-app-timeout-1008-dwxgq               0/1     Completed   0          64m
        gang-app-timeout-1008-sl2x9               0/1     Completed   0          64m
        tg-timeout-1006-gang-app-timeout-1006-0   1/1     Running     0          60s
        tg-timeout-1006-gang-app-timeout-1006-1   1/1     Running     0          60s
        tg-timeout-1006-gang-app-timeout-1006-2   1/1     Running     0          60s
        tg-timeout-1007-gang-app-timeout-1007-0   1/1     Running     0          60s
        tg-timeout-1007-gang-app-timeout-1007-1   1/1     Running     0          60s
        tg-timeout-1007-gang-app-timeout-1007-2   0/1     Pending     0          60s
        tg-timeout-1008-gang-app-timeout-1008-0   1/1     Running     0          60s
        tg-timeout-1008-gang-app-timeout-1008-1   1/1     Running     0          60s
        tg-timeout-1008-gang-app-timeout-1008-2   1/1     Running     0          60s
        
      • After a restart, all the completed apps are marked as Failed and their allocations are not released. This could be a resource leak after the restart.
        [
            {
                "allocations": null,
                "applicationID": "gang-app-timeout-1009",
                "applicationState": "Accepted",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868052062417676,
                "usedResource": "[]"
            },
            {
                "allocations": null,
                "applicationID": "gang-app-timeout-1011",
                "applicationState": "Accepted",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868052062788287,
                "usedResource": "[]"
            },
            {
                "allocations": null,
                "applicationID": "gang-app-timeout-1010",
                "applicationState": "Accepted",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868052057156621,
                "usedResource": "[]"
            },
            {
                "allocations": null,
                "applicationID": "gang-app-timeout-1003",
                "applicationState": "Accepted",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868052062023562,
                "usedResource": "[]"
            },
            {
                "allocations": [
                    {
                        "allocationKey": "0a761a05-4b00-4e34-a54d-22411007553a",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1008",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1008-gang-app-timeout-1008-0"
                        },
                        "applicationId": "gang-app-timeout-1008",
                        "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "9704811c-422d-4efa-bb42-ab565fb5f16b"
                    },
                    {
                        "allocationKey": "2505258b-3358-4143-b2a2-9084ffa0977b",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1008",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1008-gang-app-timeout-1008-1"
                        },
                        "applicationId": "gang-app-timeout-1008",
                        "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "e0ff467d-ec18-4d5b-b981-861835f1604a"
                    },
                    {
                        "allocationKey": "29dbfaec-7632-4bff-b4ea-e313521497f1",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1008",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1008-gang-app-timeout-1008-2"
                        },
                        "applicationId": "gang-app-timeout-1008",
                        "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "6723d3ac-c7c8-4935-bb23-3b443909a252"
                    }
                ],
                "applicationID": "gang-app-timeout-1008",
                "applicationState": "Failed",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868050004448061,
                "usedResource": "[]"
            },
            {
                "allocations": [
                    {
                        "allocationKey": "05d87d17-a6dc-4bc0-b495-c76f1cd0a3cb",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1007",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1007-gang-app-timeout-1007-0"
                        },
                        "applicationId": "gang-app-timeout-1007",
                        "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "67401008-61b0-4957-8361-6d0e8917c21f"
                    },
                    {
                        "allocationKey": "1af95692-0186-44fe-b712-30edb51b85c2",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1007",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1007-gang-app-timeout-1007-1"
                        },
                        "applicationId": "gang-app-timeout-1007",
                        "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "5d1f129e-3e40-4103-b2e6-53daf408465f"
                    }
                ],
                "applicationID": "gang-app-timeout-1007",
                "applicationState": "Failed",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868050004840460,
                "usedResource": "[]"
            },
            {
                "allocations": [
                    {
                        "allocationKey": "8524d2ab-a591-4fca-8a5f-3847e8d173ab",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1006",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1006-gang-app-timeout-1006-1"
                        },
                        "applicationId": "gang-app-timeout-1006",
                        "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "909735f0-607b-4799-bf4c-8b45f59c174b"
                    },
                    {
                        "allocationKey": "b33078a1-aac6-4217-afd5-3c80248782dd",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1006",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1006-gang-app-timeout-1006-2"
                        },
                        "applicationId": "gang-app-timeout-1006",
                        "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "80f04647-ada2-4851-9361-d6bcb5c18c65"
                    },
                    {
                        "allocationKey": "e7aa1b09-fac8-43bf-aae9-48215086ae36",
                        "allocationTags": {
                            "kubernetes.io/label/applicationId": "gang-app-timeout-1006",
                            "kubernetes.io/label/queue": "fifo",
                            "kubernetes.io/meta/namespace": "fifo",
                            "kubernetes.io/meta/podName": "tg-timeout-1006-gang-app-timeout-1006-0"
                        },
                        "applicationId": "gang-app-timeout-1006",
                        "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal",
                        "partition": "default",
                        "priority": "0",
                        "queueName": "root.fifo",
                        "resource": "[memory:300 vcore:300]",
                        "uuid": "f6172318-7e4a-4252-8bf5-8346de4a4d48"
                    }
                ],
                "applicationID": "gang-app-timeout-1006",
                "applicationState": "Failed",
                "partition": "[mycluster]default",
                "queueName": "root.fifo",
                "submissionTime": 1615868050003595376,
                "usedResource": "[]"
            }
        ]
        

      YK UI snapshot showing apps marked as failed.

      Log attached: yk_recover.log

      Attachments

        1. Screen Shot 2021-03-15 at 9.27.10 PM.png
          284 kB
          Ayub Pathan
        2. yk_recover.log
          68 kB
          Ayub Pathan

        Issue Links

          Activity

            People

              wwei Weiwei Yang
              ayubpathan Ayub Pathan
              Votes:
              0 Vote for this issue
              Watchers:
              2 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: