Uploaded image for project: 'Apache Ozone'
  1. Apache Ozone
  2. HDDS-9807

[EC][SCM] Incorrect check of available space on datanodes in case of allocating blocks

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Resolved
    • Major
    • Resolution: Fixed
    • 1.4.0
    • 1.5.0
    • EC, SCM

    Description

      SCM incorrectly checks datanode availability when allocating blocks: it doesn't take into account the committed space (the sum of the maximum sizes of the containers already created on the datanode).

      Imagine the case:

      1. The cluster has 10 datanodes, each with 2 GB of storage mounted at /data
      ./hadoop-ozone/dist/target/ozone-1.4.0-SNAPSHOT/compose/ozone/docker-compose.yaml

      version: "3.8"
      
      # reusable fragments (see https://docs.docker.com/compose/compose-file/#extension-fields)
      x-common-config: 
        &common-config
        image: ${OZONE_RUNNER_IMAGE}:${OZONE_RUNNER_VERSION}
        volumes: 
          - ../..:/opt/hadoop
        env_file: 
          - docker-config
      
      x-replication: 
        &replication
        OZONE-SITE.XML_ozone.server.default.replication: ${OZONE_REPLICATION_FACTOR:-1}
      
      services: 
        datanode1:
          <<: *common-config
          ports: 
            - 19864
            - 9882
            - 9001:5005
          environment: 
            <<: *replication
            OZONE_OPTS: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005
          command: [ "ozone","datanode" ]
          volumes: 
            - tmpfs1:/data
            - ../..:/opt/hadoop
        datanode2:
          <<: *common-config
          ports: 
            - 19864
            - 9882
            - 9002:5005
          environment: 
            <<: *replication
            OZONE_OPTS: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005
          command: [ "ozone","datanode" ]
          volumes: 
            - tmpfs2:/data
            - ../..:/opt/hadoop
        datanode3:
          <<: *common-config
          ports: 
            - 19864
            - 9882
            - 9003:5005
          environment: 
            <<: *replication
            OZONE_OPTS: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005
          command: [ "ozone","datanode" ]
          volumes: 
            - tmpfs3:/data
            - ../..:/opt/hadoop
        datanode4:
          <<: *common-config
          ports: 
            - 19864
            - 9882
            - 9004:5005
          environment: 
            <<: *replication
            OZONE_OPTS: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005
          command: [ "ozone","datanode" ]
          volumes: 
            - tmpfs4:/data
            - ../..:/opt/hadoop
        datanode5:
          <<: *common-config
          ports: 
            - 19864
            - 9882
            - 9005:5005
          environment: 
            <<: *replication
            OZONE_OPTS: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005
          command: [ "ozone","datanode" ]
          volumes: 
            - tmpfs5:/data
            - ../..:/opt/hadoop
        datanode6:
          <<: *common-config
          ports: 
            - 19864
            - 9882
            - 9006:5005
          environment: 
            <<: *replication
            OZONE_OPTS: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005
          command: [ "ozone","datanode" ]
          volumes: 
            - tmpfs6:/data
            - ../..:/opt/hadoop
        datanode7:
          <<: *common-config
          ports: 
            - 19864
            - 9882
            - 9007:5005
          environment: 
            <<: *replication
            OZONE_OPTS: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005
          command: [ "ozone","datanode" ]
          volumes: 
            - tmpfs7:/data
            - ../..:/opt/hadoop
        datanode8:
          <<: *common-config
          ports: 
            - 19864
            - 9882
            - 9008:5005
          environment: 
            <<: *replication
            OZONE_OPTS: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005
          command: [ "ozone","datanode" ]
          volumes: 
            - tmpfs8:/data
            - ../..:/opt/hadoop
        datanode9:
          <<: *common-config
          ports: 
            - 19864
            - 9882
            - 9009:5005
          environment: 
            <<: *replication
            OZONE_OPTS: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005
          command: [ "ozone","datanode" ]
          volumes: 
            - tmpfs9:/data
            - ../..:/opt/hadoop
        datanode10:
          <<: *common-config
          ports: 
            - 19864
            - 9882
            - 9010:5005
          environment: 
            <<: *replication
            OZONE_OPTS: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005
          command: [ "ozone","datanode" ]
          volumes: 
            - tmpfs10:/data
            - ../..:/opt/hadoop
        om: 
          <<: *common-config
          environment: 
            ENSURE_OM_INITIALIZED: /data/metadata/om/current/VERSION
            OZONE_OPTS: 
            <<: *replication
          ports: 
            - 9874:9874
            - 9862:9862
          command: ["ozone","om"]
        scm: 
          <<: *common-config
          ports: 
            - 9876:9876
            - 9860:9860
          environment: 
            ENSURE_SCM_INITIALIZED: /data/metadata/scm/current/VERSION
            OZONE-SITE.XML_hdds.scm.safemode.min.datanode: ${OZONE_SAFEMODE_MIN_DATANODES:-1}
            OZONE_OPTS: 
            <<: *replication
          command: ["ozone","scm"]
        httpfs: 
          <<: *common-config
          environment: 
            OZONE-SITE.XML_hdds.scm.safemode.min.datanode: ${OZONE_SAFEMODE_MIN_DATANODES:-1}
            <<: *replication
          ports: 
            - 14000:14000
          command: [ "ozone","httpfs" ]
        s3g: 
          <<: *common-config
          environment: 
            OZONE_OPTS: 
            <<: *replication
          ports: 
            - 9878:9878
          command: ["ozone","s3g"]
        recon: 
          <<: *common-config
          ports: 
            - 9888:9888
          environment: 
            OZONE_OPTS: 
            <<: *replication
          command: ["ozone","recon"]
      volumes: 
        tmpfs1:
          driver: local
          driver_opts: 
            o: "size=2g,uid=1000"
            device: tmpfs
            type: tmpfs
        tmpfs2:
          driver: local
          driver_opts: 
            o: "size=2g,uid=2000"
            device: tmpfs
            type: tmpfs
        tmpfs3:
          driver: local
          driver_opts: 
            o: "size=2g,uid=3000"
            device: tmpfs
            type: tmpfs
        tmpfs4:
          driver: local
          driver_opts: 
            o: "size=2g,uid=4000"
            device: tmpfs
            type: tmpfs
        tmpfs5:
          driver: local
          driver_opts: 
            o: "size=2g,uid=5000"
            device: tmpfs
            type: tmpfs
        tmpfs6:
          driver: local
          driver_opts: 
            o: "size=2g,uid=6000"
            device: tmpfs
            type: tmpfs
        tmpfs7:
          driver: local
          driver_opts: 
            o: "size=2g,uid=7000"
            device: tmpfs
            type: tmpfs
        tmpfs8:
          driver: local
          driver_opts: 
            o: "size=2g,uid=8000"
            device: tmpfs
            type: tmpfs
        tmpfs9:
          driver: local
          driver_opts: 
            o: "size=2g,uid=9000"
            device: tmpfs
            type: tmpfs
        tmpfs10:
          driver: local
          driver_opts: 
            o: "size=2g,uid=10000"
            device: tmpfs
            type: tmpfs
      
      

      ./hadoop-ozone/dist/target/ozone-1.4.0-SNAPSHOT/compose/ozone/.env

      ...
      OZONE_REPLICATION_FACTOR=3
      ...
      

      ./hadoop-ozone/dist/target/ozone-1.4.0-SNAPSHOT/compose/ozone/docker-config

      ...
      OZONE-SITE.XML_ozone.scm.pipeline.creation.auto.factor.one=false
      ...
      

      2. There is an EC-bucket with rs-6-3-1024k replication config

      ozone sh volume create data
      ozone sh bucket create data/bucket1 --type EC --replication rs-6-3-1024k --layout LEGACY
      ozone sh bucket link data/bucket1 s3v/bucket1
      

      3. Create a 200 KiB file and put it into the bucket

      head -c 200KiB </dev/urandom > /tmp/test_file_200KiB
      ozone sh key put s3v/bucket1/test_key_200KiB_001 /tmp/test_file_200KiB
      

      A new EC-pipeline will be created:

      #scm log
      2023-11-30 08:33:26,124 [IPC Server handler 7 on default port 9863] INFO pipeline.WritableECContainerProvider: Created and opened new pipeline Pipeline[ Id: 70b771a8-0141-4447-8e0f-730b9fba2c34, Nodes: 05f06265-66e3-407d-9429-a31754686468(ozone-datanode4-1.ozone_default/192.168.176.5)3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa(ozone-datanode5-1.ozone_default/192.168.176.3)dbb2a07e-5b7d-4aef-a7cc-aed3134563ae(ozone-datanode10-1.ozone_default/192.168.176.6)1f6131be-4cec-465d-a5cd-cf7b87824b7f(ozone-datanode3-1.ozone_default/192.168.176.7)afd41e81-1ead-4c5a-b087-8f1bb69e2574(ozone-datanode6-1.ozone_default/192.168.176.2)9a144484-a05a-42e4-813e-4aaccf390ea8(ozone-datanode9-1.ozone_default/192.168.176.12)65fd7e45-140b-4524-b0e3-800ca5fb0724(ozone-datanode8-1.ozone_default/192.168.176.8)993705b9-1599-4901-a629-56fbc4c29971(ozone-datanode2-1.ozone_default/192.168.176.13)8614d173-4001-46d4-a4e2-1a30339b8585(ozone-datanode1-1.ozone_default/192.168.176.10), ReplicationConfig: EC{rs-6-3-1024k}, State:ALLOCATED, leaderId:, CreationTimestamp2023-11-30T08:33:26.080416Z[UTC]]
      
      # ozone admin pipeline list
      Pipeline[ Id: 077f1a30-0dec-4538-a66f-509583223052, Nodes: 05f06265-66e3-407d-9429-a31754686468(ozone-datanode4-1.ozone_default/192.168.176.5)1f6131be-4cec-465d-a5cd-cf7b87824b7f(ozone-datanode3-1.ozone_default/192.168.176.7)8614d173-4001-46d4-a4e2-1a30339b8585(ozone-datanode1-1.ozone_default/192.168.176.10), ReplicationConfig: RATIS/THREE, State:OPEN, leaderId:05f06265-66e3-407d-9429-a31754686468, CreationTimestamp2023-11-30T08:30:47.873Z[UTC]]
      Pipeline[ Id: cda08d91-afee-4d31-ad16-02ea3313e502, Nodes: afd41e81-1ead-4c5a-b087-8f1bb69e2574(ozone-datanode6-1.ozone_default/192.168.176.2)65fd7e45-140b-4524-b0e3-800ca5fb0724(ozone-datanode8-1.ozone_default/192.168.176.8)993705b9-1599-4901-a629-56fbc4c29971(ozone-datanode2-1.ozone_default/192.168.176.13), ReplicationConfig: RATIS/THREE, State:OPEN, leaderId:afd41e81-1ead-4c5a-b087-8f1bb69e2574, CreationTimestamp2023-11-30T08:30:47.508Z[UTC]]
      Pipeline[ Id: 70b771a8-0141-4447-8e0f-730b9fba2c34, Nodes: 05f06265-66e3-407d-9429-a31754686468(ozone-datanode4-1.ozone_default/192.168.176.5)3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa(ozone-datanode5-1.ozone_default/192.168.176.3)dbb2a07e-5b7d-4aef-a7cc-aed3134563ae(ozone-datanode10-1.ozone_default/192.168.176.6)1f6131be-4cec-465d-a5cd-cf7b87824b7f(ozone-datanode3-1.ozone_default/192.168.176.7)afd41e81-1ead-4c5a-b087-8f1bb69e2574(ozone-datanode6-1.ozone_default/192.168.176.2)9a144484-a05a-42e4-813e-4aaccf390ea8(ozone-datanode9-1.ozone_default/192.168.176.12)65fd7e45-140b-4524-b0e3-800ca5fb0724(ozone-datanode8-1.ozone_default/192.168.176.8)993705b9-1599-4901-a629-56fbc4c29971(ozone-datanode2-1.ozone_default/192.168.176.13)8614d173-4001-46d4-a4e2-1a30339b8585(ozone-datanode1-1.ozone_default/192.168.176.10), ReplicationConfig: EC{rs-6-3-1024k}, State:OPEN, leaderId:, CreationTimestamp2023-11-30T08:33:26.080Z[UTC]]
      Pipeline[ Id: d46e8c43-ed23-460a-8200-bb4af0599cae, Nodes: 9a144484-a05a-42e4-813e-4aaccf390ea8(ozone-datanode9-1.ozone_default/192.168.176.12)e68158dc-6f86-4304-b78d-86c4fa93cd7d(ozone-datanode7-1.ozone_default/192.168.176.15)3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa(ozone-datanode5-1.ozone_default/192.168.176.3), ReplicationConfig: RATIS/THREE, State:OPEN, leaderId:3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa, CreationTimestamp2023-11-30T08:30:48.503Z[UTC]]
      

      Datanodes usageinfo

      Usage Information (1 Datanodes)
      
      UUID         : 8614d173-4001-46d4-a4e2-1a30339b8585 
      IP Address   : 192.168.176.10 
      Hostname     : ozone-datanode1-1.ozone_default 
      Capacity     : 2147483648 B (2 GB) 
      Total Used   : 179290112 B (170.98 MB) 
      Total Used % : 8.35% 
      Ozone Used   : 204800 B (200 KB) 
      Ozone Used % : 0.01% 
      Remaining    : 1968193536 B (1.83 GB) 
      Remaining %  : 91.65% 
      Container(s) : 1 
      
      Usage Information (1 Datanodes)
      
      UUID         : 993705b9-1599-4901-a629-56fbc4c29971 
      IP Address   : 192.168.176.13 
      Hostname     : ozone-datanode2-1.ozone_default 
      Capacity     : 2147483648 B (2 GB) 
      Total Used   : 179290112 B (170.98 MB) 
      Total Used % : 8.35% 
      Ozone Used   : 204800 B (200 KB) 
      Ozone Used % : 0.01% 
      Remaining    : 1968193536 B (1.83 GB) 
      Remaining %  : 91.65% 
      Container(s) : 1 
      
      Usage Information (1 Datanodes)
      
      UUID         : 9a144484-a05a-42e4-813e-4aaccf390ea8 
      IP Address   : 192.168.176.12 
      Hostname     : ozone-datanode9-1.ozone_default 
      Capacity     : 2147483648 B (2 GB) 
      Total Used   : 179085312 B (170.79 MB) 
      Total Used % : 8.34% 
      Ozone Used   : 0 B (0 B) 
      Ozone Used % : 0.00% 
      Remaining    : 1968398336 B (1.83 GB) 
      Remaining %  : 91.66% 
      Container(s) : 1 
      
      Usage Information (1 Datanodes)
      
      UUID         : afd41e81-1ead-4c5a-b087-8f1bb69e2574 
      IP Address   : 192.168.176.2 
      Hostname     : ozone-datanode6-1.ozone_default 
      Capacity     : 2147483648 B (2 GB) 
      Total Used   : 179085312 B (170.79 MB) 
      Total Used % : 8.34% 
      Ozone Used   : 0 B (0 B) 
      Ozone Used % : 0.00% 
      Remaining    : 1968398336 B (1.83 GB) 
      Remaining %  : 91.66% 
      Container(s) : 1 
      
      Usage Information (1 Datanodes)
      
      UUID         : dbb2a07e-5b7d-4aef-a7cc-aed3134563ae 
      IP Address   : 192.168.176.6 
      Hostname     : ozone-datanode10-1.ozone_default 
      Capacity     : 2147483648 B (2 GB) 
      Total Used   : 174878720 B (166.78 MB) 
      Total Used % : 8.14% 
      Ozone Used   : 0 B (0 B) 
      Ozone Used % : 0.00% 
      Remaining    : 1972604928 B (1.84 GB) 
      Remaining %  : 91.86% 
      Container(s) : 1 
      
      Usage Information (1 Datanodes)
      
      UUID         : e68158dc-6f86-4304-b78d-86c4fa93cd7d 
      IP Address   : 192.168.176.15 
      Hostname     : ozone-datanode7-1.ozone_default 
      Capacity     : 2147483648 B (2 GB) 
      Total Used   : 31440896 B (29.98 MB) 
      Total Used % : 1.46% 
      Ozone Used   : 0 B (0 B) 
      Ozone Used % : 0.00% 
      Remaining    : 2116042752 B (1.97 GB) 
      Remaining %  : 98.54% 
      Container(s) : 0 
      
      Usage Information (1 Datanodes)
      
      UUID         : 05f06265-66e3-407d-9429-a31754686468 
      IP Address   : 192.168.176.5 
      Hostname     : ozone-datanode4-1.ozone_default 
      Capacity     : 2147483648 B (2 GB) 
      Total Used   : 179290112 B (170.98 MB) 
      Total Used % : 8.35% 
      Ozone Used   : 204800 B (200 KB) 
      Ozone Used % : 0.01% 
      Remaining    : 1968193536 B (1.83 GB) 
      Remaining %  : 91.65% 
      Container(s) : 1 
      
      Usage Information (1 Datanodes)
      
      UUID         : 1f6131be-4cec-465d-a5cd-cf7b87824b7f 
      IP Address   : 192.168.176.7 
      Hostname     : ozone-datanode3-1.ozone_default 
      Capacity     : 2147483648 B (2 GB) 
      Total Used   : 179085312 B (170.79 MB) 
      Total Used % : 8.34% 
      Ozone Used   : 0 B (0 B) 
      Ozone Used % : 0.00% 
      Remaining    : 1968398336 B (1.83 GB) 
      Remaining %  : 91.66% 
      Container(s) : 1 
      
      Usage Information (1 Datanodes)
      
      UUID         : 3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa 
      IP Address   : 192.168.176.3 
      Hostname     : ozone-datanode5-1.ozone_default 
      Capacity     : 2147483648 B (2 GB) 
      Total Used   : 179085312 B (170.79 MB) 
      Total Used % : 8.34% 
      Ozone Used   : 0 B (0 B) 
      Ozone Used % : 0.00% 
      Remaining    : 1968398336 B (1.83 GB) 
      Remaining %  : 91.66% 
      Container(s) : 1 
      
      Usage Information (1 Datanodes)
      
      UUID         : 65fd7e45-140b-4524-b0e3-800ca5fb0724 
      IP Address   : 192.168.176.8 
      Hostname     : ozone-datanode8-1.ozone_default 
      Capacity     : 2147483648 B (2 GB) 
      Total Used   : 179290112 B (170.98 MB) 
      Total Used % : 8.35% 
      Ozone Used   : 204800 B (200 KB) 
      Ozone Used % : 0.01% 
      Remaining    : 1968193536 B (1.83 GB) 
      Remaining %  : 91.65% 
      Container(s) : 1 
      
      

      4. Now let's try to create a 100 MiB file and put it into the same bucket

      head -c 100MiB </dev/urandom > /tmp/test_file_100MiB
      ozone sh key put s3v/bucket1/test_key_100MiB_001 /tmp/test_file_100MiB
      

      The request fails with the following error on the client side:

      INTERNAL_ERROR No enough datanodes to choose. TotalNodes = 10 AvailableNodes = 0 RequiredNodes = 9 ExcludedNodes = 10 UsedNodes = 0
      

      The SCM creates EC-pipelines up to the max pipeline count:

      ozone-scm-1         | 2023-11-30 08:40:04,485 [IPC Server handler 20 on default port 9863] INFO algorithms.SCMContainerPlacementRackScatter: Chosen nodes: [1f6131be-4cec-465d-a5cd-cf7b87824b7f(ozone-datanode3-1.ozone_default/192.168.176.7), 993705b9-1599-4901-a629-56fbc4c29971(ozone-datanode2-1.ozone_default/192.168.176.13), 9a144484-a05a-42e4-813e-4aaccf390ea8(ozone-datanode9-1.ozone_default/192.168.176.12), 3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa(ozone-datanode5-1.ozone_default/192.168.176.3), 8614d173-4001-46d4-a4e2-1a30339b8585(ozone-datanode1-1.ozone_default/192.168.176.10), e68158dc-6f86-4304-b78d-86c4fa93cd7d(ozone-datanode7-1.ozone_default/192.168.176.15), 65fd7e45-140b-4524-b0e3-800ca5fb0724(ozone-datanode8-1.ozone_default/192.168.176.8), afd41e81-1ead-4c5a-b087-8f1bb69e2574(ozone-datanode6-1.ozone_default/192.168.176.2), dbb2a07e-5b7d-4aef-a7cc-aed3134563ae(ozone-datanode10-1.ozone_default/192.168.176.6)]. isPolicySatisfied: true.
      ozone-scm-1         | 2023-11-30 08:40:04,502 [IPC Server handler 20 on default port 9863] INFO pipeline.WritableECContainerProvider: Created and opened new pipeline Pipeline[ Id: 42d76b70-84f5-42a1-980a-e3fc3445edb6, Nodes: 1f6131be-4cec-465d-a5cd-cf7b87824b7f(ozone-datanode3-1.ozone_default/192.168.176.7)993705b9-1599-4901-a629-56fbc4c29971(ozone-datanode2-1.ozone_default/192.168.176.13)9a144484-a05a-42e4-813e-4aaccf390ea8(ozone-datanode9-1.ozone_default/192.168.176.12)3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa(ozone-datanode5-1.ozone_default/192.168.176.3)8614d173-4001-46d4-a4e2-1a30339b8585(ozone-datanode1-1.ozone_default/192.168.176.10)e68158dc-6f86-4304-b78d-86c4fa93cd7d(ozone-datanode7-1.ozone_default/192.168.176.15)65fd7e45-140b-4524-b0e3-800ca5fb0724(ozone-datanode8-1.ozone_default/192.168.176.8)afd41e81-1ead-4c5a-b087-8f1bb69e2574(ozone-datanode6-1.ozone_default/192.168.176.2)dbb2a07e-5b7d-4aef-a7cc-aed3134563ae(ozone-datanode10-1.ozone_default/192.168.176.6), ReplicationConfig: EC{rs-6-3-1024k}, State:ALLOCATED, leaderId:, CreationTimestamp2023-11-30T08:40:04.487343Z[UTC]]
      ozone-scm-1         | 2023-11-30 08:40:04,503 [IPC Server handler 20 on default port 9863] INFO algorithms.SCMContainerPlacementRackScatter: Chosen nodes: [afd41e81-1ead-4c5a-b087-8f1bb69e2574(ozone-datanode6-1.ozone_default/192.168.176.2), 1f6131be-4cec-465d-a5cd-cf7b87824b7f(ozone-datanode3-1.ozone_default/192.168.176.7), 8614d173-4001-46d4-a4e2-1a30339b8585(ozone-datanode1-1.ozone_default/192.168.176.10), 3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa(ozone-datanode5-1.ozone_default/192.168.176.3), dbb2a07e-5b7d-4aef-a7cc-aed3134563ae(ozone-datanode10-1.ozone_default/192.168.176.6), 05f06265-66e3-407d-9429-a31754686468(ozone-datanode4-1.ozone_default/192.168.176.5), e68158dc-6f86-4304-b78d-86c4fa93cd7d(ozone-datanode7-1.ozone_default/192.168.176.15), 993705b9-1599-4901-a629-56fbc4c29971(ozone-datanode2-1.ozone_default/192.168.176.13), 9a144484-a05a-42e4-813e-4aaccf390ea8(ozone-datanode9-1.ozone_default/192.168.176.12)]. isPolicySatisfied: true.
      ozone-scm-1         | 2023-11-30 08:40:04,510 [IPC Server handler 20 on default port 9863] INFO pipeline.WritableECContainerProvider: Created and opened new pipeline Pipeline[ Id: 498dfea3-17ee-4600-a3b9-94727c1cd729, Nodes: afd41e81-1ead-4c5a-b087-8f1bb69e2574(ozone-datanode6-1.ozone_default/192.168.176.2)1f6131be-4cec-465d-a5cd-cf7b87824b7f(ozone-datanode3-1.ozone_default/192.168.176.7)8614d173-4001-46d4-a4e2-1a30339b8585(ozone-datanode1-1.ozone_default/192.168.176.10)3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa(ozone-datanode5-1.ozone_default/192.168.176.3)dbb2a07e-5b7d-4aef-a7cc-aed3134563ae(ozone-datanode10-1.ozone_default/192.168.176.6)05f06265-66e3-407d-9429-a31754686468(ozone-datanode4-1.ozone_default/192.168.176.5)e68158dc-6f86-4304-b78d-86c4fa93cd7d(ozone-datanode7-1.ozone_default/192.168.176.15)993705b9-1599-4901-a629-56fbc4c29971(ozone-datanode2-1.ozone_default/192.168.176.13)9a144484-a05a-42e4-813e-4aaccf390ea8(ozone-datanode9-1.ozone_default/192.168.176.12), ReplicationConfig: EC{rs-6-3-1024k}, State:ALLOCATED, leaderId:, CreationTimestamp2023-11-30T08:40:04.503388Z[UTC]]
      ozone-scm-1         | 2023-11-30 08:40:04,511 [IPC Server handler 20 on default port 9863] INFO algorithms.SCMContainerPlacementRackScatter: Chosen nodes: [8614d173-4001-46d4-a4e2-1a30339b8585(ozone-datanode1-1.ozone_default/192.168.176.10), 993705b9-1599-4901-a629-56fbc4c29971(ozone-datanode2-1.ozone_default/192.168.176.13), 05f06265-66e3-407d-9429-a31754686468(ozone-datanode4-1.ozone_default/192.168.176.5), 9a144484-a05a-42e4-813e-4aaccf390ea8(ozone-datanode9-1.ozone_default/192.168.176.12), 65fd7e45-140b-4524-b0e3-800ca5fb0724(ozone-datanode8-1.ozone_default/192.168.176.8), afd41e81-1ead-4c5a-b087-8f1bb69e2574(ozone-datanode6-1.ozone_default/192.168.176.2), e68158dc-6f86-4304-b78d-86c4fa93cd7d(ozone-datanode7-1.ozone_default/192.168.176.15), dbb2a07e-5b7d-4aef-a7cc-aed3134563ae(ozone-datanode10-1.ozone_default/192.168.176.6), 3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa(ozone-datanode5-1.ozone_default/192.168.176.3)]. isPolicySatisfied: true.
      ozone-scm-1         | 2023-11-30 08:40:04,518 [IPC Server handler 20 on default port 9863] INFO pipeline.WritableECContainerProvider: Created and opened new pipeline Pipeline[ Id: 93539a72-b48d-4a22-8d0d-bac58d217e42, Nodes: 8614d173-4001-46d4-a4e2-1a30339b8585(ozone-datanode1-1.ozone_default/192.168.176.10)993705b9-1599-4901-a629-56fbc4c29971(ozone-datanode2-1.ozone_default/192.168.176.13)05f06265-66e3-407d-9429-a31754686468(ozone-datanode4-1.ozone_default/192.168.176.5)9a144484-a05a-42e4-813e-4aaccf390ea8(ozone-datanode9-1.ozone_default/192.168.176.12)65fd7e45-140b-4524-b0e3-800ca5fb0724(ozone-datanode8-1.ozone_default/192.168.176.8)afd41e81-1ead-4c5a-b087-8f1bb69e2574(ozone-datanode6-1.ozone_default/192.168.176.2)e68158dc-6f86-4304-b78d-86c4fa93cd7d(ozone-datanode7-1.ozone_default/192.168.176.15)dbb2a07e-5b7d-4aef-a7cc-aed3134563ae(ozone-datanode10-1.ozone_default/192.168.176.6)3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa(ozone-datanode5-1.ozone_default/192.168.176.3), ReplicationConfig: EC{rs-6-3-1024k}, State:ALLOCATED, leaderId:, CreationTimestamp2023-11-30T08:40:04.511440Z[UTC]]
      ozone-scm-1         | 2023-11-30 08:40:04,518 [IPC Server handler 20 on default port 9863] INFO algorithms.SCMContainerPlacementRackScatter: Chosen nodes: [1f6131be-4cec-465d-a5cd-cf7b87824b7f(ozone-datanode3-1.ozone_default/192.168.176.7), afd41e81-1ead-4c5a-b087-8f1bb69e2574(ozone-datanode6-1.ozone_default/192.168.176.2), dbb2a07e-5b7d-4aef-a7cc-aed3134563ae(ozone-datanode10-1.ozone_default/192.168.176.6), 9a144484-a05a-42e4-813e-4aaccf390ea8(ozone-datanode9-1.ozone_default/192.168.176.12), 993705b9-1599-4901-a629-56fbc4c29971(ozone-datanode2-1.ozone_default/192.168.176.13), 8614d173-4001-46d4-a4e2-1a30339b8585(ozone-datanode1-1.ozone_default/192.168.176.10), 05f06265-66e3-407d-9429-a31754686468(ozone-datanode4-1.ozone_default/192.168.176.5), 3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa(ozone-datanode5-1.ozone_default/192.168.176.3), e68158dc-6f86-4304-b78d-86c4fa93cd7d(ozone-datanode7-1.ozone_default/192.168.176.15)]. isPolicySatisfied: true.
      ozone-scm-1         | 2023-11-30 08:40:04,529 [IPC Server handler 20 on default port 9863] INFO pipeline.WritableECContainerProvider: Created and opened new pipeline Pipeline[ Id: 8f8cce33-8631-4e42-b3ac-34f8708be23a, Nodes: 1f6131be-4cec-465d-a5cd-cf7b87824b7f(ozone-datanode3-1.ozone_default/192.168.176.7)afd41e81-1ead-4c5a-b087-8f1bb69e2574(ozone-datanode6-1.ozone_default/192.168.176.2)dbb2a07e-5b7d-4aef-a7cc-aed3134563ae(ozone-datanode10-1.ozone_default/192.168.176.6)9a144484-a05a-42e4-813e-4aaccf390ea8(ozone-datanode9-1.ozone_default/192.168.176.12)993705b9-1599-4901-a629-56fbc4c29971(ozone-datanode2-1.ozone_default/192.168.176.13)8614d173-4001-46d4-a4e2-1a30339b8585(ozone-datanode1-1.ozone_default/192.168.176.10)05f06265-66e3-407d-9429-a31754686468(ozone-datanode4-1.ozone_default/192.168.176.5)3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa(ozone-datanode5-1.ozone_default/192.168.176.3)e68158dc-6f86-4304-b78d-86c4fa93cd7d(ozone-datanode7-1.ozone_default/192.168.176.15), ReplicationConfig: EC{rs-6-3-1024k}, State:ALLOCATED, leaderId:, CreationTimestamp2023-11-30T08:40:04.518973Z[UTC]]
      

      But the datanodes reserved for these pipelines are unable to create new containers:

      ozone-datanode8-1   | 2023-11-30 08:40:06,062 [65fd7e45-140b-4524-b0e3-800ca5fb0724-ChunkReader-6] INFO volume.CapacityVolumeChoosingPolicy: No volumes have enough space for a new container.  Most available space: 894656512 bytes; required space: 1073741824, volumes: {/data/hdds/hdds=free: 1968193536, committed: 1073537024}
      ozone-datanode1-1   | 2023-11-30 08:40:06,063 [8614d173-4001-46d4-a4e2-1a30339b8585-ChunkReader-5] INFO volume.CapacityVolumeChoosingPolicy: No volumes have enough space for a new container.  Most available space: 894656512 bytes; required space: 1073741824, volumes: {/data/hdds/hdds=free: 1968193536, committed: 1073537024}
      ozone-datanode9-1   | 2023-11-30 08:40:06,070 [9a144484-a05a-42e4-813e-4aaccf390ea8-ChunkReader-4] INFO volume.CapacityVolumeChoosingPolicy: No volumes have enough space for a new container.  Most available space: 894656512 bytes; required space: 1073741824, volumes: {/data/hdds/hdds=free: 1968398336, committed: 1073741824}
      ozone-datanode2-1   | 2023-11-30 08:40:06,078 [993705b9-1599-4901-a629-56fbc4c29971-ChunkReader-6] INFO volume.CapacityVolumeChoosingPolicy: No volumes have enough space for a new container.  Most available space: 894656512 bytes; required space: 1073741824, volumes: {/data/hdds/hdds=free: 1968193536, committed: 1073537024}
      ozone-datanode3-1   | 2023-11-30 08:40:06,093 [1f6131be-4cec-465d-a5cd-cf7b87824b7f-ChunkReader-4] INFO volume.CapacityVolumeChoosingPolicy: No volumes have enough space for a new container.  Most available space: 894656512 bytes; required space: 1073741824, volumes: {/data/hdds/hdds=free: 1968398336, committed: 1073741824}
      ozone-datanode5-1   | 2023-11-30 08:40:06,102 [3c4549f6-e3b9-44a5-8e8b-5c1078ddaffa-ChunkReader-4] INFO volume.CapacityVolumeChoosingPolicy: No volumes have enough space for a new container.  Most available space: 894656512 bytes; required space: 1073741824, volumes: {/data/hdds/hdds=free: 1968398336, committed: 1073741824}
      ozone-datanode6-1   | 2023-11-30 08:40:06,136 [afd41e81-1ead-4c5a-b087-8f1bb69e2574-ChunkReader-3] INFO volume.CapacityVolumeChoosingPolicy: No volumes have enough space for a new container.  Most available space: 894656512 bytes; required space: 1073741824, volumes: {/data/hdds/hdds=free: 1968398336, committed: 1073741824}
      ozone-datanode10-1  | 2023-11-30 08:40:06,150 [dbb2a07e-5b7d-4aef-a7cc-aed3134563ae-ChunkReader-4] INFO volume.CapacityVolumeChoosingPolicy: No volumes have enough space for a new container.  Most available space: 898863104 bytes; required space: 1073741824, volumes: {/data/hdds/hdds=free: 1972604928, committed: 1073741824}
      ozone-datanode8-1   | 2023-11-30 08:40:06,239 [65fd7e45-140b-4524-b0e3-800ca5fb0724-ChunkReader-6] WARN keyvalue.KeyValueHandler: Operation: CreateContainer , Trace ID:  , Message: Container creation failed, due to disk out of space , Result: DISK_OUT_OF_SPACE , StorageContainerException Occurred.
      ozone-datanode8-1   | org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException: Container creation failed, due to disk out of space
      ozone-datanode8-1   | 	at org.apache.hadoop.ozone.container.keyvalue.KeyValueContainer.create(KeyValueContainer.java:162)
      ozone-datanode8-1   | 	at org.apache.hadoop.ozone.container.keyvalue.KeyValueHandler.handleCreateContainer(KeyValueHandler.java:367)
      ozone-datanode8-1   | 	at org.apache.hadoop.ozone.container.keyvalue.KeyValueHandler.dispatchRequest(KeyValueHandler.java:239)
      ozone-datanode8-1   | 	at org.apache.hadoop.ozone.container.keyvalue.KeyValueHandler.handle(KeyValueHandler.java:222)
      ozone-datanode8-1   | 	at org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.createContainer(HddsDispatcher.java:469)
      ozone-datanode8-1   | 	at org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.dispatchRequest(HddsDispatcher.java:275)
      ozone-datanode8-1   | 	at org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.lambda$dispatch$0(HddsDispatcher.java:179)
      ozone-datanode8-1   | 	at org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:89)
      ozone-datanode8-1   | 	at org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.dispatch(HddsDispatcher.java:178)
      ozone-datanode8-1   | 	at org.apache.hadoop.ozone.container.common.transport.server.GrpcXceiverService$1.onNext(GrpcXceiverService.java:57)
      ozone-datanode8-1   | 	at org.apache.hadoop.ozone.container.common.transport.server.GrpcXceiverService$1.onNext(GrpcXceiverService.java:50)
      ozone-datanode8-1   | 	at org.apache.ratis.thirdparty.io.grpc.stub.ServerCalls$StreamingServerCallHandler$StreamingServerCallListener.onMessage(ServerCalls.java:262)
      ozone-datanode8-1   | 	at org.apache.ratis.thirdparty.io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
      ozone-datanode8-1   | 	at org.apache.hadoop.hdds.tracing.GrpcServerInterceptor$1.onMessage(GrpcServerInterceptor.java:49)
      ozone-datanode8-1   | 	at org.apache.ratis.thirdparty.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailableInternal(ServerCallImpl.java:329)
      ozone-datanode8-1   | 	at org.apache.ratis.thirdparty.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailable(ServerCallImpl.java:314)
      ozone-datanode8-1   | 	at org.apache.ratis.thirdparty.io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1MessagesAvailable.runInContext(ServerImpl.java:833)
      ozone-datanode8-1   | 	at org.apache.ratis.thirdparty.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
      ozone-datanode8-1   | 	at org.apache.ratis.thirdparty.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
      ozone-datanode8-1   | 	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
      ozone-datanode8-1   | 	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
      ozone-datanode8-1   | 	at java.base/java.lang.Thread.run(Thread.java:829)
      ozone-datanode8-1   | Caused by: org.apache.hadoop.util.DiskChecker$DiskOutOfSpaceException: No volumes have enough space for a new container.  Most available space: 894656512 bytes
      ozone-datanode8-1   | 	at org.apache.hadoop.ozone.container.common.volume.VolumeChoosingUtil.throwDiskOutOfSpace(VolumeChoosingUtil.java:38)
      ozone-datanode8-1   | 	at org.apache.hadoop.ozone.container.common.volume.CapacityVolumeChoosingPolicy.chooseVolume(CapacityVolumeChoosingPolicy.java:68)
      ozone-datanode8-1   | 	at org.apache.hadoop.ozone.container.keyvalue.KeyValueContainer.create(KeyValueContainer.java:160)
      ozone-datanode8-1   | 	... 21 more
      

      This happens because the SCM and the datanodes check volume availability in different ways:

      SCM (org.apache.hadoop.hdds.scm.container.placement.algorithms.SCMContainerPlacementRackScatter#chooseNode(java.lang.String, java.util.List<org.apache.hadoop.hdds.scm.net.Node>, long, long) -> org.apache.hadoop.hdds.scm.SCMCommonPlacementPolicy#isValidNode -> org.apache.hadoop.hdds.scm.SCMCommonPlacementPolicy#hasEnoughSpace)

      if (dataSizeRequired > 0) {
            for (StorageReportProto reportProto : datanodeInfo.getStorageReports()) {
              if (reportProto.getRemaining() > dataSizeRequired) {
                enoughForData = true;
                break;
              }
            }
          } else {
            enoughForData = true;
          }
      

      Datanode (org.apache.hadoop.ozone.container.common.volume.AvailableSpaceFilter#test)

      public boolean test(HddsVolume vol) {
          long volumeCapacity = vol.getCapacity();
          long free = vol.getAvailable();
          long committed = vol.getCommittedBytes();
          long available = free - committed;
          long volumeFreeSpace =
              VolumeUsage.getMinVolumeFreeSpace(vol.getConf(), volumeCapacity);
          boolean hasEnoughSpace =
              available > Math.max(requiredSpace, volumeFreeSpace);
      
          mostAvailableSpace = Math.max(available, mostAvailableSpace);
      
          if (!hasEnoughSpace) {
            fullVolumes.put(vol, new AvailableSpace(free, committed));
          }
      
          return hasEnoughSpace;
        }
      

      The SCM doesn't take the committed space into account, so it assumes the datanode has room to allocate new containers when in fact it does not.

      Attachments

        Issue Links

          Activity

            People

              vtutrinov Vyacheslav Tutrinov
              vtutrinov Vyacheslav Tutrinov
              Votes:
              0 Vote for this issue
              Watchers:
              1 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: