Apache Ozone / HDDS-6261

OM crashes when trying to overwrite a key during upgrade/downgrade testing


Details

    Description

      While working on HDDS-6084 (upgrade acceptance testing), erose and I found that if:
      1) a key is created with a new OM version (1.3.0),
      2) the OMs are downgraded to 1.1.0, and
      3) the key created in (1) is overwritten (see the client-level sketch below),

      then step (3) crashes all 3 OMs.
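      For reference, the "overwrite" in step (3) is simply a second write to the same key name. The original repro goes through the S3 gateway (the request in the log below originates from the s3g container), but roughly the same CreateKey request reaches the OM when the key is rewritten through the Ozone Java client. A minimal sketch, assuming an OM HA service id of "omservice" (the volume, bucket, and key names are the ones from the log):

      import org.apache.hadoop.hdds.conf.OzoneConfiguration;
      import org.apache.hadoop.ozone.client.OzoneBucket;
      import org.apache.hadoop.ozone.client.OzoneClient;
      import org.apache.hadoop.ozone.client.OzoneClientFactory;
      import org.apache.hadoop.ozone.client.io.OzoneOutputStream;

      import java.nio.charset.StandardCharsets;

      public class OverwriteKeyRepro {
        public static void main(String[] args) throws Exception {
          OzoneConfiguration conf = new OzoneConfiguration();
          // "omservice" is a placeholder OM HA service id; in the repro this is the
          // docker-compose HA cluster from the upgrade acceptance tests.
          try (OzoneClient client = OzoneClientFactory.getRpcClient("omservice", conf)) {
            OzoneBucket bucket = client.getObjectStore()
                .getVolume("s3v")
                .getBucket("old1-bucket");

            byte[] data = "new contents".getBytes(StandardCharsets.UTF_8);
            // Writing to an existing key name issues another CreateKey to the OM.
            // On the downgraded 1.1.0 OM this is the request that crashes it, because
            // validateAndUpdateCache first reads the existing OmKeyInfo from the key table
            // (see the stack trace below).
            try (OzoneOutputStream out = bucket.createKey("key2", data.length)) {
              out.write(data);
            }
          }
        }
      }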

      The issue appears to have been introduced on the unreleased 1.3.0 branch. It cannot be reproduced in 1.1.0-to-1.2.0 upgrade/downgrade testing, which should rule out HDDS-5243 and HDDS-5393 as potential root causes. This suggests that some unreleased change has broken key versioning across the downgrade, quite possibly an incompatible change. Needs further investigation.

      om2_1    | 2022-02-03 21:36:15,228 [OM StateMachine ApplyTransaction Thread - 0] ERROR ratis.OzoneManagerStateMachine: Terminating with exit status 1: Request cmdType: CreateKey
      om2_1    | traceID: ""
      om2_1    | clientId: "client-72B024AF247D"
      om2_1    | userInfo {
      om2_1    |   userName: "dlfknslnfslf"
      om2_1    |   remoteAddress: "10.9.0.19"
      om2_1    |   hostName: "ha_s3g_1.ha_net"
      om2_1    | }
      om2_1    | version: 1
      om2_1    | createKeyRequest {
      om2_1    |   keyArgs {
      om2_1    |     volumeName: "s3v"
      om2_1    |     bucketName: "old1-bucket"
      om2_1    |     keyName: "key2"
      om2_1    |     dataSize: 17539
      om2_1    |     type: RATIS
      om2_1    |     factor: THREE
      om2_1    |     keyLocations {
      om2_1    |       blockID {
      om2_1    |         containerBlockID {
      om2_1    |           containerID: 1
      om2_1    |           localID: 107736214721200128
      om2_1    |         }
      om2_1    |         blockCommitSequenceId: 0
      om2_1    |       }
      om2_1    |       offset: 0
      om2_1    |       length: 268435456
      om2_1    |       createVersion: 0
      om2_1    |       pipeline {
      om2_1    |         members {
      om2_1    |           uuid: "b92bf4c8-3b0c-40b0-bb2b-05b6d3594e13"
      om2_1    |           ipAddress: "10.9.0.16"
      om2_1    |           hostName: "ha_dn2_1.ha_net"
      om2_1    |           ports {
      om2_1    |             name: "REPLICATION"
      om2_1    |             value: 9886
      om2_1    |           }
      om2_1    |           ports {
      om2_1    |             name: "RATIS"
      om2_1    |             value: 9858
      om2_1    |           }
      om2_1    |           ports {
      om2_1    |             name: "RATIS_ADMIN"
      om2_1    |             value: 9857
      om2_1    |           }
      om2_1    |           ports {
      om2_1    |             name: "RATIS_SERVER"
      om2_1    |             value: 9856
      om2_1    |           }
      om2_1    |           ports {
      om2_1    |             name: "STANDALONE"
      om2_1    |             value: 9859
      om2_1    |           }
      om2_1    |           networkName: "b92bf4c8-3b0c-40b0-bb2b-05b6d3594e13"
      om2_1    |           networkLocation: "/default-rack"
      om2_1    |           persistedOpState: IN_SERVICE
      om2_1    |           persistedOpStateExpiry: 0
      om2_1    |           uuid128 {
      om2_1    |             mostSigBits: -5103716611873029968
      om2_1    |             leastSigBits: -4959864281830437357
      om2_1    |           }
      om2_1    |         }
      om2_1    |         members {
      om2_1    |           uuid: "f0b7e615-d4ee-4ec4-a6b5-ec68b82c07e9"
      om2_1    |           ipAddress: "10.9.0.15"
      om2_1    |           hostName: "ha_dn1_1.ha_net"
      om2_1    |           ports {
      om2_1    |             name: "REPLICATION"
      om2_1    |             value: 9886
      om2_1    |           }
      om2_1    |           ports {
      om2_1    |             name: "RATIS"
      om2_1    |             value: 9858
      om2_1    |           }
      om2_1    |           ports {
      om2_1    |             name: "RATIS_ADMIN"
      om2_1    |             value: 9857
      om2_1    |           }
      om2_1    |           ports {
      om2_1    |             name: "RATIS_SERVER"
      om2_1    |             value: 9856
      om2_1    |           }
      om2_1    |           ports {
      om2_1    |             name: "STANDALONE"
      om2_1    |             value: 9859
      om2_1    |           }
      om2_1    |           networkName: "f0b7e615-d4ee-4ec4-a6b5-ec68b82c07e9"
      om2_1    |           networkLocation: "/default-rack"
      om2_1    |           persistedOpState: IN_SERVICE
      om2_1    |           persistedOpStateExpiry: 0
      om2_1    |           uuid128 {
      om2_1    |             mostSigBits: -1101158602427707708
      om2_1    |             leastSigBits: -6433976558118238231
      om2_1    |           }
      om2_1    |         }
      om2_1    |         members {
      om2_1    |           uuid: "c7912312-811d-469d-8c40-c739cd2a1e62"
      om2_1    |           ipAddress: "10.9.0.17"
      om2_1    |           hostName: "ha_dn3_1.ha_net"
      om2_1    |           ports {
      om2_1    |             name: "REPLICATION"
      om2_1    |             value: 9886
      om2_1    |           }
      om2_1    |           ports {
      om2_1    |             name: "RATIS"
      om2_1    |             value: 9858
      om2_1    |           }
      om2_1    |           ports {
      om2_1    |             name: "RATIS_ADMIN"
      om2_1    |             value: 9857
      om2_1    |           }
      om2_1    |           ports {
      om2_1    |             name: "RATIS_SERVER"
      om2_1    |             value: 9856
      om2_1    |           }
      om2_1    |           ports {
      om2_1    |             name: "STANDALONE"
      om2_1    |             value: 9859
      om2_1    |           }
      om2_1    |           networkName: "c7912312-811d-469d-8c40-c739cd2a1e62"
      om2_1    |           networkLocation: "/default-rack"
      om2_1    |           persistedOpState: IN_SERVICE
      om2_1    |           persistedOpStateExpiry: 0
      om2_1    |           uuid128 {
      om2_1    |             mostSigBits: -4066430426156284259
      om2_1    |             leastSigBits: -8340447458821005726
      om2_1    |           }
      om2_1    |         }
      om2_1    |         state: PIPELINE_OPEN
      om2_1    |         type: RATIS
      om2_1    |         factor: THREE
      om2_1    |         id {
      om2_1    |           id: "c0b6f272-9a39-4dc3-ada8-c3b833cc6e17"
      om2_1    |           uuid128 {
      om2_1    |             mostSigBits: -4560190998638408253
      om2_1    |             leastSigBits: -5933277313150194153
      om2_1    |           }
      om2_1    |         }
      om2_1    |         leaderID: "f0b7e615-d4ee-4ec4-a6b5-ec68b82c07e9"
      om2_1    |         creationTimeStamp: 1643924132125
      om2_1    |         suggestedLeaderID {
      om2_1    |           mostSigBits: -1101158602427707708
      om2_1    |           leastSigBits: -6433976558118238231
      om2_1    |         }
      om2_1    |         leaderID128 {
      om2_1    |           mostSigBits: -1101158602427707708
      om2_1    |           leastSigBits: -6433976558118238231
      om2_1    |         }
      om2_1    |       }
      om2_1    |       partNumber: 0
      om2_1    |     }
      om2_1    |     isMultipartKey: false
      om2_1    |     acls {
      om2_1    |       type: USER
      om2_1    |       name: "dlfknslnfslf"
      om2_1    |       rights: "\200"
      om2_1    |       aclScope: ACCESS
      om2_1    |     }
      om2_1    |     modificationTime: 1643924174840
      om2_1    |   }
      om2_1    |   clientID: 107736214722445312
      om2_1    | }
      om2_1    | failed with exception
      om2_1    | java.lang.IllegalArgumentException
      om2_1    | 	at com.google.common.base.Preconditions.checkArgument(Preconditions.java:128)
      om2_1    | 	at org.apache.hadoop.ozone.om.helpers.OmKeyInfo.<init>(OmKeyInfo.java:81)
      om2_1    | 	at org.apache.hadoop.ozone.om.helpers.OmKeyInfo$Builder.build(OmKeyInfo.java:378)
      om2_1    | 	at org.apache.hadoop.ozone.om.helpers.OmKeyInfo.getFromProtobuf(OmKeyInfo.java:460)
      om2_1    | 	at org.apache.hadoop.ozone.om.codec.OmKeyInfoCodec.fromPersistedFormat(OmKeyInfoCodec.java:59)
      om2_1    | 	at org.apache.hadoop.ozone.om.codec.OmKeyInfoCodec.fromPersistedFormat(OmKeyInfoCodec.java:36)
      om2_1    | 	at org.apache.hadoop.hdds.utils.db.CodecRegistry.asObject(CodecRegistry.java:55)
      om2_1    | 	at org.apache.hadoop.hdds.utils.db.TypedTable.getFromTableIfExist(TypedTable.java:261)
      om2_1    | 	at org.apache.hadoop.hdds.utils.db.TypedTable.getIfExist(TypedTable.java:248)
      om2_1    | 	at org.apache.hadoop.ozone.om.request.key.OMKeyCreateRequest.validateAndUpdateCache(OMKeyCreateRequest.java:236)
      om2_1    | 	at org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleWriteRequest(OzoneManagerRequestHandler.java:227)
      om2_1    | 	at org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.runCommand(OzoneManagerStateMachine.java:415)
      om2_1    | 	at org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.lambda$applyTransaction$1(OzoneManagerStateMachine.java:240)
      om2_1    | 	at java.base/java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1700)
      om2_1    | 	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
      om2_1    | 	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
      om2_1    | 	at java.base/java.lang.Thread.run(Thread.java:834)
      om2_1    | 2022-02-03 21:36:15,253 [shutdown-hook-0] INFO om.OzoneManagerStarter: SHUTDOWN_MSG:
      om2_1    | /************************************************************
      om2_1    | SHUTDOWN_MSG: Shutting down OzoneManager at a250845831a7/10.9.0.12
      om2_1    | ************************************************************/
      

      This is where the OM crashes in the Ozone 1.1.0 code:

      https://github.com/apache/ozone/blob/ozone-1.1.0/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/helpers/OmKeyInfo.java#L81-L82
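
      For context, the check at those lines is a sanity check in the OmKeyInfo constructor over the list of key location version groups: as far as I can tell, it asserts that the newest group's version number equals its index in the list, and a bare Preconditions.checkArgument(boolean) throws an IllegalArgumentException with no message, which matches the stack trace above. The standalone sketch below (illustrative names only, not the actual Ozone classes) shows how a version list whose numbering no longer lines up with its indices would trip such a check while deserializing the key; whether the 1.3.0 code actually persists such a layout is part of what needs to be confirmed.

      import com.google.common.base.Preconditions;

      import java.util.Arrays;
      import java.util.List;

      public class KeyVersionCheckSketch {

        /** Illustrative stand-in for one key location version group. */
        static class LocationGroup {
          private final long version;

          LocationGroup(long version) {
            this.version = version;
          }

          long getVersion() {
            return version;
          }
        }

        /**
         * Mimics the kind of sanity check applied when a key is rebuilt from its
         * persisted form: the newest version group's number must equal its index.
         */
        static void checkVersions(List<LocationGroup> versions) {
          long latestIndex = versions.size() - 1;
          // checkArgument(boolean) throws IllegalArgumentException without a message,
          // matching the bare "java.lang.IllegalArgumentException" in the OM log.
          Preconditions.checkArgument(
              versions.get((int) latestIndex).getVersion() == latestIndex);
        }

        public static void main(String[] args) {
          // Versions numbered 0..n-1 and stored in order: the check passes.
          checkVersions(Arrays.asList(new LocationGroup(0), new LocationGroup(1)));

          // A single surviving group numbered 1 sitting at index 0 (for example, if a
          // newer OM dropped the previous version on overwrite): the check fails, and
          // on the OM write path this failure terminates the process.
          checkVersions(Arrays.asList(new LocationGroup(1)));
        }
      }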


            People

              Assignee: Siyao Meng (smeng)
              Reporter: Siyao Meng (smeng)
              Votes: 0
              Watchers: 5
