Uploaded image for project: 'Apache Hudi'
  1. Apache Hudi
  2. HUDI-4825

Commit metadata in JSON contains redundant information

    XML Word Printable JSON

Details

    • Bug
    • Status: Closed
    • Critical
    • Resolution: Fixed
    • None
    • 0.12.1
    • None
    • 1

    Description

      The JSON commit metadata (*.commit, *.deltacommit) written to the Hudi timeline under .hoodie contains redundant fields that can be trimmed. As shown below, the same set of write stats is written to both "partitionToWriteStats" and "writeStats", doubling the size and increasing the serialization/deserialization (serde) overhead. Other fields such as "totalRecordsDeleted", "writePartitionPaths", and "fileIdAndRelativePaths" can be removed as well, since they are derived from "partitionToWriteStats" and are not directly used by the HoodieCommitMetadata class.

      Example commit metadata:

       

      {
        "partitionToWriteStats" : {
          "2022/1/31" : [ {
            "fileId" : "0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0",
            "path" : "2022/1/31/0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0_0-9-38_20220410134618909.parquet",
            "prevCommit" : "20220410134320333",
            "numWrites" : 250175,
            "numDeletes" : 0,
            "numUpdateWrites" : 0,
            "numInserts" : 50035,
            "totalWriteBytes" : 90720802,
            "totalWriteErrors" : 0,
            "tempPath" : null,
            "partitionPath" : "2022/1/31",
            "totalLogRecords" : 0,
            "totalLogFilesCompacted" : 0,
            "totalLogSizeCompacted" : 0,
            "totalUpdatedRecordsCompacted" : 0,
            "totalLogBlocks" : 0,
            "totalCorruptLogBlock" : 0,
            "totalRollbackBlocks" : 0,
            "fileSizeInBytes" : 90720802,
            "minEventTime" : null,
            "maxEventTime" : null
          } ],
          ...
        },
        "compacted" : false,
        "extraMetadata" : {
          "schema" : "{\"type\":\"record\",\"name\":\"hoodie_source\",\"namespace\":\"hoodie.source\",\"fields\":[{\"name\":\"key\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"partition\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"ts\",\"type\":[\"null\",\"long\"],\"default\":null},{\"name\":\"textField\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"decimalField\",\"type\":[\"null\",\"float\"],\"default\":null},{\"name\":\"longField\",\"type\":[\"null\",\"long\"],\"default\":null},{\"name\":\"arrayField\",\"type\":[\"null\",{\"type\":\"array\",\"items\":[\"int\",\"null\"]}],\"default\":null},{\"name\":\"mapField\",\"type\":[\"null\",{\"type\":\"map\",\"values\":[\"int\",\"null\"]}],\"default\":null},{\"name\":\"round\",\"type\":[\"null\",\"int\"],\"default\":null}]}",
          "deltastreamer.checkpoint.key" : "17"
        },
        "operationType" : "INSERT",
        "writeStats" : [ {
          "fileId" : "0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0",
          "path" : "2022/1/31/0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0_0-9-38_20220410134618909.parquet",
          "prevCommit" : "20220410134320333",
          "numWrites" : 250175,
          "numDeletes" : 0,
          "numUpdateWrites" : 0,
          "numInserts" : 50035,
          "totalWriteBytes" : 90720802,
          "totalWriteErrors" : 0,
          "tempPath" : null,
          "partitionPath" : "2022/1/31",
          "totalLogRecords" : 0,
          "totalLogFilesCompacted" : 0,
          "totalLogSizeCompacted" : 0,
          "totalUpdatedRecordsCompacted" : 0,
          "totalLogBlocks" : 0,
          "totalCorruptLogBlock" : 0,
          "totalRollbackBlocks" : 0,
          "fileSizeInBytes" : 90720802,
          "minEventTime" : null,
          "maxEventTime" : null
        }, 
        ... 
        ],
        "totalRecordsDeleted" : 0,
        "totalLogFilesSize" : 0,
        "totalScanTime" : 0,
        "totalCreateTime" : 0,
        "totalUpsertTime" : 309120,
        "minAndMaxEventTime" : {
          "Optional.empty" : {
            "val" : null,
            "present" : false
          }
        },
        "writePartitionPaths" : [ "2022/1/31", "2022/1/30", "2022/1/28", "2022/1/27", "2022/2/2", "2022/1/29", "2022/1/24", "2022/2/1", "2022/1/26", "2022/1/25" ],
        "fileIdAndRelativePaths" : {
          "3e31414c-fb4c-4ce9-aa27-a43640d94430-0" : "2022/1/25/3e31414c-fb4c-4ce9-aa27-a43640d94430-0_9-9-47_20220410134618909.parquet",
          ...
        },
        "totalLogRecordsCompacted" : 0,
        "totalLogFilesCompacted" : 0,
        "totalCompactedRecordsUpdated" : 0
      } 

       

       

      Attachments

        Issue Links

          Activity

            People

              guoyihua Ethan Guo
              guoyihua Ethan Guo
              Votes:
              0 Vote for this issue
              Watchers:
              1 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: