Uploaded image for project: 'Apache Hudi'
  1. Apache Hudi
  2. HUDI-453

"Failed to archive commits" error is thrown when writing data to a MOR/COW table

    XML | Word | Printable | JSON

Details

    Description

      A "Failed to archive commits" error is thrown when writing data to a table. Here are the steps to reproduce.

      1. Build from the latest source

      mvn clean package -DskipTests -DskipITs -Dcheckstyle.skip=true -Drat.skip=true
      

      2. Write Data

      export SPARK_HOME=/work/BigData/install/spark/spark-2.3.3-bin-hadoop2.6
      ${SPARK_HOME}/bin/spark-shell --jars `ls packaging/hudi-spark-bundle/target/hudi-spark-bundle-*.*.*-SNAPSHOT.jar` --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
      
      import org.apache.spark.sql.SaveMode._
      
      var datas = List("{ \"name\": \"kenken\", \"ts\": 1574297893836, \"age\": 12, \"location\": \"latitude\"}")
      val df = spark.read.json(spark.sparkContext.parallelize(datas, 2))
      df.write.format("org.apache.hudi").
          option("hoodie.insert.shuffle.parallelism", "10").
          option("hoodie.upsert.shuffle.parallelism", "10").
          option("hoodie.delete.shuffle.parallelism", "10").
          option("hoodie.bulkinsert.shuffle.parallelism", "10").
          option("hoodie.datasource.write.recordkey.field", "name").
          option("hoodie.datasource.write.partitionpath.field", "location").
          option("hoodie.datasource.write.precombine.field", "ts").
          option("hoodie.table.name", "hudi_mor_table").
          mode(Overwrite).
          save("file:///tmp/hudi_mor_table")
      

      3. Append Data

      df.write.format("org.apache.hudi").
          option("hoodie.insert.shuffle.parallelism", "10").
          option("hoodie.upsert.shuffle.parallelism", "10").
          option("hoodie.delete.shuffle.parallelism", "10").
          option("hoodie.bulkinsert.shuffle.parallelism", "10").
          option("hoodie.datasource.write.recordkey.field", "name").
          option("hoodie.datasource.write.partitionpath.field", "location").
          option("hoodie.datasource.write.precombine.field", "ts").
          option("hoodie.keep.max.commits", "5").
          option("hoodie.keep.min.commits", "4").
          option("hoodie.cleaner.commits.retained", "3").
          option("hoodie.table.name", "hudi_mor_table").
          mode(Append).
          save("file:///tmp/hudi_mor_table")
      
      

      4. Repeat the Append Data operation (above) about six times; you will then get the following stack trace

      19/12/23 01:30:48 ERROR HoodieCommitArchiveLog: Failed to archive commits, .commit file: 20191224004558.clean.requested
      java.io.IOException: Not an Avro data file
      at org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:50)
      at org.apache.hudi.common.util.AvroUtils.deserializeAvroMetadata(AvroUtils.java:147)
      at org.apache.hudi.common.util.CleanerUtils.getCleanerPlan(CleanerUtils.java:88)
      at org.apache.hudi.io.HoodieCommitArchiveLog.convertToAvroRecord(HoodieCommitArchiveLog.java:294)
      at org.apache.hudi.io.HoodieCommitArchiveLog.archive(HoodieCommitArchiveLog.java:253)
      at org.apache.hudi.io.HoodieCommitArchiveLog.archiveIfRequired(HoodieCommitArchiveLog.java:122)
      at org.apache.hudi.HoodieWriteClient.commit(HoodieWriteClient.java:562)
      at org.apache.hudi.HoodieWriteClient.commit(HoodieWriteClient.java:523)
      at org.apache.hudi.HoodieWriteClient.commit(HoodieWriteClient.java:514)
      at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:159)
      at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:91)
      at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
      at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
      at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
      at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
      at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
      at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
      at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
      at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
      at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
      at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
      at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
      at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
      at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:656)
      at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:656)
      at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
      at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:656)
      at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:273)
      at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:267)
      at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:225)
      

      Attachments

        Issue Links

          Activity

            People

              lamber-ken lamber-ken
              lamber-ken lamber-ken
              Votes:
              0 Vote for this issue
              Watchers:
              3 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved:

                Time Tracking

                  Estimated:
                  Original Estimate - Not Specified
                  Not Specified
                  Remaining:
                  Remaining Estimate - 0h
                  0h
                  Logged:
                  Time Spent - 20m
                  20m