Details
-
Bug
-
Status: Closed
-
Major
-
Resolution: Fixed
-
None
Description
A "Failed to archive commits" error is thrown when writing data to a table. Here are the steps to reproduce.
1, Build from latest source
mvn clean package -DskipTests -DskipITs -Dcheckstyle.skip=true -Drat.skip=true
2, Write Data
export SPARK_HOME=/work/BigData/install/spark/spark-2.3.3-bin-hadoop2.6 ${SPARK_HOME}/bin/spark-shell --jars `ls packaging/hudi-spark-bundle/target/hudi-spark-bundle-*.*.*-SNAPSHOT.jar` --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' import org.apache.spark.sql.SaveMode._ var datas = List("{ \"name\": \"kenken\", \"ts\": 1574297893836, \"age\": 12, \"location\": \"latitude\"}") val df = spark.read.json(spark.sparkContext.parallelize(datas, 2)) df.write.format("org.apache.hudi"). option("hoodie.insert.shuffle.parallelism", "10"). option("hoodie.upsert.shuffle.parallelism", "10"). option("hoodie.delete.shuffle.parallelism", "10"). option("hoodie.bulkinsert.shuffle.parallelism", "10"). option("hoodie.datasource.write.recordkey.field", "name"). option("hoodie.datasource.write.partitionpath.field", "location"). option("hoodie.datasource.write.precombine.field", "ts"). option("hoodie.table.name", "hudi_mor_table"). mode(Overwrite). save("file:///tmp/hudi_mor_table")
3, Append Data
df.write.format("org.apache.hudi"). option("hoodie.insert.shuffle.parallelism", "10"). option("hoodie.upsert.shuffle.parallelism", "10"). option("hoodie.delete.shuffle.parallelism", "10"). option("hoodie.bulkinsert.shuffle.parallelism", "10"). option("hoodie.datasource.write.recordkey.field", "name"). option("hoodie.datasource.write.partitionpath.field", "location"). option("hoodie.datasource.write.precombine.field", "ts"). option("hoodie.keep.max.commits", "5"). option("hoodie.keep.min.commits", "4"). option("hoodie.cleaner.commits.retained", "3"). option("hoodie.table.name", "hudi_mor_table"). mode(Append). save("file:///tmp/hudi_mor_table")
4, Repeat the Append Data operation (above) about six times, and you will get the following stack trace
19/12/23 01:30:48 ERROR HoodieCommitArchiveLog: Failed to archive commits, .commit file: 20191224004558.clean.requested java.io.IOException: Not an Avro data file at org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:50) at org.apache.hudi.common.util.AvroUtils.deserializeAvroMetadata(AvroUtils.java:147) at org.apache.hudi.common.util.CleanerUtils.getCleanerPlan(CleanerUtils.java:88) at org.apache.hudi.io.HoodieCommitArchiveLog.convertToAvroRecord(HoodieCommitArchiveLog.java:294) at org.apache.hudi.io.HoodieCommitArchiveLog.archive(HoodieCommitArchiveLog.java:253) at org.apache.hudi.io.HoodieCommitArchiveLog.archiveIfRequired(HoodieCommitArchiveLog.java:122) at org.apache.hudi.HoodieWriteClient.commit(HoodieWriteClient.java:562) at org.apache.hudi.HoodieWriteClient.commit(HoodieWriteClient.java:523) at org.apache.hudi.HoodieWriteClient.commit(HoodieWriteClient.java:514) at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:159) at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:91) at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45) at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70) at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68) at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127) at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152) at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127) at 
org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80) at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80) at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:656) at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:656) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77) at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:656) at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:273) at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:267) at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:225)
Attachments
Issue Links
- links to