Apache Hudi / HUDI-3549

Investigate Spark 3 read issues with the Hudi Spark 3.2 bundle on an S3 dataset


Description
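
      The quickstart write below fails at upsert time: the bundle's shaded AvroSerializer throws a NoSuchMethodError for DataSourceUtils.createDateRebaseFuncInWrite, which points to a binary incompatibility between the Spark version the bundle was compiled against and the Spark version running on the cluster.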

      scala> df.write.format("hudi").
           |   options(getQuickstartWriteConfigs).
           |   option(PRECOMBINE_FIELD_OPT_KEY, "ts").
           |   option(RECORDKEY_FIELD_OPT_KEY, "uuid").
           |   option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
           |   option(TABLE_NAME, tableName).
           |   mode(Overwrite).
           |   save(basePath)
      warning: one deprecation; for details, enable `:setting -deprecation' or `:replay -deprecation'
      2022-03-02 14:57:00,922 WARN config.DFSPropertiesConfiguration: Cannot find HUDI_CONF_DIR, please set it as the dir of hudi-defaults.conf
      2022-03-02 14:57:00,930 WARN config.DFSPropertiesConfiguration: Properties file file:/etc/hudi/conf/hudi-defaults.conf not found. Ignoring to load props file
      2022-03-02 14:57:00,947 WARN hudi.HoodieSparkSqlWriter$: hoodie table at /tmp/hudi_trips_cow already exists. Deleting existing data & overwriting with new data.
      2022-03-02 14:57:01,523 WARN metadata.HoodieBackedTableMetadata: Metadata table was not found at path /tmp/hudi_trips_cow/.hoodie/metadata
      2022-03-02 14:57:10,929 WARN scheduler.TaskSetManager: Lost task 1.0 in stage 15.0 (TID 15) (ip-172-31-47-53.us-east-2.compute.internal executor 2): java.lang.NoSuchMethodError: org.apache.spark.sql.execution.datasources.DataSourceUtils$.createDateRebaseFuncInWrite(Lscala/Enumeration$Value;Ljava/lang/String;)Lscala/Function1;
      	at org.apache.hudi.spark.org.apache.spark.sql.avro.AvroSerializer.<init>(AvroSerializer.scala:64)
      	at org.apache.hudi.spark.org.apache.spark.sql.avro.AvroSerializer.<init>(AvroSerializer.scala:56)
      	at org.apache.hudi.spark.org.apache.spark.sql.avro.HoodieAvroSerializer.<init>(HoodieAvroSerializer.scala:26)
      	at org.apache.spark.sql.adapter.Spark3Adapter.createAvroSerializer(Spark3Adapter.scala:47)
      	at org.apache.hudi.AvroConversionUtils$.$anonfun$createInternalRowToAvroConverter$1(AvroConversionUtils.scala:79)
      	at org.apache.hudi.HoodieSparkUtils$.$anonfun$createRdd$5(HoodieSparkUtils.scala:166)
      	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
      	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
      	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
      	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:199)
      	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
      	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
      	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
      	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
      	at org.apache.spark.scheduler.Task.run(Task.scala:131)
      	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
      	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
      	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
      	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
      	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
      	at java.lang.Thread.run(Thread.java:750)
      
      
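      The executor-side NoSuchMethodError then surfaces on the driver as a HoodieUpsertException that aborts the commit:
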
      2022-03-02 14:57:12,923 ERROR scheduler.TaskSetManager: Task 1 in stage 15.0 failed 4 times; aborting job
      org.apache.hudi.exception.HoodieUpsertException: Failed to upsert for commit time 20220302145700945
        at org.apache.hudi.table.action.commit.BaseWriteHelper.write(BaseWriteHelper.java:64)
        at org.apache.hudi.table.action.commit.SparkUpsertCommitActionExecutor.execute(SparkUpsertCommitActionExecutor.java:46)
        at org.apache.hudi.table.HoodieSparkCopyOnWriteTable.upsert(HoodieSparkCopyOnWriteTable.java:121)
        at org.apache.hudi.table.HoodieSparkCopyOnWriteTable.upsert(HoodieSparkCopyOnWriteTable.java:105)
        at org.apache.hudi.client.SparkRDDWriteClient.upsert(SparkRDDWriteClient.java:159)
        at org.apache.hudi.DataSourceUtils.doWriteOperation(DataSourceUtils.java:218)
        at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:289)
        at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:162)
        at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
        at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:110)
        at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
        at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
        at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
        at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
        at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
        at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:110)
        at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:106)
        at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
        at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
        at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
        at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
        at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
        at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
        at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
        at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:106)
        at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:93)
        at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:91)
        at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:128)
        at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:848)
        at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:382)
        at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:303)
        at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
        ... 66 elided 
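
      One quick way to confirm the mismatch is to ask the running Spark, via plain Java reflection, which overloads of createDateRebaseFuncInWrite it actually ships. A minimal sketch to paste into the same spark-shell (the class name is taken verbatim from the stack trace above; nothing else is assumed):

          // The shaded AvroSerializer expects a method with descriptor
          //   (scala.Enumeration$Value, String) => scala.Function1
          // on the DataSourceUtils object. Print what the runtime really has.
          val moduleClass = Class.forName(
            "org.apache.spark.sql.execution.datasources.DataSourceUtils$")
          moduleClass.getDeclaredMethods
            .filter(_.getName == "createDateRebaseFuncInWrite")
            .foreach(m => println(m.toGenericString))

      If none of the printed signatures matches the one in the NoSuchMethodError, the Spark 3.2 bundle is running against a Spark build (e.g. a 3.0/3.1 runtime, or a vendor-patched 3.2) whose DataSourceUtils differs from the one the bundle was compiled against; aligning the bundle with the exact Spark version installed on the cluster is the usual fix.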

            People

    Assignee: Alexey Kudinkin
    Reporter: sivabalan narayanan
    Ethan Guo, Raymond Xu