Details
-
Bug
-
Status: Resolved
-
Major
-
Resolution: Incomplete
-
2.4.5
-
None
-
None
Description
Code:
spark = SparkSession.builder \ .master("yarn") \ .enableHiveSupport() \ .getOrCreate() test_data_path = 's3a://ph-stream/common/public/prod/15' test_data_df = spark.read.parquet(test_data_path).limit(1) test_data_df.write .mode('overwrite') .option('path', test_data_df) .saveAsTable('prod15')
Error Info:
2020-09-03 14:31:47,900 WARN util.Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf. Traceback (most recent call last): File "main.py", line 82, in <module> test_data_df.write.mode('overwrite').option('path', test_data_path).saveAsTable('prod15') File "/usr/local/lib/python3.5/dist-packages/pyspark/sql/readwriter.py", line 778, in saveAsTable self._jwrite.saveAsTable(name) File "/usr/local/lib/python3.5/dist-packages/py4j/java_gateway.py", line 1257, in __call__ answer, self.gateway_client, self.target_id, self.name) File "/usr/local/lib/python3.5/dist-packages/pyspark/sql/utils.py", line 63, in deco return f(*a, **kw) File "/usr/local/lib/python3.5/dist-packages/py4j/protocol.py", line 328, in get_return_value format(target_id, ".", name), value) py4j.protocol.Py4JJavaError: An error occurred while calling o102.saveAsTable. : java.lang.NoClassDefFoundError: parquet/hadoop/ParquetOutputFormat at java.lang.Class.forName0(Native Method) at java.lang.Class.forName(Class.java:348) at org.apache.spark.util.Utils$.classForName(Utils.scala:238) at org.apache.spark.sql.hive.client.HiveClientImpl$.org$apache$spark$sql$hive$client$HiveClientImpl$$toOutputFormat(HiveClientImpl.scala:915) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$toHiveTable$8.apply(HiveClientImpl.scala:949) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$toHiveTable$8.apply(HiveClientImpl.scala:949) at scala.Option.map(Option.scala:146) at org.apache.spark.sql.hive.client.HiveClientImpl$.toHiveTable(HiveClientImpl.scala:949) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createTable$1.apply$mcV$sp(HiveClientImpl.scala:484) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createTable$1.apply(HiveClientImpl.scala:482) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createTable$1.apply(HiveClientImpl.scala:482) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$withHiveState$1.apply(HiveClientImpl.scala:277) at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:215) at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:214) at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:260) at org.apache.spark.sql.hive.client.HiveClientImpl.createTable(HiveClientImpl.scala:482) at org.apache.spark.sql.hive.HiveExternalCatalog.saveTableIntoHive(HiveExternalCatalog.scala:499) at org.apache.spark.sql.hive.HiveExternalCatalog.org$apache$spark$sql$hive$HiveExternalCatalog$$createDataSourceTable(HiveExternalCatalog.scala:387) at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createTable$1.apply$mcV$sp(HiveExternalCatalog.scala:263) at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createTable$1.apply(HiveExternalCatalog.scala:236) at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createTable$1.apply(HiveExternalCatalog.scala:236) at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:97) at org.apache.spark.sql.hive.HiveExternalCatalog.createTable(HiveExternalCatalog.scala:236) at org.apache.spark.sql.catalyst.catalog.ExternalCatalogWithListener.createTable(ExternalCatalogWithListener.scala:94) at org.apache.spark.sql.catalyst.catalog.SessionCatalog.createTable(SessionCatalog.scala:324) at org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand.run(createDataSourceTables.scala:185) at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104) at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102) at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127) at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152) at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127) at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:83) at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:81) at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676) at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676) at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80) at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75) at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676) at org.apache.spark.sql.DataFrameWriter.createTable(DataFrameWriter.scala:474) at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:453) at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:409) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:282) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:238) at java.lang.Thread.run(Thread.java:748) Caused by: java.lang.ClassNotFoundException: parquet.hadoop.ParquetOutputFormat at java.net.URLClassLoader.findClass(URLClassLoader.java:382) at java.lang.ClassLoader.loadClass(ClassLoader.java:418) at org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1.doLoadClass(IsolatedClientLoader.scala:226) at org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1.loadClass(IsolatedClientLoader.scala:215) at java.lang.ClassLoader.loadClass(ClassLoader.java:351) ... 57 more