Description
Stack Trace
Py4JJavaError Traceback (most recent call last)
<ipython-input-13-c35b9cad36ad> in <module>()
----> 1 sdf = sql.createDataFrame(df)

/opt/spark2/python/pyspark/sql/context.py in createDataFrame(self, data, schema, samplingRatio, verifySchema)
    307             Py4JJavaError: ...
    308             """
--> 309         return self.sparkSession.createDataFrame(data, schema, samplingRatio, verifySchema)
    310
    311     @since(1.3)

/opt/spark2/python/pyspark/sql/session.py in createDataFrame(self, data, schema, samplingRatio, verifySchema)
    524         rdd, schema = self._createFromLocal(map(prepare, data), schema)
    525         jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
--> 526         jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
    527         df = DataFrame(jdf, self._wrapped)
    528         df._schema = schema

/opt/spark2/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1131         answer = self.gateway_client.send_command(command)
   1132         return_value = get_return_value(
-> 1133             answer, self.gateway_client, self.target_id, self.name)
   1134
   1135         for temp_arg in temp_args:

/opt/spark2/python/pyspark/sql/utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

/opt/spark2/python/lib/py4j-0.10.3-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    317                 raise Py4JJavaError(
    318                     "An error occurred while calling {0}{1}{2}.\n".
--> 319                     format(target_id, ".", name), value)
    320             else:
    321                 raise Py4JError(

Py4JJavaError: An error occurred while calling o47.applySchemaToPythonRDD.
: org.apache.spark.SparkException: Keytab file: .keytab-f0b9b814-460e-4fa8-8e7d-029186b696c4 specified in spark.yarn.keytab does not exist
	at org.apache.spark.sql.hive.client.HiveClientImpl.<init>(HiveClientImpl.scala:113)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
	at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:258)
	at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:359)
	at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:263)
	at org.apache.spark.sql.hive.HiveSharedState.metadataHive$lzycompute(HiveSharedState.scala:39)
	at org.apache.spark.sql.hive.HiveSharedState.metadataHive(HiveSharedState.scala:38)
	at org.apache.spark.sql.hive.HiveSharedState.externalCatalog$lzycompute(HiveSharedState.scala:46)
	at org.apache.spark.sql.hive.HiveSharedState.externalCatalog(HiveSharedState.scala:45)
	at org.apache.spark.sql.hive.HiveSessionState.catalog$lzycompute(HiveSessionState.scala:50)
	at org.apache.spark.sql.hive.HiveSessionState.catalog(HiveSessionState.scala:48)
	at org.apache.spark.sql.hive.HiveSessionState$$anon$1.<init>(HiveSessionState.scala:63)
	at org.apache.spark.sql.hive.HiveSessionState.analyzer$lzycompute(HiveSessionState.scala:63)
	at org.apache.spark.sql.hive.HiveSessionState.analyzer(HiveSessionState.scala:62)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:49)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
	at org.apache.spark.sql.SparkSession.applySchemaToPythonRDD(SparkSession.scala:666)
	at org.apache.spark.sql.SparkSession.applySchemaToPythonRDD(SparkSession.scala:656)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Steps to reproduce
1. Pass a valid --principal=user@REALM and --keytab=/home/user/.keytab to spark-submit
2. Set spark.sql.catalogImplementation=hive
3. Run on YARN in client deploy mode (yarn-client)
4. Create a SparkSession and call session.createDataFrame() (see the reproduction sketch after this list)
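A minimal reproduction sketch, assuming a driver script named repro.py and the principal/keytab from step 1 (names and paths are placeholders; the dataframe contents don't matter):

# Assumed invocation, matching steps 1-3:
#   spark-submit --master yarn --deploy-mode client \
#     --principal user@REALM --keytab /home/user/.keytab \
#     --conf spark.sql.catalogImplementation=hive repro.py
import pandas as pd
from pyspark.sql import SparkSession

# enableHiveSupport() is equivalent to setting
# spark.sql.catalogImplementation=hive (step 2).
spark = SparkSession.builder.enableHiveSupport().getOrCreate()

# Any local data will do: the failure happens while the Hive-backed
# catalog is initialized lazily on first use, not because of the data.
df = pd.DataFrame({"a": [1, 2, 3]})
sdf = spark.createDataFrame(df)  # raises the Py4JJavaError shown above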
Observations
- The setupCredentials function in Client.scala sets spark.yarn.keytab to a UUID-suffixed version of the base keytab filename, without any path. For example, sparkContext.getConf().getAll() shows spark.yarn.keytab with the value .keytab-f0b9b814-460e-4fa8-8e7d-029186b696c4 (see the inspection sketch after this list).
- When listing the contents of the application staging directory on HDFS, no suffixed file exists; the keytab appears under its original name. For instance, hdfs dfs -ls hdfs://home/user/.sparkStaging/application_big_uuid/ shows an entry hdfs://home/user/.sparkStaging/application_big_uuid/.keytab, but no hdfs://home/user/.sparkStaging/application_big_uuid/.keytab-big-uuid.
- The same exception occurs even after manually copying the keytab into the HDFS staging directory under a filename matching the new value of spark.yarn.keytab.
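A sketch of how the first two observations can be checked from the PySpark driver. The staging path is a placeholder (substitute the real application id), and going through py4j to the Hadoop FileSystem API is just one way to list the directory without shelling out to hdfs dfs:

# 1. Inspect the rewritten property; prints something like
#    [('spark.yarn.keytab', '.keytab-f0b9b814-460e-4fa8-8e7d-029186b696c4')]
print([kv for kv in spark.sparkContext.getConf().getAll()
       if kv[0] == "spark.yarn.keytab"])

# 2. List the YARN staging directory through the JVM's Hadoop
#    FileSystem API (placeholder path).
jvm = spark.sparkContext._jvm
fs = jvm.org.apache.hadoop.fs.FileSystem.get(
    spark.sparkContext._jsc.hadoopConfiguration())
staging = jvm.org.apache.hadoop.fs.Path(
    "hdfs://home/user/.sparkStaging/application_big_uuid/")
for status in fs.listStatus(staging):
    print(status.getPath())  # shows .keytab, but no .keytab-<uuid>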
Expected Behavior
HiveClientImpl should be able to read spark.yarn.keytab to find the keytab file and initialize itself properly.
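In other words, the driver-side equivalent of the following check should succeed. This is a Python restatement of the existence test that HiveClientImpl appears to perform, judging by the exception message; it is illustrative, not the actual Scala code:

import os

keytab = spark.sparkContext.getConf().get("spark.yarn.keytab", None)
# Today this fails in yarn-client mode: the property holds only the
# UUID-suffixed basename, which exists neither locally nor on HDFS.
assert keytab and os.path.exists(keytab), (
    "Keytab file: %s specified in spark.yarn.keytab does not exist" % keytab)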
References
SPARK-8619 also noted trouble with the keytab property being changed after application startup.
Issue Links
- is related to: SPARK-19588 Allow putting keytab file to HDFS location specified in spark.yarn.keytab (Resolved)