Details
Type: Bug
Status: Resolved
Priority: Major
Resolution: Fixed
Affects Version/s: 2.0.2, 2.1.1
Description
Motivation:
In my case I want to disable the filesystem cache so that I can change S3's access key and secret key on the fly and read from buckets with different permissions. This works perfectly fine for RDDs but does not work for DataFrames.
Example (works for RDD but fails for DataFrame):
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object SimpleApp {
  def main(args: Array[String]): Unit = {
    val awsAccessKeyId = "something"
    val awsSecretKey = "something else"

    val conf = new SparkConf().setAppName("Simple Application").setMaster("local[*]")
    val sc = new SparkContext(conf)
    sc.hadoopConfiguration.set("fs.s3.awsAccessKeyId", awsAccessKeyId)
    sc.hadoopConfiguration.set("fs.s3.awsSecretAccessKey", awsSecretKey)
    sc.hadoopConfiguration.setBoolean("fs.s3.impl.disable.cache", true)
    sc.hadoopConfiguration.set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
    sc.hadoopConfiguration.set("fs.s3.buffer.dir", "/tmp")

    val spark = SparkSession.builder().config(conf).getOrCreate()

    val rddFile = sc.textFile("s3://bucket/file.csv").count  // ok
    val rddGlob = sc.textFile("s3://bucket/*").count  // ok
    val dfFile = spark.read.format("csv").load("s3://bucket/file.csv").count  // ok
    val dfGlob = spark.read.format("csv").load("s3://bucket/*").count
    // IllegalArgumentException: AWS Access Key ID and Secret Access Key must be specified as the
    // username or password (respectively) of a s3 URL, or by setting the fs.s3.awsAccessKeyId or
    // fs.s3.awsSecretAccessKey properties (respectively).

    sc.stop()
  }
}
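For reference, the same Hadoop settings can also be supplied through SparkConf using the "spark.hadoop." prefix, which Spark copies into the Hadoop Configuration backing the SparkSession. The sketch below is only an assumed variant of the reproduction above (the object name SimpleAppViaSparkConf, bucket, and credentials are placeholders); it is not claimed to avoid the glob failure.

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object SimpleAppViaSparkConf {
  def main(args: Array[String]): Unit = {
    // Same S3 settings as above, passed with the "spark.hadoop." prefix so they
    // end up in the Hadoop Configuration used by the SparkSession.
    val conf = new SparkConf()
      .setAppName("Simple Application")
      .setMaster("local[*]")
      .set("spark.hadoop.fs.s3.awsAccessKeyId", "something")
      .set("spark.hadoop.fs.s3.awsSecretAccessKey", "something else")
      .set("spark.hadoop.fs.s3.impl.disable.cache", "true")
      .set("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
      .set("spark.hadoop.fs.s3.buffer.dir", "/tmp")

    val spark = SparkSession.builder().config(conf).getOrCreate()

    // Single-file and glob DataFrame reads, as in the report.
    spark.read.format("csv").load("s3://bucket/file.csv").count()
    spark.read.format("csv").load("s3://bucket/*").count()

    spark.stop()
  }
}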