Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-21374

Reading globbed paths from S3 into DF doesn't work if filesystem caching is disabled

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Resolved
    • Major
    • Resolution: Fixed
    • 2.0.2, 2.1.1
    • 2.2.1, 2.3.0
    • Spark Core
    • None

    Description

      Motivation:
      In my case I want to disable the filesystem cache so that I can change the S3 access key and secret key on the fly and read from buckets with different permissions. This works perfectly fine for RDDs but does not work for DataFrames.

      Example (works for RDD but fails for DataFrame):

      import org.apache.spark.SparkContext
      import org.apache.spark.SparkConf
      import org.apache.spark.sql.SparkSession
      
      /**
       * Minimal reproduction: reading a globbed S3 path into a DataFrame while the
       * Hadoop filesystem cache is disabled for the `s3` scheme.
       *
       * The same credentials are set on the SparkContext's Hadoop configuration and
       * four reads are attempted: a single file and a glob through the RDD API, then
       * a single file and a glob through the DataFrame API. Per the report, only the
       * DataFrame glob read fails.
       */
      object SimpleApp {
        /**
         * Configures S3 access on the Hadoop configuration, runs the four reads and
         * always stops the SparkContext afterwards.
         *
         * @param args command-line arguments (unused)
         */
        def main(args: Array[String]): Unit = {

          val awsAccessKeyId = "something"
          val awsSecretKey = "something else"

          val conf = new SparkConf().setAppName("Simple Application").setMaster("local[*]")

          val sc = new SparkContext(conf)
          try {
            sc.hadoopConfiguration.set("fs.s3.awsAccessKeyId", awsAccessKeyId)
            sc.hadoopConfiguration.set("fs.s3.awsSecretAccessKey", awsSecretKey)
            // Disabling the cache is the setting the report identifies as the trigger.
            sc.hadoopConfiguration.setBoolean("fs.s3.impl.disable.cache", true)
            sc.hadoopConfiguration.set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
            sc.hadoopConfiguration.set("fs.s3.buffer.dir", "/tmp")

            val spark = SparkSession.builder().config(conf).getOrCreate()

            val rddFile = sc.textFile("s3://bucket/file.csv").count // ok
            val rddGlob = sc.textFile("s3://bucket/*").count // ok
            val dfFile = spark.read.format("csv").load("s3://bucket/file.csv").count // ok

            val dfGlob = spark.read.format("csv").load("s3://bucket/*").count
            // IllegalArgumentException. AWS Access Key ID and Secret Access Key must be specified as the username or password (respectively)
            // of a s3 URL, or by setting the fs.s3.awsAccessKeyId or fs.s3.awsSecretAccessKey properties (respectively).
          } finally {
            // Stop the context even when the failing read above throws.
            sc.stop()
          }
        }
      }
      
      

      Attachments

        Activity

          People

            andrey.t Andrey Taptunov
            andrey.t Andrey Taptunov
            Votes:
            1 Vote for this issue
            Watchers:
            4 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: