Details
-
Sub-task
-
Status: Open
-
Minor
-
Resolution: Unresolved
-
3.2.0
-
None
-
None
Description
-
-
- NOTE: apparently a workaround is to use any arbitrary string as the password; Azure will allow access to an open storage account even with a wrong password.
-
It does not seem possible to access storage accounts without passwords using abfs, but it is possible using wasb.
This sample code (Spark based) illustrates the issue: the following code, using abfs_path, will throw an exception
Exception in thread "main" java.lang.IllegalArgumentException: Invalid account key. at org.apache.hadoop.fs.azurebfs.services.SharedKeyCredentials.<init>(SharedKeyCredentials.java:70) at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.initializeClient(AzureBlobFileSystemStore.java:812) at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.<init>(AzureBlobFileSystemStore.java:149) at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem.initialize(AzureBlobFileSystem.java:108) at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3303) at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124) at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352) at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320) at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479) at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
while using the wasbs_path works normally:
import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.sql.RuntimeConfig; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; public class SimpleApp { static String blob_account_name = "azureopendatastorage"; static String blob_container_name = "gfsweatherdatacontainer"; static String blob_relative_path = "GFSWeather/GFSProcessed"; static String blob_sas_token = ""; static String abfs_path = "abfs://"+blob_container_name+"@"+blob_account_name+".dfs.core.windows.net/"+blob_relative_path; static String wasbs_path = "wasbs://"+blob_container_name + "@"+blob_account_name+".blob.core.windows.net/" + blob_relative_path; public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("NOAAGFS Run").getOrCreate(); configureAzureHadoopConnetor(spark); RuntimeConfig conf = spark.conf(); conf.set("fs.azure.account.key."+blob_account_name+".dfs.core.windows.net", blob_sas_token); conf.set("fs.azure.account.key."+blob_account_name+".blob.core.windows.net", blob_sas_token); System.out.println("Creating parquet dataset"); Dataset<Row> logData = spark.read().parquet(abfs_path); System.out.println("Creating temp view"); logData.createOrReplaceTempView("source"); System.out.println("SQL"); spark.sql("SELECT * FROM source LIMIT 10").show(); spark.stop(); } public static void configureAzureHadoopConnetor(SparkSession session) { RuntimeConfig conf = session.conf(); conf.set("fs.AbstractFileSystem.wasb.impl","org.apache.hadoop.fs.azure.Wasb"); conf.set("fs.AbstractFileSystem.wasbs.impl","org.apache.hadoop.fs.azure.Wasbs"); conf.set("fs.wasb.impl","org.apache.hadoop.fs.azure.NativeAzureFileSystem"); conf.set("fs.wasbs.impl","org.apache.hadoop.fs.azure.NativeAzureFileSystem$Secure"); conf.set("fs.azure.secure.mode", false); conf.set("fs.abfs.impl", "org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem"); conf.set("fs.abfss.impl", 
"org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem"); conf.set("fs.AbstractFileSystem.abfs.impl","org.apache.hadoop.fs.azurebfs.Abfs"); conf.set("fs.AbstractFileSystem.abfss.impl","org.apache.hadoop.fs.azurebfs.Abfss"); // Works in conjuction with fs.azure.secure.mode. Setting this config to true // results in fs.azure.NativeAzureFileSystem using the local SAS key generation // where the SAS keys are generating in the same process as fs.azure.NativeAzureFileSystem. // If fs.azure.secure.mode flag is set to false, this flag has no effect. conf.set("fs.azure.local.sas.key.mode", false); } }
Sample build.gradle
plugins {
    id 'java'
}

group 'org.samples'
version '1.0-SNAPSHOT'
sourceCompatibility = 1.8

repositories {
    mavenCentral()
}

dependencies {
    compile 'org.apache.spark:spark-sql_2.12:2.4.3'
}