diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g index a837d67b96..3712a53521 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g @@ -422,6 +422,7 @@ TOK_ADD_TRIGGER; TOK_REPLACE; TOK_LIKERP; TOK_UNMANAGED; +TOK_INPUTFORMAT; } @@ -835,8 +836,8 @@ execStatement loadStatement @init { pushMsg("load statement", state); } @after { popMsg(state); } - : KW_LOAD KW_DATA (islocal=KW_LOCAL)? KW_INPATH (path=StringLiteral) (isoverwrite=KW_OVERWRITE)? KW_INTO KW_TABLE (tab=tableOrPartition) - -> ^(TOK_LOAD $path $tab $islocal? $isoverwrite?) + : KW_LOAD KW_DATA (islocal=KW_LOCAL)? KW_INPATH (path=StringLiteral) (isoverwrite=KW_OVERWRITE)? KW_INTO KW_TABLE (tab=tableOrPartition) inputFileFormat? + -> ^(TOK_LOAD $path $tab $islocal? $isoverwrite? inputFileFormat?) ; replicationClause @@ -1489,6 +1490,13 @@ fileFormat | genericSpec=identifier -> ^(TOK_FILEFORMAT_GENERIC $genericSpec) ; +inputFileFormat +@init { pushMsg("Load Data input file format specification", state); } +@after { popMsg(state); } + : KW_INPUTFORMAT inFmt=StringLiteral KW_SERDE serdeCls=StringLiteral + -> ^(TOK_INPUTFORMAT $inFmt $serdeCls) + ; + tabTypeExpr @init { pushMsg("specifying table types", state); } @after { popMsg(state); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java index 2b88ea651b..ee90769107 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java @@ -79,6 +79,8 @@ // AST specific data private Tree fromTree, tableTree; private boolean isLocal = false, isOverWrite = false; + private String inputFormatClassName = null; + private String serDeClassName = null; public LoadSemanticAnalyzer(QueryState queryState) throws 
SemanticException { super(queryState); @@ -257,12 +259,30 @@ private void analyzeLoad(ASTNode ast) throws SemanticException { fromTree = ast.getChild(0); tableTree = ast.getChild(1); - if (ast.getChildCount() == 4) { + boolean inputInfo = false; + // Check the last node + ASTNode child = (ASTNode)ast.getChild(ast.getChildCount() - 1); + if (child.getToken().getType() == HiveParser.TOK_INPUTFORMAT) { + if (child.getChildCount() != 2) { + throw new SemanticException("FileFormat should contain both input format and Serde"); + } + try { + inputFormatClassName = stripQuotes(child.getChild(0).getText()); + serDeClassName = stripQuotes(child.getChild(1).getText()); + inputInfo = true; + } catch (Exception e) { + throw new SemanticException("FileFormat inputFormatClassName or serDeClassName is incorrect", e); + } + } + + if ((!inputInfo && ast.getChildCount() == 4) || + (inputInfo && ast.getChildCount() == 5)) { isLocal = true; isOverWrite = true; } - if (ast.getChildCount() == 3) { + if ((!inputInfo && ast.getChildCount() == 3) || + (inputInfo && ast.getChildCount() == 4)) { if (ast.getChild(2).getText().toLowerCase().equals("local")) { isLocal = true; } else { @@ -450,7 +470,14 @@ private void reparseAndSuperAnalyze(Table table, URI fromURI) throws SemanticExc // Set data location and input format, it must be text tempTableObj.setDataLocation(new Path(fromURI)); - tempTableObj.setInputFormatClass(TextInputFormat.class); + if (inputFormatClassName != null && serDeClassName != null) { + try { + tempTableObj.setInputFormatClass(inputFormatClassName); + tempTableObj.setSerializationLib(serDeClassName); + } catch (HiveException e) { + throw new SemanticException("Load Data: Failed to set inputFormat or SerDe", e); + } + } // Step 2 : create the Insert query StringBuilder rewrittenQueryStr = new StringBuilder(); diff --git a/ql/src/test/queries/clientpositive/load_data_using_job.q 
b/ql/src/test/queries/clientpositive/load_data_using_job.q index 3928f1fa07..3659b6ec8e 100644 --- a/ql/src/test/queries/clientpositive/load_data_using_job.q +++ b/ql/src/test/queries/clientpositive/load_data_using_job.q @@ -84,7 +84,11 @@ drop table srcbucket_mapjoin; -- Load into ORC table using text files CREATE TABLE srcbucket_mapjoin(key int, value string) partitioned by (ds string) STORED AS ORC; -explain load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin; -load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin; +explain load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin +INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' +SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'; +load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin +INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' +SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'; select * from srcbucket_mapjoin; drop table srcbucket_mapjoin; \ No newline at end of file diff --git a/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out b/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out index 116630c237..c3b70a3b64 100644 --- a/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out +++ b/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out @@ -2776,8 +2776,12 @@ POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@srcbucket_mapjoin PREHOOK: query: explain load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin +INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' +SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' PREHOOK: type: QUERY POSTHOOK: query: explain load data local inpath 
'../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin +INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' +SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage @@ -2830,10 +2834,14 @@ STAGE PLANS: Basic Stats Work: PREHOOK: query: load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin +INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' +SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' PREHOOK: type: QUERY PREHOOK: Input: default@srcbucket_mapjoin__TEMP_TABLE_FOR_LOAD_DATA__ PREHOOK: Output: default@srcbucket_mapjoin POSTHOOK: query: load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin +INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' +SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' POSTHOOK: type: QUERY POSTHOOK: Input: default@srcbucket_mapjoin__TEMP_TABLE_FOR_LOAD_DATA__ POSTHOOK: Output: default@srcbucket_mapjoin@ds=2008-04-08