diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java b/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
index e1cbaa6..20509ce 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
@@ -18,18 +18,18 @@
 
 package org.apache.hadoop.hive.ql;
 
-import org.antlr.runtime.tree.Tree;
-import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.ql.metadata.HiveUtils;
-import org.apache.hadoop.hive.ql.parse.ASTNode;
-import org.apache.hadoop.hive.ql.parse.ASTNodeOrigin;
-
 import java.text.MessageFormat;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.antlr.runtime.tree.Tree;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.metadata.HiveUtils;
+import org.apache.hadoop.hive.ql.parse.ASTNode;
+import org.apache.hadoop.hive.ql.parse.ASTNodeOrigin;
+
 /**
  * List of all error messages.
  * This list contains both compile time and run-time errors.
@@ -493,7 +493,8 @@
 
   ORC_CORRUPTED_READ(30018, "Corruption in ORC data encountered. To skip reading corrupted "
       + "data, set " + HiveConf.ConfVars.HIVE_ORC_SKIP_CORRUPT_DATA + " to true"),
-
+  INVALID_FILE_FORMAT_IN_LOAD(30019, "The file that you are trying to load does not match the"
+      + " file format of the destination table.")
   ;
 
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
index 1a9b42b..979bc8b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
@@ -18,6 +18,14 @@
 
 package org.apache.hadoop.hive.ql.parse;
 
+import java.io.IOException;
+import java.io.Serializable;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
 import org.antlr.runtime.tree.Tree;
 import org.apache.commons.httpclient.util.URIUtil;
 import org.apache.commons.lang.StringUtils;
@@ -26,26 +34,22 @@
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.TableType;
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.ql.ErrorMsg;
 import org.apache.hadoop.hive.ql.exec.Task;
 import org.apache.hadoop.hive.ql.exec.TaskFactory;
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.hooks.WriteEntity;
+import org.apache.hadoop.hive.ql.io.orc.OrcFile;
+import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
 import org.apache.hadoop.hive.ql.metadata.Hive;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.metadata.Partition;
 import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
 import org.apache.hadoop.hive.ql.plan.MoveWork;
 import org.apache.hadoop.hive.ql.plan.StatsWork;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
+import org.apache.hadoop.mapred.InputFormat;
 
 /**
  * LoadSemanticAnalyzer.
@@ -60,12 +64,12 @@ public LoadSemanticAnalyzer(HiveConf conf) throws SemanticException {
 
   public static FileStatus[] matchFilesOrDir(FileSystem fs, Path path)
       throws IOException {
     FileStatus[] srcs = fs.globStatus(path, new PathFilter() {
-        @Override
-        public boolean accept(Path p) {
-          String name = p.getName();
-          return name.equals("_metadata") ? true : !name.startsWith("_") && !name.startsWith(".");
-        }
-      });
+      @Override
+      public boolean accept(Path p) {
+        String name = p.getName();
+        return name.equals("_metadata") ? true : !name.startsWith("_") && !name.startsWith(".");
+      }
+    });
     if ((srcs != null) && srcs.length == 1) {
       if (srcs[0].isDir()) {
         srcs = fs.listStatus(srcs[0].getPath(), new PathFilter() {
@@ -228,6 +232,11 @@ public void analyzeInternal(ASTNode ast) throws SemanticException {
 
     // make sure the arguments make sense
     applyConstraints(fromURI, toURI, fromTree, isLocal);
+
+    // for managed tables, make sure the file formats match
+    if (ts.tableHandle.getTableType().equals(TableType.MANAGED_TABLE)) {
+      ensureFileFormatsMatch(ts, fromURI);
+    }
 
     inputs.add(toReadEntity(new Path(fromURI)));
     Task<? extends Serializable> rTask = null;
@@ -323,4 +332,21 @@ else if (statTask != null) {
       childTask.addDependentTask(statTask);
     }
   }
+
+  private void ensureFileFormatsMatch(TableSpec ts, URI fromURI) throws SemanticException {
+    Class<? extends InputFormat> destInputFormat = ts.tableHandle.getInputFormatClass();
+    // Other file formats should do a similar check to make sure file formats match
+    // when doing LOAD DATA .. INTO TABLE
+    if (OrcInputFormat.class.equals(destInputFormat)) {
+      Path inputFilePath = new Path(fromURI);
+      try {
+        FileSystem fs = FileSystem.get(fromURI, conf);
+        // just creating the ORC reader does sanity checks to make sure it is a valid ORC file
+        OrcFile.createReader(fs, inputFilePath);
+      } catch (IOException e) {
+        throw new SemanticException(ErrorMsg.INVALID_FILE_FORMAT_IN_LOAD.getMsg("Destination table"
+            + " is stored as ORC but the file being loaded is not a valid ORC file."));
+      }
+    }
+  }
 }
diff --git a/ql/src/test/queries/clientnegative/load_orc.q b/ql/src/test/queries/clientnegative/load_orc.q
new file mode 100644
index 0000000..6c931a0
--- /dev/null
+++ b/ql/src/test/queries/clientnegative/load_orc.q
@@ -0,0 +1,6 @@
+set hive.default.fileformat=ORC;
+create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp);
+
+load data local inpath '../../data/files/kv1.txt' into table orc_test;
+
+dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/orc_test/;
diff --git a/ql/src/test/queries/clientpositive/load_orc.q b/ql/src/test/queries/clientpositive/load_orc.q
new file mode 100644
index 0000000..9702a57
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/load_orc.q
@@ -0,0 +1,6 @@
+set hive.default.fileformat=ORC;
+create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp);
+
+load data local inpath '../../data/files/orc_split_elim.orc' into table orc_test;
+
+dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/orc_test/;
diff --git a/ql/src/test/results/clientnegative/load_orc.q.out b/ql/src/test/results/clientnegative/load_orc.q.out
new file mode 100644
index 0000000..a811016
--- /dev/null
+++ b/ql/src/test/results/clientnegative/load_orc.q.out
@@ -0,0 +1,9 @@
+PREHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_test
+POSTHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_test
+FAILED: SemanticException [Error 30019]: The file that you are trying to load does not match the file format of the destination table. Destination table is stored as ORC but the file being loaded is not a valid ORC file.
diff --git a/ql/src/test/results/clientpositive/load_orc.q.out b/ql/src/test/results/clientpositive/load_orc.q.out
new file mode 100644
index 0000000..dfe03be
--- /dev/null
+++ b/ql/src/test/results/clientpositive/load_orc.q.out
@@ -0,0 +1,18 @@
+PREHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_test
+POSTHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_test
+PREHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_test
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@orc_test
+POSTHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_test
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@orc_test
+Found 1 items
+#### A masked pattern was here ####
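
The in-code comment in ensureFileFormatsMatch notes that other file formats should apply a similar probe when handling LOAD DATA .. INTO TABLE. A rough sketch of that idea for SEQUENCEFILE tables is below. It is not part of this patch: the method name and error wording are illustrative only, and it assumes the same imports as the patched class plus org.apache.hadoop.io.SequenceFile and org.apache.hadoop.mapred.SequenceFileInputFormat. SequenceFile.Reader validates the SEQ magic header on open and throws IOException otherwise, which plays the same role OrcFile.createReader plays for ORC.

  // Hypothetical sketch mirroring ensureFileFormatsMatch; not part of the patch.
  private void ensureSequenceFileFormatMatches(TableSpec ts, URI fromURI) throws SemanticException {
    Class<? extends InputFormat> destInputFormat = ts.tableHandle.getInputFormatClass();
    if (SequenceFileInputFormat.class.equals(destInputFormat)) {
      try {
        FileSystem fs = FileSystem.get(fromURI, conf);
        // opening the reader checks the SEQ magic header; close it immediately afterwards
        new SequenceFile.Reader(fs, new Path(fromURI), conf).close();
      } catch (IOException e) {
        throw new SemanticException(ErrorMsg.INVALID_FILE_FORMAT_IN_LOAD.getMsg("Destination table"
            + " is stored as SequenceFile but the file being loaded is not a valid SequenceFile."));
      }
    }
  }

In both the ORC check and this sketch, opening a reader is a cheap header/footer validation rather than a full scan of the file, so the extra work at analysis time is small compared to silently loading a mismatched file and failing at query time.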