diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java b/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
index e1cbaa6..20509ce 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
@@ -18,18 +18,18 @@
 package org.apache.hadoop.hive.ql;
 
-import org.antlr.runtime.tree.Tree;
-import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.ql.metadata.HiveUtils;
-import org.apache.hadoop.hive.ql.parse.ASTNode;
-import org.apache.hadoop.hive.ql.parse.ASTNodeOrigin;
-
 import java.text.MessageFormat;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.antlr.runtime.tree.Tree;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.metadata.HiveUtils;
+import org.apache.hadoop.hive.ql.parse.ASTNode;
+import org.apache.hadoop.hive.ql.parse.ASTNodeOrigin;
+
 /**
  * List of all error messages.
  * This list contains both compile time and run-time errors.
@@ -493,7 +493,8 @@
 
   ORC_CORRUPTED_READ(30018, "Corruption in ORC data encountered. To skip reading corrupted "
       + "data, set " + HiveConf.ConfVars.HIVE_ORC_SKIP_CORRUPT_DATA + " to true"),
-
+  INVALID_FILE_FORMAT_IN_LOAD(30019, "The file that you are trying to load does not match the"
+      + " file format of the destination table.")
   ;
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java b/ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java
new file mode 100644
index 0000000..12417aa
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.io;
+
+import java.io.IOException;
+
+/**
+ * Thrown when an invalid file format is encountered.
+ */
+public class FileFormatException extends IOException {
+
+  public FileFormatException(String errMsg) {
+    super(errMsg);
+  }
+}
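Editor's note: because FileFormatException extends IOException, existing catch (IOException) paths keep working while new callers can branch on the narrower type. A minimal sketch of that pattern, using only the APIs visible in this patch; the helper class and its name are illustrative, not part of the change:

package org.apache.hadoop.hive.ql.io;          // example placement, not in the patch

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;

// Illustrative helper: returns true for a valid ORC file, false for a file
// that is readable but malformed, and rethrows genuine I/O failures.
public class OrcFormatProbe {
  public static boolean isOrcFile(FileSystem fs, Path path) throws IOException {
    try {
      OrcFile.createReader(fs, path);   // reader construction validates the footer
      return true;
    } catch (FileFormatException e) {   // catch the narrower type first
      return false;
    }
  }
}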
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
index 50f417b..bbc4654 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
@@ -18,8 +18,6 @@
 package org.apache.hadoop.hive.ql.io.orc;
 
-import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_ZEROCOPY;
-
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
@@ -36,14 +34,14 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.common.DiskRange;
-import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.io.FileFormatException;
 import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
 import org.apache.hadoop.hive.ql.io.orc.OrcProto.UserMetadataItem;
+import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.BufferChunk;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
 import org.apache.hadoop.hive.ql.util.JavaDataModel;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.BufferChunk;
 
 import com.google.common.collect.Lists;
 import com.google.common.collect.Sets;
@@ -232,7 +230,7 @@ static void ensureOrcFooter(FSDataInputStream in,
       ByteBuffer buffer) throws IOException {
     int len = OrcFile.MAGIC.length();
     if (psLen < len + 1) {
-      throw new IOException("Malformed ORC file " + path +
+      throw new FileFormatException("Malformed ORC file " + path +
           ". Invalid postscript length " + psLen);
     }
     int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - 1
@@ -247,7 +245,7 @@
     in.readFully(header, 0, len);
     // if it isn't there, this isn't an ORC file
     if (!Text.decode(header, 0 , len).equals(OrcFile.MAGIC)) {
-      throw new IOException("Malformed ORC file " + path +
+      throw new FileFormatException("Malformed ORC file " + path +
           ". Invalid postscript.");
     }
   }
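Editor's note: both conversion sites above live in ensureOrcFooter, which rejects a file whose postscript length or magic bytes are wrong. The gist of that check as a standalone sketch, assuming the ORC magic is the three bytes "ORC" (which is what OrcFile.MAGIC holds in this codebase); the class and method names are made up for illustration:

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.FileFormatException;

public class OrcMagicSniffer {
  private static final byte[] MAGIC = {'O', 'R', 'C'};

  // Throws FileFormatException when the leading bytes are not the ORC magic;
  // files shorter than the magic surface as EOFException (an IOException).
  public static void ensureOrcMagic(FileSystem fs, Path path) throws IOException {
    FSDataInputStream in = fs.open(path);
    try {
      byte[] header = new byte[MAGIC.length];
      in.readFully(0, header);                  // positioned read at offset 0
      for (int i = 0; i < MAGIC.length; i++) {
        if (header[i] != MAGIC[i]) {
          throw new FileFormatException("Malformed ORC file " + path
              + ". Missing ORC magic bytes.");
        }
      }
    } finally {
      in.close();
    }
  }
}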
Invalid postscript."); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java index 1a9b42b..0707ead 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java @@ -18,6 +18,14 @@ package org.apache.hadoop.hive.ql.parse; +import java.io.IOException; +import java.io.Serializable; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + import org.antlr.runtime.tree.Tree; import org.apache.commons.httpclient.util.URIUtil; import org.apache.commons.lang.StringUtils; @@ -26,26 +34,23 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.TableType; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.TaskFactory; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.hooks.WriteEntity; +import org.apache.hadoop.hive.ql.io.FileFormatException; +import org.apache.hadoop.hive.ql.io.orc.OrcFile; +import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.Partition; import org.apache.hadoop.hive.ql.plan.LoadTableDesc; import org.apache.hadoop.hive.ql.plan.MoveWork; import org.apache.hadoop.hive.ql.plan.StatsWork; - -import java.io.IOException; -import java.io.Serializable; -import java.net.URI; -import java.net.URISyntaxException; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; +import org.apache.hadoop.mapred.InputFormat; /** * LoadSemanticAnalyzer. @@ -60,12 +65,12 @@ public LoadSemanticAnalyzer(HiveConf conf) throws SemanticException { public static FileStatus[] matchFilesOrDir(FileSystem fs, Path path) throws IOException { FileStatus[] srcs = fs.globStatus(path, new PathFilter() { - @Override - public boolean accept(Path p) { - String name = p.getName(); - return name.equals("_metadata") ? true : !name.startsWith("_") && !name.startsWith("."); - } - }); + @Override + public boolean accept(Path p) { + String name = p.getName(); + return name.equals("_metadata") ? true : !name.startsWith("_") && !name.startsWith("."); + } + }); if ((srcs != null) && srcs.length == 1) { if (srcs[0].isDir()) { srcs = fs.listStatus(srcs[0].getPath(), new PathFilter() { @@ -228,6 +233,11 @@ public void analyzeInternal(ASTNode ast) throws SemanticException { // make sure the arguments make sense applyConstraints(fromURI, toURI, fromTree, isLocal); + + // for managed tables, make sure the file formats match + if (TableType.MANAGED_TABLE.equals(ts.tableHandle.getTableType())) { + ensureFileFormatsMatch(ts, fromURI); + } inputs.add(toReadEntity(new Path(fromURI))); Task extends Serializable> rTask = null; @@ -323,4 +333,26 @@ else if (statTask != null) { childTask.addDependentTask(statTask); } } + + private void ensureFileFormatsMatch(TableSpec ts, URI fromURI) throws SemanticException { + Class extends InputFormat> destInputFormat = ts.tableHandle.getInputFormatClass(); + // Other file formats should do similar check to make sure file formats match + // when doing LOAD DATA .. 
diff --git a/ql/src/test/queries/clientnegative/load_orc_negative1.q b/ql/src/test/queries/clientnegative/load_orc_negative1.q
new file mode 100644
index 0000000..9edb2f9
--- /dev/null
+++ b/ql/src/test/queries/clientnegative/load_orc_negative1.q
@@ -0,0 +1,4 @@
+set hive.default.fileformat=ORC;
+create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp);
+
+load data local inpath '../../data/files/kv1.txt' into table orc_test;
diff --git a/ql/src/test/queries/clientnegative/load_orc_negative2.q b/ql/src/test/queries/clientnegative/load_orc_negative2.q
new file mode 100644
index 0000000..b044c9d
--- /dev/null
+++ b/ql/src/test/queries/clientnegative/load_orc_negative2.q
@@ -0,0 +1,6 @@
+create table text_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp);
+load data local inpath '../../data/files/kv1.txt' into table text_test;
+
+set hive.default.fileformat=ORC;
+create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp);
+load data inpath '${hiveconf:hive.metastore.warehouse.dir}/text_test/kv1.txt' into table orc_test;
diff --git a/ql/src/test/queries/clientpositive/load_orc.q b/ql/src/test/queries/clientpositive/load_orc.q
new file mode 100644
index 0000000..2eaf098
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/load_orc.q
@@ -0,0 +1,10 @@
+set hive.default.fileformat=ORC;
+create table orc_staging (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp);
+create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp);
+
+load data local inpath '../../data/files/orc_split_elim.orc' into table orc_staging;
+dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/orc_staging/;
+
+load data inpath '${hiveconf:hive.metastore.warehouse.dir}/orc_staging/orc_split_elim.orc' into table orc_test;
+load data local inpath '../../data/files/orc_split_elim.orc' into table orc_test;
+dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/orc_test/;
diff --git a/ql/src/test/results/clientnegative/load_orc_negative1.q.out b/ql/src/test/results/clientnegative/load_orc_negative1.q.out
new file mode 100644
index 0000000..ca15a30
--- /dev/null
+++ b/ql/src/test/results/clientnegative/load_orc_negative1.q.out
@@ -0,0 +1,9 @@
+PREHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_test
+POSTHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_test
+FAILED: SemanticException [Error 30019]: The file that you are trying to load does not match the file format of the destination table. Destination table is stored as ORC but the file being loaded is not a valid ORC file.
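Editor's note: the negative test above can also be expressed directly against the reader. A minimal JUnit sketch of the same expectation; the test class name is illustrative, and it assumes kv1.txt is the plain-text fixture the q-file uses, reachable at that relative path:

import static org.junit.Assert.fail;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.FileFormatException;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.junit.Test;

public class TestOrcFormatRejection {
  @Test
  public void textFileIsRejectedAsOrc() throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path textFile = new Path("../../data/files/kv1.txt"); // plain-text fixture
    try {
      OrcFile.createReader(fs, textFile);
      fail("expected FileFormatException for a non-ORC file");
    } catch (FileFormatException expected) {
      // LoadSemanticAnalyzer maps this to error 30019 at compile time
    }
  }
}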
diff --git a/ql/src/test/results/clientnegative/load_orc_negative2.q.out b/ql/src/test/results/clientnegative/load_orc_negative2.q.out
new file mode 100644
index 0000000..77fb50e
--- /dev/null
+++ b/ql/src/test/results/clientnegative/load_orc_negative2.q.out
@@ -0,0 +1,25 @@
+PREHOOK: query: create table text_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@text_test
+POSTHOOK: query: create table text_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@text_test
+PREHOOK: query: load data local inpath '../../data/files/kv1.txt' into table text_test
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@text_test
+POSTHOOK: query: load data local inpath '../../data/files/kv1.txt' into table text_test
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@text_test
+PREHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_test
+POSTHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_test
+FAILED: SemanticException [Error 30019]: The file that you are trying to load does not match the file format of the destination table. Destination table is stored as ORC but the file being loaded is not a valid ORC file.
diff --git a/ql/src/test/results/clientpositive/load_orc.q.out b/ql/src/test/results/clientpositive/load_orc.q.out
new file mode 100644
index 0000000..b0835de
--- /dev/null
+++ b/ql/src/test/results/clientpositive/load_orc.q.out
@@ -0,0 +1,43 @@
+PREHOOK: query: create table orc_staging (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_staging
+POSTHOOK: query: create table orc_staging (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_staging
+PREHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_test
+POSTHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_test
+PREHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_staging
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@orc_staging
+POSTHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_staging
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@orc_staging
+Found 1 items
+#### A masked pattern was here ####
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@orc_test
+#### A masked pattern was here ####
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@orc_test
+PREHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_test
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@orc_test
+POSTHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_test
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@orc_test
+Found 2 items
+#### A masked pattern was here ####