diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
index 23983d85b3..0e47a3f094 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
@@ -1885,7 +1885,7 @@ else if(!isAcidIUDoperation && isFullAcidTable) {
     } else {
       FileSystem fs = tbl.getDataLocation().getFileSystem(conf);
       copyFiles(conf, loadPath, destPath, fs, isSrcLocal, isAcidIUDoperation,
-          (loadFileType == LoadFileType.OVERWRITE_EXISTING), newFiles);
+          (loadFileType == LoadFileType.OVERWRITE_EXISTING), newFiles, isFullAcidTable);
     }
   }
   perfLogger.PerfLogEnd("MoveTask", "FileMoves");
@@ -2431,7 +2431,7 @@ else if(!isAcidIUDoperation && isFullAcidTable) {
     try {
       FileSystem fs = tbl.getDataLocation().getFileSystem(sessionConf);
       copyFiles(sessionConf, loadPath, destPath, fs, isSrcLocal, isAcidIUDoperation,
-          loadFileType == LoadFileType.OVERWRITE_EXISTING, newFiles);
+          loadFileType == LoadFileType.OVERWRITE_EXISTING, newFiles, isFullAcidTable);
     } catch (IOException e) {
       throw new HiveException("addFiles: filesystem error in check phase", e);
     }
@@ -3288,8 +3288,9 @@ public PrincipalPrivilegeSet get_privilege_set(HiveObjectType objectType,
   }

   private static void copyFiles(final HiveConf conf, final FileSystem destFs,
-      FileStatus[] srcs, final FileSystem srcFs, final Path destf, final boolean isSrcLocal,
-      boolean isOverwrite, final List<Path> newFiles) throws HiveException {
+      FileStatus[] srcs, final FileSystem srcFs, final Path destf,
+      final boolean isSrcLocal, boolean isOverwrite,
+      final List<Path> newFiles, boolean isFullAcidTable) throws HiveException {

     final HdfsUtils.HadoopFileStatus fullDestStatus;
     try {
@@ -3305,6 +3306,7 @@ private static void copyFiles(final HiveConf conf, final FileSystem destFs,
     final ExecutorService pool = conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 25) > 0 ?
         Executors.newFixedThreadPool(conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 25),
         new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Move-Thread-%d").build()) : null;
+    int taskId = 0;
     for (FileStatus src : srcs) {
       FileStatus[] files;
       if (src.isDirectory()) {
@@ -3332,7 +3334,8 @@ private static void copyFiles(final HiveConf conf, final FileSystem destFs,
       // copy from source to destination, we will inherit the destination's parent group ownership.
       if (null == pool) {
         try {
-          Path destPath = mvFile(conf, srcFs, srcP, destFs, destf, isSrcLocal, isOverwrite, isRenameAllowed);
+          Path destPath = mvFile(conf, srcFs, srcP, destFs, destf, isSrcLocal, isOverwrite, isRenameAllowed,
+              isFullAcidTable ? taskId++ : -1);

           if (null != newFiles) {
             newFiles.add(destPath);
@@ -3341,6 +3344,8 @@ private static void copyFiles(final HiveConf conf, final FileSystem destFs,
           throw getHiveException(e, msg, "Failed to move: {}");
         }
       } else {
+        // a Callable can only capture final (or effectively final) locals, so make a final copy of taskId
+        final int finalTaskId = isFullAcidTable ? taskId++ : -1;
        futures.add(pool.submit(new Callable<ObjectPair<Path, Path>>() {
          @Override
          public ObjectPair<Path, Path> call() throws HiveException {
@@ -3348,7 +3353,7 @@ private static void copyFiles(final HiveConf conf, final FileSystem destFs,
            try {
              Path destPath =
-                  mvFile(conf, srcFs, srcP, destFs, destf, isSrcLocal, isOverwrite, isRenameAllowed);
+                  mvFile(conf, srcFs, srcP, destFs, destf, isSrcLocal, isOverwrite, isRenameAllowed, finalTaskId);

              if (null != newFiles) {
                newFiles.add(destPath);
@@ -3418,6 +3423,10 @@ private static Path getQualifiedPathWithoutSchemeAndAuthority(Path srcf, FileSys
     return ShimLoader.getHadoopShims().getPathWithoutSchemeAndAuthority(path);
   }

+  private static String getPathName(int taskId) {
+    return Utilities.replaceTaskId("000000", taskId) + "_0";
+  }
+
   /**
    *
    * Moves a file from one {@link Path} to another. If {@code isRenameAllowed} is true then the
@@ -3445,15 +3454,21 @@ private static Path getQualifiedPathWithoutSchemeAndAuthority(Path srcf, FileSys
    * @throws IOException if there was an issue moving the file
    */
   private static Path mvFile(HiveConf conf, FileSystem sourceFs, Path sourcePath, FileSystem destFs, Path destDirPath,
-      boolean isSrcLocal, boolean isOverwrite, boolean isRenameAllowed) throws IOException {
+      boolean isSrcLocal, boolean isOverwrite, boolean isRenameAllowed,
+      int taskId) throws IOException {

     // Strip off the file type, if any so we don't make:
     // 000000_0.gz -> 000000_0.gz_copy_1
     final String fullname = sourcePath.getName();
-    final String name = FilenameUtils.getBaseName(sourcePath.getName());
+    final String name;
+    if (taskId == -1) { // non-acid
+      name = FilenameUtils.getBaseName(sourcePath.getName());
+    } else { // acid
+      name = getPathName(taskId);
+    }
     final String type = FilenameUtils.getExtension(sourcePath.getName());

-    Path destFilePath = new Path(destDirPath, fullname);
+    Path destFilePath = new Path(destDirPath, taskId == -1 ? fullname : name);

     /*
      * The below loop may perform bad when the destination file already exists and it has too many _copy_
@@ -3763,7 +3778,8 @@ static protected boolean needToCopy(Path srcf, Path destf, FileSystem srcFs, Fil
    */
  static protected void copyFiles(HiveConf conf, Path srcf, Path destf,
                                  FileSystem fs, boolean isSrcLocal, boolean isAcid,
-                                  boolean isOverwrite, List<Path> newFiles) throws HiveException {
+                                  boolean isOverwrite, List<Path> newFiles,
+                                  boolean isFullAcidTable) throws HiveException {
    try {
      // create the destination if it does not exist
      if (!fs.exists(destf)) {
@@ -3795,7 +3811,8 @@ static protected void copyFiles(HiveConf conf, Path srcf, Path destf, FileSystem
      if (isAcid) {
        moveAcidFiles(srcFs, srcs, destf, newFiles);
      } else {
-        copyFiles(conf, fs, srcs, srcFs, destf, isSrcLocal, isOverwrite, newFiles);
+        copyFiles(conf, fs, srcs, srcFs, destf, isSrcLocal, isOverwrite,
+            newFiles, isFullAcidTable);
      }
    }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
index 5868d4dd56..54f5bab6de 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
@@ -159,14 +159,6 @@ private URI initializeFromURI(String fromPath, boolean isLocal) throws IOExcepti
          throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(ast,
              "source contains directory: " + oneSrc.getPath().toString()));
        }
-        if(AcidUtils.isAcidTable(table)) {
-          if(!AcidUtils.originalBucketFilter.accept(oneSrc.getPath())) {
-            //acid files (e.g. bucket_0000) have ROW_ID embedded in them and so can't be simply
-            //copied to a table so only allow non-acid files for now
-            throw new SemanticException(ErrorMsg.ACID_LOAD_DATA_INVALID_FILE_NAME,
-                oneSrc.getPath().getName(), table.getFullyQualifiedName());
-          }
-        }
      }
    } catch (IOException e) {
      // Has to use full name to make sure it does not conflict with
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveCopyFiles.java b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveCopyFiles.java
index c6a4a8926b..7be93a7023 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveCopyFiles.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveCopyFiles.java
@@ -83,7 +83,7 @@ public void testRenameNewFilesOnSameFileSystem() throws IOException {
     FileSystem targetFs = targetPath.getFileSystem(hiveConf);

     try {
-      Hive.copyFiles(hiveConf, sourcePath, targetPath, targetFs, isSourceLocal, NO_ACID, false,null);
+      Hive.copyFiles(hiveConf, sourcePath, targetPath, targetFs, isSourceLocal, NO_ACID, false,null, false);
     } catch (HiveException e) {
       e.printStackTrace();
       assertTrue("Hive.copyFiles() threw an unexpected exception.", false);
@@ -107,7 +107,7 @@ public void testRenameExistingFilesOnSameFileSystem() throws IOException {
     FileSystem targetFs = targetPath.getFileSystem(hiveConf);

     try {
-      Hive.copyFiles(hiveConf, sourcePath, targetPath, targetFs, isSourceLocal, NO_ACID, false, null);
+      Hive.copyFiles(hiveConf, sourcePath, targetPath, targetFs, isSourceLocal, NO_ACID, false, null, false);
     } catch (HiveException e) {
       e.printStackTrace();
       assertTrue("Hive.copyFiles() threw an unexpected exception.", false);
@@ -127,7 +127,7 @@ public void testRenameExistingFilesOnSameFileSystem() throws IOException {
     sourceFolder.newFile("000001_0.gz");

     try {
-      Hive.copyFiles(hiveConf, sourcePath, targetPath, targetFs, isSourceLocal, NO_ACID, false, null);
+      Hive.copyFiles(hiveConf, sourcePath, targetPath, targetFs, isSourceLocal, NO_ACID, false, null, false);
     } catch (HiveException e) {
       e.printStackTrace();
       assertTrue("Hive.copyFiles() threw an unexpected exception.", false);
@@ -158,7 +158,7 @@ public void testCopyNewFilesOnDifferentFileSystem() throws IOException {
     Mockito.when(spyTargetFs.getUri()).thenReturn(URI.create("hdfs://" + targetPath.toUri().getPath()));

     try {
-      Hive.copyFiles(hiveConf, sourcePath, targetPath, spyTargetFs, isSourceLocal, NO_ACID, false, null);
+      Hive.copyFiles(hiveConf, sourcePath, targetPath, spyTargetFs, isSourceLocal, NO_ACID, false, null, false);
     } catch (HiveException e) {
       e.printStackTrace();
       assertTrue("Hive.copyFiles() threw an unexpected exception.", false);
@@ -185,7 +185,7 @@ public void testCopyExistingFilesOnDifferentFileSystem() throws IOException {
     Mockito.when(spyTargetFs.getUri()).thenReturn(URI.create("hdfs://" + targetPath.toUri().getPath()));

     try {
-      Hive.copyFiles(hiveConf, sourcePath, targetPath, spyTargetFs, isSourceLocal, NO_ACID, false, null);
+      Hive.copyFiles(hiveConf, sourcePath, targetPath, spyTargetFs, isSourceLocal, NO_ACID, false, null, false);
     } catch (HiveException e) {
       e.printStackTrace();
       assertTrue("Hive.copyFiles() threw an unexpected exception.", false);
@@ -205,7 +205,7 @@ public void testCopyExistingFilesOnDifferentFileSystem() throws IOException {
     sourceFolder.newFile("000001_0.gz");

     try {
-      Hive.copyFiles(hiveConf, sourcePath, targetPath, spyTargetFs, isSourceLocal, NO_ACID, false, null);
+      Hive.copyFiles(hiveConf, sourcePath, targetPath, spyTargetFs, isSourceLocal, NO_ACID, false, null, false);
     } catch (HiveException e) {
       e.printStackTrace();
       assertTrue("Hive.copyFiles() threw an unexpected exception.", false);
diff --git a/ql/src/test/queries/clientpositive/load_data_acid_rename.q b/ql/src/test/queries/clientpositive/load_data_acid_rename.q
new file mode 100644
index 0000000000..b21bc5ef3d
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/load_data_acid_rename.q
@@ -0,0 +1,12 @@
+set hive.mapred.mode=nonstrict;
+
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+
+drop table if exists acid_rename;
+create table acid_rename (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc TBLPROPERTIES ("transactional"="true");
+load data local inpath '../../data/files/orc_split_elim.orc' into table acid_rename;
+
+dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/acid_rename/*/000000_0;
+
+drop table acid_rename;
diff --git a/ql/src/test/queries/clientpositive/smb_mapjoin_7.q b/ql/src/test/queries/clientpositive/smb_mapjoin_7.q
index 4a6afb0496..fed931c897 100644
--- a/ql/src/test/queries/clientpositive/smb_mapjoin_7.q
+++ b/ql/src/test/queries/clientpositive/smb_mapjoin_7.q
@@ -16,8 +16,8 @@ create table smb_join_results(k1 int, v1 string, k2 int, v2 string);
 create table smb_join_results_empty_bigtable(k1 int, v1 string, k2 int, v2 string);
 create table normal_join_results(k1 int, v1 string, k2 int, v2 string);

-load data local inpath '../../data/files/empty1.txt' into table smb_bucket4_1;
-load data local inpath '../../data/files/empty2.txt' into table smb_bucket4_1;
+load data local inpath '../../data/files/empty/000000_0' into table smb_bucket4_1;
+load data local inpath '../../data/files/empty/000001_0' into table smb_bucket4_1;

 insert overwrite table smb_bucket4_2
 select * from src;
diff --git a/ql/src/test/results/clientpositive/beeline/smb_mapjoin_7.q.out b/ql/src/test/results/clientpositive/beeline/smb_mapjoin_7.q.out
index 7a6f8c53a5..4b1313dc93 100644
--- a/ql/src/test/results/clientpositive/beeline/smb_mapjoin_7.q.out
+++ b/ql/src/test/results/clientpositive/beeline/smb_mapjoin_7.q.out
@@ -38,19 +38,19 @@ POSTHOOK: query: create table normal_join_results(k1 int, v1 string, k2 int, v2
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@normal_join_results
-PREHOOK: query: load data local inpath '../../data/files/empty1.txt' into table smb_bucket4_1
+PREHOOK: query: load data local inpath '../../data/files/empty/000000_0' into table smb_bucket4_1
 PREHOOK: type: LOAD
 #### A masked pattern was here ####
 PREHOOK: Output: default@smb_bucket4_1
-POSTHOOK: query: load data local inpath '../../data/files/empty1.txt' into table smb_bucket4_1
+POSTHOOK: query: load data local inpath '../../data/files/empty/000000_0' into table smb_bucket4_1
 POSTHOOK: type: LOAD
 #### A masked pattern was here ####
 POSTHOOK: Output: default@smb_bucket4_1
-PREHOOK: query: load data local inpath '../../data/files/empty2.txt' into table smb_bucket4_1
+PREHOOK: query: load data local inpath '../../data/files/empty/000001_0' into table smb_bucket4_1
 PREHOOK: type: LOAD
 #### A masked pattern was here ####
 PREHOOK: Output: default@smb_bucket4_1
-POSTHOOK: query: load data local inpath '../../data/files/empty2.txt' into table smb_bucket4_1
+POSTHOOK: query: load data local inpath '../../data/files/empty/000001_0' into table smb_bucket4_1
 POSTHOOK: type: LOAD
 #### A masked pattern was here ####
 POSTHOOK: Output: default@smb_bucket4_1
diff --git a/ql/src/test/results/clientpositive/llap/load_data_acid_rename.q.out b/ql/src/test/results/clientpositive/llap/load_data_acid_rename.q.out
new file mode 100644
index 0000000000..b915cdbd31
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/load_data_acid_rename.q.out
@@ -0,0 +1,29 @@
+PREHOOK: query: drop table if exists acid_rename
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists acid_rename
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table acid_rename (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc TBLPROPERTIES ("transactional"="true")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@acid_rename
+POSTHOOK: query: create table acid_rename (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc TBLPROPERTIES ("transactional"="true")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@acid_rename
+PREHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table acid_rename
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@acid_rename
+POSTHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table acid_rename
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@acid_rename
+#### A masked pattern was here ####
+PREHOOK: query: drop table acid_rename
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@acid_rename
+PREHOOK: Output: default@acid_rename
+POSTHOOK: query: drop table acid_rename
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@acid_rename
+POSTHOOK: Output: default@acid_rename
diff --git a/ql/src/test/results/clientpositive/smb_mapjoin_7.q.out b/ql/src/test/results/clientpositive/smb_mapjoin_7.q.out
index b71c5b87c1..83033b07c0 100644
--- a/ql/src/test/results/clientpositive/smb_mapjoin_7.q.out
+++ b/ql/src/test/results/clientpositive/smb_mapjoin_7.q.out
@@ -38,19 +38,19 @@ POSTHOOK: query: create table normal_join_results(k1 int, v1 string, k2 int, v2
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@normal_join_results
-PREHOOK: query: load data local inpath '../../data/files/empty1.txt' into table smb_bucket4_1
+PREHOOK: query: load data local inpath '../../data/files/empty/000000_0' into table smb_bucket4_1
 PREHOOK: type: LOAD
 #### A masked pattern was here ####
 PREHOOK: Output: default@smb_bucket4_1
-POSTHOOK: query: load data local inpath '../../data/files/empty1.txt' into table smb_bucket4_1
+POSTHOOK: query: load data local inpath '../../data/files/empty/000000_0' into table smb_bucket4_1
 POSTHOOK: type: LOAD
 #### A masked pattern was here ####
 POSTHOOK: Output: default@smb_bucket4_1
-PREHOOK: query: load data local inpath '../../data/files/empty2.txt' into table smb_bucket4_1
+PREHOOK: query: load data local inpath '../../data/files/empty/000001_0' into table smb_bucket4_1
 PREHOOK: type: LOAD
 #### A masked pattern was here ####
 PREHOOK: Output: default@smb_bucket4_1
-POSTHOOK: query: load data local inpath '../../data/files/empty2.txt' into table smb_bucket4_1
+POSTHOOK: query: load data local inpath '../../data/files/empty/000001_0' into table smb_bucket4_1
 POSTHOOK: type: LOAD
 #### A masked pattern was here ####
 POSTHOOK: Output: default@smb_bucket4_1
diff --git a/ql/src/test/results/clientpositive/spark/smb_mapjoin_7.q.out b/ql/src/test/results/clientpositive/spark/smb_mapjoin_7.q.out
index ac49c02913..610abab91b 100644
--- a/ql/src/test/results/clientpositive/spark/smb_mapjoin_7.q.out
+++ b/ql/src/test/results/clientpositive/spark/smb_mapjoin_7.q.out
@@ -38,19 +38,19 @@ POSTHOOK: query: create table normal_join_results(k1 int, v1 string, k2 int, v2
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@normal_join_results
-PREHOOK: query: load data local inpath '../../data/files/empty1.txt' into table smb_bucket4_1
+PREHOOK: query: load data local inpath '../../data/files/empty/000000_0' into table smb_bucket4_1
 PREHOOK: type: LOAD
 #### A masked pattern was here ####
 PREHOOK: Output: default@smb_bucket4_1
-POSTHOOK: query: load data local inpath '../../data/files/empty1.txt' into table smb_bucket4_1
+POSTHOOK: query: load data local inpath '../../data/files/empty/000000_0' into table smb_bucket4_1
 POSTHOOK: type: LOAD
 #### A masked pattern was here ####
 POSTHOOK: Output: default@smb_bucket4_1
-PREHOOK: query: load data local inpath '../../data/files/empty2.txt' into table smb_bucket4_1
+PREHOOK: query: load data local inpath '../../data/files/empty/000001_0' into table smb_bucket4_1
 PREHOOK: type: LOAD
 #### A masked pattern was here ####
 PREHOOK: Output: default@smb_bucket4_1
-POSTHOOK: query: load data local inpath '../../data/files/empty2.txt' into table smb_bucket4_1
+POSTHOOK: query: load data local inpath '../../data/files/empty/000001_0' into table smb_bucket4_1
 POSTHOOK: type: LOAD
 #### A masked pattern was here ####
 POSTHOOK: Output: default@smb_bucket4_1
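Reviewer note (not part of the patch): with this change, LOAD DATA into a full ACID table renames each incoming file to a bucket-style name built from a running task id, via Utilities.replaceTaskId("000000", taskId) + "_0". A minimal standalone sketch of the resulting names; the String.format mimic of replaceTaskId below is an assumption for illustration (it zero-pads the id to six digits, matching the "000000" template for small ids), not the Hive utility itself:

    // Hypothetical standalone mimic of the getPathName(int) helper added above.
    public class BucketNameSketch {
      static String getPathName(int taskId) {
        // Assumed equivalent of Utilities.replaceTaskId("000000", taskId) + "_0"
        return String.format("%06d", taskId) + "_0";
      }

      public static void main(String[] args) {
        // Loading three files into a full ACID table assigns taskIds 0, 1, 2:
        for (int taskId = 0; taskId < 3; taskId++) {
          System.out.println(getPathName(taskId)); // 000000_0, 000001_0, 000002_0
        }
      }
    }

This is why the new load_data_acid_rename.q test can assert on a */000000_0 path regardless of the source file's original name (orc_split_elim.orc), and why the pre-existing acid-file-name check in LoadSemanticAnalyzer could be removed.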
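On the finalTaskId copy in the threaded branch of copyFiles: an anonymous Callable (or lambda) may only capture locals that are final or effectively final, and taskId is mutated by taskId++ in the enclosing loop, so each submission snapshots the current value first. A small self-contained sketch of the same capture pattern:

    import java.util.concurrent.Callable;

    public class CaptureSketch {
      public static void main(String[] args) throws Exception {
        int taskId = 0;
        for (int i = 0; i < 3; i++) {
          // Callable<Integer> c = () -> taskId++;  // would not compile: taskId is mutated
          final int finalTaskId = taskId++;         // snapshot the current value instead
          Callable<Integer> c = () -> finalTaskId;
          System.out.println(c.call());             // prints 0, 1, 2
        }
      }
    }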