diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java index 50bdce89a4..b2c671cdaa 100644 --- ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java @@ -3376,16 +3376,30 @@ private static Path mvFile(HiveConf conf, FileSystem sourceFs, Path sourcePath, final String fullname = sourcePath.getName(); final String name = FilenameUtils.getBaseName(sourcePath.getName()); final String type = FilenameUtils.getExtension(sourcePath.getName()); + final boolean isAcidTarget = destDirPath.getName().startsWith(AcidUtils.BASE_PREFIX) || destDirPath.getName().startsWith(AcidUtils.DELTA_PREFIX); + Path destFilePath; + if(isAcidTarget && !AcidUtils.originalBucketFilter.accept(sourcePath)) { + //if here we are doing a load Data into acid table - todo: make this more explicit + //Acid tables can only deal with files matching AcidUtils.originalBucketFilter. + //so here we rename the input file and further logic will add a copy_N suffix in case of + //collisions. (This works since Load Data doesn't support bucketed tables for now) + destFilePath = new Path(destDirPath, "000000_0"); + } + else { + destFilePath = new Path(destDirPath, fullname); + } - Path destFilePath = new Path(destDirPath, fullname); - - /* + /* * The below loop may perform bad when the destination file already exists and it has too many _copy_ * files as well. A desired approach was to call listFiles() and get a complete list of files from * the destination, and check whether the file exists or not on that list. However, millions of files * could live on the destination directory, and on concurrent situations, this can cause OOM problems. * * I'll leave the below loop for now until a better approach is found. 
+ * + * This is problematic: caller of mvFile() may use a thread pool to move files in parallel in + * which case there is a race condition between exists() and rename() from different threads. + * I suppose in case of collisions the FileSystem will throw and the command will fail. */ for (int counter = 1; destFs.exists(destFilePath); counter++) { if (isOverwrite) { diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java index cc956da575..8b5c5e4540 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java @@ -160,7 +160,7 @@ private URI initializeFromURI(String fromPath, boolean isLocal) throws IOExcepti "source contains directory: " + oneSrc.getPath().toString())); } if(AcidUtils.isFullAcidTable(table)) { - if(!AcidUtils.originalBucketFilter.accept(oneSrc.getPath())) { + if(false && !AcidUtils.originalBucketFilter.accept(oneSrc.getPath())) { //acid files (e.g. 
bucket_0000) have ROW_ID embedded in them and so can't be simply //copied to a table so only allow non-acid files for now throw new SemanticException(ErrorMsg.ACID_LOAD_DATA_INVALID_FILE_NAME, diff --git ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java index b98c74a889..89c6173a78 100644 --- ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java +++ ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java @@ -18,25 +18,16 @@ package org.apache.hadoop.hive.ql; import org.apache.commons.io.FileUtils; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.api.StorageDescriptor; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse; -import org.apache.hadoop.io.NullWritable; import org.junit.Assert; -import org.junit.Before; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.junit.rules.TestName; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; -import java.io.IOException; import java.util.List; /** @@ -464,4 +455,26 @@ private void checkResult(String[][] expectedResult, String query, boolean isVect checkExpected(rs, expectedResult, msg + (isVectorized ? 
" vect" : ""), LOG, !isVectorized); assertVectorized(isVectorized, query); } + @Test + public void testAnyFileName() throws Exception { + boolean isVectorized = false; + runStatementOnDriver("drop table if exists T"); + runStatementOnDriver("drop table if exists Tstage"); + runStatementOnDriver("create table T ( ctinyint TINYINT,\n" + + " csmallint SMALLINT,\n" + + " cint INT,\n" + + " cbigint BIGINT,\n" + + " cfloat FLOAT,\n" + + " cdouble DOUBLE,\n" + + " cstring1 STRING,\n" + + " cstring2 STRING,\n" + + " ctimestamp1 TIMESTAMP,\n" + + " ctimestamp2 TIMESTAMP,\n" + + " cboolean1 BOOLEAN,\n" + + " cboolean2 BOOLEAN) stored as orc tblproperties('transactional'='true')"); + //ql/target/tmp/org.apache.hadoop.hive.ql.TestTxnNoBuckets-1512791382683/warehouse + runStatementOnDriver("load data local inpath '" + getWarehouseDir() + "/../../../../../data/files/alltypesorc' into table T"); + List rs = runStatementOnDriver("select count(*) from T"); + Assert.assertEquals("12288", rs.get(0)); + } }