diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 1dc7501291..275a4ed0a8 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -5014,4 +5014,22 @@ public static String generateMrDeprecationWarning() { return reverseMap; } } + @Override + public void set(String name, String value) { + super.set(name, value); + if(ConfVars.HIVE_SUPPORT_CONCURRENCY.varname.equals(name) && !Boolean.parseBoolean(value)) { + /*data/conf/hive-site.xml defaults concurrency to true + * many JUnit tests explicitly set it to false + * when testing Acid, hive-site.xml must be set concurrency=true and txn.manager=DbTxnMana... + * This ends up with any tests that explicitly set concur=false running with + * DbTxnManager and concur=false which is an invalid config. This is a lame hack but I don't + * see a better option. Removing explicit setting of concurrency in tests to make it use + * the default ends up using the default ZK based LM which wedges a lot of tests and I can't + * yet enable DbTxnManager by default... 
+ * + * Another option is to make DbTxnManager do nothing if concurrency=false so that setHiveConf() + * doesn't throw but this may lead to bad misconfigs in the field*/ + setVar(ConfVars.HIVE_TXN_MANAGER, "org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager"); + } + } } diff --git data/conf/hive-site.xml data/conf/hive-site.xml index 9f6aec178d..29a4026dbd 100644 --- data/conf/hive-site.xml +++ data/conf/hive-site.xml @@ -69,6 +69,19 @@ + metastore.create.as.acid + true + + + hive.txn.manager + org.apache.hadoop.hive.ql.lockmgr.DbTxnManager + + + hive.exec.dynamic.partition.mode + nonstrict + + + javax.jdo.option.ConnectionURL jdbc:derby:memory:${test.tmp.dir}/junit_metastore_db;create=true diff --git data/conf/llap/hive-site.xml data/conf/llap/hive-site.xml index cdda875ddb..e64fc9146a 100644 --- data/conf/llap/hive-site.xml +++ data/conf/llap/hive-site.xml @@ -166,10 +166,22 @@ org.apache.hadoop.hive.ql.hooks.PostExecutePrinter Post Execute Hook for Tests + + metastore.create.as.acid + true + + + hive.txn.manager + org.apache.hadoop.hive.ql.lockmgr.DbTxnManager + + + hive.exec.dynamic.partition.mode + nonstrict + hive.support.concurrency - false + true Whether hive supports concurrency or not. A zookeeper instance must be up and running for the default hive lock manager to support read-write locks. 
diff --git itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java index f79dbac573..9b102f2a36 100644 --- itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java +++ itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java @@ -96,6 +96,7 @@ import org.apache.hadoop.hive.common.io.SortPrintStream; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; import org.apache.hive.druid.MiniDruidCluster; import org.apache.hadoop.hive.llap.LlapItUtils; import org.apache.hadoop.hive.llap.daemon.MiniLlapCluster; @@ -555,6 +556,7 @@ public QTestUtil(String outDir, String logDir, MiniClusterType clusterType, // HIVE-14443 move this fall-back logic to CliConfigs if (confDir != null && !confDir.isEmpty()) { HiveConf.setHiveSiteLocation(new URL("file://"+ new File(confDir).toURI().getPath() + "/hive-site.xml")); + MetastoreConf.setHiveSiteLocation(HiveConf.getHiveSiteLocation()); System.out.println("Setting hive-site: "+HiveConf.getHiveSiteLocation()); } diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java index 20d7593d88..7e54fe3feb 100644 --- ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java @@ -3376,16 +3376,30 @@ private static Path mvFile(HiveConf conf, FileSystem sourceFs, Path sourcePath, final String fullname = sourcePath.getName(); final String name = FilenameUtils.getBaseName(sourcePath.getName()); final String type = FilenameUtils.getExtension(sourcePath.getName()); + final boolean isAcidTarget = destDirPath.getName().startsWith(AcidUtils.BASE_PREFIX) || destDirPath.getName().startsWith(AcidUtils.DELTA_PREFIX); + Path destFilePath; + if(isAcidTarget && !AcidUtils.originalBucketFilter.accept(sourcePath)) { + //if here we are doing 
a load Data into acid table - todo: make this more explicit + //Acid tables can only deal with files matching AcidUtils.originalBucketFilter. + //so here we rename the input file and further logic will add a copy_N suffix in case of + //collisions. (This works since Load Data doesn't support bucketed tables for now) + destFilePath = new Path(destDirPath, "000000_0"); + } + else { + destFilePath = new Path(destDirPath, fullname); + } - Path destFilePath = new Path(destDirPath, fullname); - - /* + /* * The below loop may perform bad when the destination file already exists and it has too many _copy_ * files as well. A desired approach was to call listFiles() and get a complete list of files from * the destination, and check whether the file exists or not on that list. However, millions of files * could live on the destination directory, and on concurrent situations, this can cause OOM problems. * * I'll leave the below loop for now until a better approach is found. + * + * This is problematic: caller of mvFile() may use a thread pool to move files in parallel in + * which case there is a race condition between exists() and rename() from different threads. + * I suppose in case of collisions the FileSystem will throw and the command will fail. 
*/ for (int counter = 1; destFs.exists(destFilePath); counter++) { if (isOverwrite) { diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java index 4535c3edc2..ea63016ad4 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java @@ -160,7 +160,7 @@ private URI initializeFromURI(String fromPath, boolean isLocal) throws IOExcepti "source contains directory: " + oneSrc.getPath().toString())); } if(AcidUtils.isAcidTable(table)) { - if(!AcidUtils.originalBucketFilter.accept(oneSrc.getPath())) { + if(false && !AcidUtils.originalBucketFilter.accept(oneSrc.getPath())) { //acid files (e.g. bucket_0000) have ROW_ID embedded in them and so can't be simply //copied to a table so only allow non-acid files for now throw new SemanticException(ErrorMsg.ACID_LOAD_DATA_INVALID_FILE_NAME, diff --git ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java index c4911bbe7c..25839e1cd5 100644 --- ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java +++ ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java @@ -18,6 +18,9 @@ package org.apache.hadoop.hive.ql; import org.apache.commons.io.FileUtils; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse; import org.junit.Assert; @@ -460,4 +463,64 @@ private void checkResult(String[][] expectedResult, String query, boolean isVect checkExpected(rs, expectedResult, msg + (isVectorized ? 
" vect" : ""), LOG, !isVectorized); assertVectorized(isVectorized, query); } + @Test + public void testAnyFileName() throws Exception { + boolean isVectorized = false; + runStatementOnDriver("drop table if exists T"); + runStatementOnDriver("create table T ( ctinyint TINYINT,\n" + + " csmallint SMALLINT,\n" + + " cint INT,\n" + + " cbigint BIGINT,\n" + + " cfloat FLOAT,\n" + + " cdouble DOUBLE,\n" + + " cstring1 STRING,\n" + + " cstring2 STRING,\n" + + " ctimestamp1 TIMESTAMP,\n" + + " ctimestamp2 TIMESTAMP,\n" + + " cboolean1 BOOLEAN,\n" + + " cboolean2 BOOLEAN) stored as orc tblproperties('transactional'='true')"); + //ql/target/tmp/org.apache.hadoop.hive.ql.TestTxnNoBuckets-1512791382683/warehouse + runStatementOnDriver("load data local inpath '" + getWarehouseDir() + "/../../../../../data/files/alltypesorc' into table T"); + List rs = runStatementOnDriver("select count(*) from T"); + Assert.assertEquals("12288", rs.get(0)); + runStatementOnDriver("load data local inpath '" + getWarehouseDir() + "/../../../../../data/files/alltypesorc' into table T"); + rs = runStatementOnDriver("select count(*) from T"); + Assert.assertEquals("24576", rs.get(0)); + String[][] expected = new String[][] { + {"{\"transactionid\":15,\"bucketid\":536870912,\"rowid\":0}", "t/delta_0000015_0000015_0000/000000_0"}, + {"{\"transactionid\":17,\"bucketid\":536870912,\"rowid\":0}", "t/delta_0000017_0000017_0000/000000_0"} + }; + checkResult(expected, "select ROW__ID, INPUT__FILE__NAME from T where ROW__ID.rowid=0 limit 2", isVectorized, "load data inpath"); + } + @Test + public void testAnyFileName2() throws Exception { + boolean isVectorized = false; + runStatementOnDriver("drop table if exists T"); + runStatementOnDriver("drop table if exists Tstage"); + runStatementOnDriver("create table Tstage (a int, b int) clustered by (a) into 2 buckets stored as orc"); + runStatementOnDriver("insert into Tstage values(1,2),(2,2)"); + runStatementOnDriver("export table Tstage to '" + 
getWarehouseDir() + "/1'"); + runStatementOnDriver("create table T (a int, b int) stored as orc tblproperties('transactional'='true')"); + FileSystem fs = FileSystem.get(hiveConf); + FileStatus[] status; + status = fs.listStatus(new Path(getWarehouseDir() + "/1/data"), + org.apache.hadoop.hive.common.FileUtils.STAGING_DIR_PATH_FILTER); + fs.rename(status[0].getPath(), new Path(status[0].getPath().getParent(), "a_0")); + fs.rename(status[1].getPath(), new Path(status[1].getPath().getParent(), "b_0")); + + runStatementOnDriver("load data local inpath '" + getWarehouseDir() + "/1/data' into table T");//only get 1 file in T + + //ql/target/tmp/org.apache.hadoop.hive.ql.TestTxnNoBuckets-1512791382683/warehouse + runStatementOnDriver("load data local inpath '" + getWarehouseDir() + "/../../../../../data/files/alltypesorc' into table T"); + List rs = runStatementOnDriver("select count(*) from T"); + Assert.assertEquals("12288", rs.get(0)); + runStatementOnDriver("load data local inpath '" + getWarehouseDir() + "/../../../../../data/files/alltypesorc' into table T"); + rs = runStatementOnDriver("select count(*) from T"); + Assert.assertEquals("24576", rs.get(0)); + String[][] expected = new String[][] { + {"{\"transactionid\":15,\"bucketid\":536870912,\"rowid\":0}", "t/delta_0000015_0000015_0000/000000_0"}, + {"{\"transactionid\":17,\"bucketid\":536870912,\"rowid\":0}", "t/delta_0000017_0000017_0000/000000_0"} + }; + checkResult(expected, "select ROW__ID, INPUT__FILE__NAME from T where ROW__ID.rowid=0 limit 2", isVectorized, "load data inpath"); + } } diff --git standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java index 2e43dc85ea..00f3c8d84e 100644 --- standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java +++ standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java @@ 
-1032,6 +1032,10 @@ private MetastoreConf() { throw new RuntimeException("You should never be creating one of these!"); } + public static void setHiveSiteLocation(URL location) { + hiveSiteURL = location; + } + public static Configuration newMetastoreConf() { Configuration conf = new Configuration(); @@ -1047,7 +1051,15 @@ public static Configuration newMetastoreConf() { // Add in hive-site.xml. We add this first so that it gets overridden by the new metastore // specific files if they exist. - hiveSiteURL = findConfigFile(classLoader, "hive-site.xml"); + if(hiveSiteURL == null) { + /* + * this 'if' is pretty lame - QTestUtil.QTestUtil() uses hiveSiteURL to load a specific + * hive-site.xml from data/conf/ so this makes it follow the same logic - otherwise + * HiveConf and MetastoreConf may load different hive-site.xml ( For example, + * HiveConf uses data/conf/spark/hive-site.xml and MetastoreConf data/conf/hive-site.xml) + */ + hiveSiteURL = findConfigFile(classLoader, "hive-site.xml"); + } if (hiveSiteURL != null) conf.addResource(hiveSiteURL); // Now add hivemetastore-site.xml. Again we add this before our own config files so that the