diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index a00d9075a4..d696e5778d 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2908,6 +2908,12 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal HIVE_COMPACTOR_WAIT_TIMEOUT("hive.compactor.wait.timeout", 300000L, "Time out in " + "milliseconds for blocking compaction. It's value has to be higher than 2000 milliseconds. "), + + HIVE_MR_COMPACTOR_GATHER_STATS("hive.mr.compactor.gather.stats", true, "If set to true MAJOR compaction " + + "will gather stats if there are stats already associated with the table/partition.\n" + + "Turn this off to save some resources and the stats are not used anyway.\n" + + "Works only for MR based compaction, CRUD based compaction uses hive.stats.autogather."), + /** * @deprecated Use MetastoreConf.COMPACTOR_INITIATOR_FAILED_THRESHOLD */ diff --git itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java index c687f146eb..9564208ddc 100644 --- itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java +++ itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java @@ -129,6 +129,7 @@ public void setup() throws Exception { hiveConf.setVar(HiveConf.ConfVars.POSTEXECHOOKS, ""); hiveConf.setVar(HiveConf.ConfVars.METASTOREWAREHOUSE, TEST_WAREHOUSE_DIR); hiveConf.setVar(HiveConf.ConfVars.HIVEINPUTFORMAT, HiveInputFormat.class.getName()); + hiveConf.setBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER, false); TxnDbUtil.setConfValues(hiveConf); TxnDbUtil.cleanDb(hiveConf); @@ -1468,6 +1469,57 @@ private void minorCompactWhileStreamingWithSplitUpdate(boolean newStreamingAPI) } } + @Test + public void testCompactorGatherStats() throws Exception { + String dbName = "default"; + String tableName = "stats_comp_test"; + List colNames = Arrays.asList("a"); + executeStatementOnDriver("drop table if exists " + dbName + "." + tableName, driver); + executeStatementOnDriver("create table " + dbName + "." + tableName + + " (a INT) STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver); + executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(1)", driver); + executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(1)", driver); + + TxnStore txnHandler = TxnUtils.getTxnStore(conf); + txnHandler.compact(new CompactionRequest(dbName, tableName, CompactionType.MAJOR)); + runWorker(conf); + + // Make sure we do not have statistics for this table yet + // Compaction generates stats only if there is any + List colStats = msClient.getTableColumnStatistics(dbName, + tableName, colNames, Constants.HIVE_ENGINE); + assertEquals("No stats should be there for the table", 0, colStats.size()); + + executeStatementOnDriver("analyze table " + dbName + "." + tableName + " compute statistics for columns", driver); + executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(2)", driver); + + // Make sure we have old statistics for the table + colStats = msClient.getTableColumnStatistics(dbName, tableName, colNames, Constants.HIVE_ENGINE); + assertEquals("Stats should be there", 1, colStats.size()); + assertEquals("Value should contain old data", 1, colStats.get(0).getStatsData().getLongStats().getHighValue()); + assertEquals("Value should contain old data", 1, colStats.get(0).getStatsData().getLongStats().getLowValue()); + + txnHandler.compact(new CompactionRequest(dbName, tableName, CompactionType.MAJOR)); + runWorker(conf); + // Make sure the statistics is updated for the table + colStats = msClient.getTableColumnStatistics(dbName, tableName, colNames, Constants.HIVE_ENGINE); + assertEquals("Stats should be there", 1, colStats.size()); + assertEquals("Value should contain new data", 2, colStats.get(0).getStatsData().getLongStats().getHighValue()); + assertEquals("Value should contain new data", 1, colStats.get(0).getStatsData().getLongStats().getLowValue()); + + executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(3)", driver); + HiveConf workerConf = new HiveConf(conf); + workerConf.setBoolVar(ConfVars.HIVE_MR_COMPACTOR_GATHER_STATS, false); + + txnHandler.compact(new CompactionRequest(dbName, tableName, CompactionType.MAJOR)); + runWorker(workerConf); + // Make sure the statistics is NOT updated for the table + colStats = msClient.getTableColumnStatistics(dbName, tableName, colNames, Constants.HIVE_ENGINE); + assertEquals("Stats should be there", 1, colStats.size()); + assertEquals("Value should contain new data", 2, colStats.get(0).getStatsData().getLongStats().getHighValue()); + assertEquals("Value should contain new data", 1, colStats.get(0).getStatsData().getLongStats().getLowValue()); + } + /** * Users have the choice of specifying compaction related tblproperties either in CREATE TABLE * statement or in ALTER TABLE .. COMPACT statement. This tests both cases. diff --git ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java index 018c73376f..04251423a8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java +++ ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java @@ -211,6 +211,7 @@ private void overrideMRProps(JobConf job, Map properties) { * @param sd metastore storage descriptor * @param writeIds list of valid write ids * @param ci CompactionInfo + * @param su StatsUpdater which is null if no stats gathering is needed * @throws java.io.IOException if the job fails */ void run(HiveConf conf, String jobName, Table t, Partition p, StorageDescriptor sd, ValidWriteIdList writeIds, @@ -294,7 +295,9 @@ void run(HiveConf conf, String jobName, Table t, Partition p, StorageDescriptor launchCompactionJob(job, baseDir, ci.type, dirsToSearch, dir.getCurrentDirectories(), dir.getCurrentDirectories().size(), dir.getObsolete().size(), conf, msc, ci.id, jobName); - su.gatherStats(); + if (su != null) { + su.gatherStats(); + } } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Worker.java ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Worker.java index a96cf1e731..cc7b12d89c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Worker.java +++ ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Worker.java @@ -91,6 +91,7 @@ public static String hostname() { @Override public void run() { LOG.info("Starting Worker thread"); + boolean computeStats = conf.getBoolVar(HiveConf.ConfVars.HIVE_MR_COMPACTOR_GATHER_STATS); do { boolean launchedJob = false; // Make sure nothing escapes this run method and kills the metastore at large, @@ -201,10 +202,11 @@ public void run() { continue; } - LOG.info("Starting " + ci.type.toString() + " compaction for " + ci.getFullPartitionName() + " in " + JavaUtils.txnIdToString(compactorTxnId)); - final StatsUpdater su = StatsUpdater.init(ci, msc.findColumnsWithStats( + LOG.info("Starting " + ci.type.toString() + " compaction for " + ci.getFullPartitionName() + " in " + + JavaUtils.txnIdToString(compactorTxnId) + " with compute stats set to " + computeStats); + final StatsUpdater su = computeStats ? StatsUpdater.init(ci, msc.findColumnsWithStats( CompactionInfo.compactionInfoToStruct(ci)), conf, - runJobAsSelf(ci.runAs) ? ci.runAs : t.getOwner()); + runJobAsSelf(ci.runAs) ? ci.runAs : t.getOwner()) : null; final CompactorMR mr = new CompactorMR(); launchedJob = true; try {