diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
index 5dcd49ba27..0638caf2e9 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
@@ -215,7 +215,7 @@ private int convertJoinBucketMapJoin(JoinOperator joinOp, MapJoinOperator mapJoi
         LOG.debug("Found a big table branch with parent operator {} and position {}", parentOp, pos);
         bigTablePosition = pos;
         bigTableFound = true;
-        bigInputStat = new Statistics(0, Long.MAX_VALUE, 0);
+        bigInputStat = new Statistics(0, Long.MAX_VALUE, Long.MAX_VALUE, 0);
       } else {
         // Either we've found multiple big table branches, or the current branch cannot
         // be a big table branch. Disable mapjoin for these cases.
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 0ada066d8e..43fc449954 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -2116,7 +2116,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
         }
       }
 
-      Statistics wcStats = new Statistics(newNumRows, newDataSize, 0);
+      Statistics wcStats = new Statistics(newNumRows, newDataSize, 0, 0);
       wcStats.setBasicStatsState(statsState);
 
       // evaluate filter expression and update statistics
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
index bc5f9d943e..618c6a21af 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
@@ -53,6 +53,7 @@ public State merge(State otherState) {
   private long numRows;
   private long runTimeNumRows;
   private long dataSize;
+  private long totalFileSize;
   private long numErasureCodedFiles;
   private State basicStatsState;
   private Map<String, ColStatistics> columnStats;
@@ -60,12 +61,13 @@ public State merge(State otherState) {
   private boolean runtimeStats;
 
   public Statistics() {
-    this(0, 0, 0);
+    this(0, 0, 0, 0);
   }
 
-  public Statistics(long nr, long ds, long numEcFiles) {
+  public Statistics(long nr, long ds, long fs, long numEcFiles) {
     numRows = nr;
     dataSize = ds;
+    totalFileSize = fs;
     numErasureCodedFiles = numEcFiles;
     runTimeNumRows = -1;
     columnStats = null;
@@ -74,6 +76,14 @@ public Statistics(long nr, long ds, long numEcFiles) {
     updateBasicStatsState();
   }
 
+  public void setTotalFileSize(final long totalFileSize) {
+    this.totalFileSize = totalFileSize;
+  }
+
+  public long getTotalFileSize() {
+    return totalFileSize;
+  }
+
   public long getNumRows() {
     return numRows;
   }
@@ -191,7 +201,7 @@ public String extendedToString() {
 
   @Override
   public Statistics clone() {
-    Statistics clone = new Statistics(numRows, dataSize, numErasureCodedFiles);
+    Statistics clone = new Statistics(numRows, dataSize, totalFileSize, numErasureCodedFiles);
     clone.setRunTimeNumRows(runTimeNumRows);
     clone.setBasicStatsState(basicStatsState);
     clone.setColumnStatsState(columnStatsState);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java
index e38a0675b8..9a97746ae6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java
@@ -195,7 +195,7 @@ public DataSizeEstimator(HiveConf conf) {
     public void apply(BasicStats stats) {
       long ds = stats.getRawDataSize();
       if (ds <= 0) {
-        ds = stats.getTotalSize();
+        ds = stats.getTotalFileSize();
 
         // if data size is still 0 then get file size
         if (ds <= 0) {
@@ -228,6 +228,7 @@ private long getFileSizeForPath(Path path) throws IOException {
 
   private long currentNumRows;
   private long currentDataSize;
+  private long currentFileSize;
   private Statistics.State state;
 
   public BasicStats(Partish p) {
@@ -239,6 +240,7 @@
 
     currentNumRows = rowCount;
     currentDataSize = rawDataSize;
+    currentFileSize = totalSize;
 
     if (currentNumRows > 0) {
       state = State.COMPLETE;
@@ -252,10 +254,12 @@
   public BasicStats(List<BasicStats> partStats) {
     partish = null;
     List<Long> nrIn = Lists.newArrayList();
     List<Long> dsIn = Lists.newArrayList();
+    List<Long> fsIn = Lists.newArrayList();
     state = (partStats.size() == 0) ? State.COMPLETE : null;
     for (BasicStats ps : partStats) {
       nrIn.add(ps.getNumRows());
       dsIn.add(ps.getDataSize());
+      fsIn.add(ps.getTotalFileSize());
       if (state == null) {
         state = ps.getState();
@@ -265,6 +269,7 @@
     }
 
     currentNumRows = StatsUtils.getSumIgnoreNegatives(nrIn);
     currentDataSize = StatsUtils.getSumIgnoreNegatives(dsIn);
+    currentFileSize = StatsUtils.getSumIgnoreNegatives(fsIn);
 
   }
@@ -292,8 +297,12 @@
   protected void setDataSize(long ds) {
     currentDataSize = ds;
   }
 
-  protected long getTotalSize() {
-    return totalSize;
+  protected long getTotalFileSize() {
+    return currentFileSize;
+  }
+
+  public void setTotalFileSize(final long totalFileSize) {
+    this.currentFileSize = totalFileSize;
   }
   protected long getRawDataSize() {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index bd4a4f6d2c..45bd5e0633 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -276,6 +276,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p
 //    long nr = getNumRows(conf, schema, neededColumns, table, ds);
     long ds = basicStats.getDataSize();
     long nr = basicStats.getNumRows();
+    long fs = basicStats.getTotalFileSize();
     List<ColStatistics> colStats = Collections.emptyList();
 
     long numErasureCodedFiles = getErasureCodedFiles(table);
@@ -292,7 +293,7 @@
       }
     }
 
-    stats = new Statistics(nr, ds, numErasureCodedFiles);
+    stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
 
     // infer if any column can be primary key based on column statistics
     inferAndSetPrimaryKey(stats.getNumRows(), colStats);
@@ -321,6 +322,7 @@
 
       long nr = bbs.getNumRows();
       long ds = bbs.getDataSize();
+      long fs = bbs.getTotalFileSize();
 
       List<Long> erasureCodedFiles = getBasicStatForPartitions(table, partList.getNotDeniedPartns(),
           StatsSetupConst.NUM_ERASURE_CODED_FILES);
@@ -329,7 +331,7 @@
       if (nr == 0) {
        nr = 1;
       }
-      stats = new Statistics(nr, ds, numErasureCodedFiles);
+      stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
       stats.setBasicStatsState(bbs.getState());
       if (nr > 0) {
         // FIXME: this promotion process should be removed later
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestVectorMapJoinFastHashTable.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestVectorMapJoinFastHashTable.java
index c2a1823897..cb8ac38581 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestVectorMapJoinFastHashTable.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestVectorMapJoinFastHashTable.java
@@ -96,7 +96,7 @@ private void runEstimationCheck(HashTableKeyType l) throws SerDeException, IOExc
       dataSize += 8;
     }
 
-    Statistics stat = new Statistics(keyCount, dataSize, 0);
+    Statistics stat = new Statistics(keyCount, dataSize, 0, 0);
 
     Long realObjectSize = getObjectSize(container);
     Long executionEstimate = container.getEstimatedMemorySize();
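
Reviewer note, not part of the patch: the behavioral core of this change is that Statistics now carries the physical totalFileSize alongside the logical dataSize, and BasicStats.DataSizeEstimator falls back to that file size when the raw data size is unknown. Previously getTotalSize() read the single-Partish totalSize field, which the partition-union constructor never aggregated. The standalone sketch below illustrates only that fallback order; the Stats class, estimateDataSize(), and deserFactor are hypothetical stand-ins for illustration, not Hive APIs.

// Standalone sketch of the fallback this patch wires up in
// BasicStats.DataSizeEstimator: rawDataSize -> totalFileSize -> nothing.
// All names here are illustrative stand-ins; only the fallback order
// mirrors the diff.
public class DataSizeFallbackSketch {

  static final class Stats {
    final long rawDataSize;   // logical size from basic stats; <= 0 when unknown
    final long totalFileSize; // physical bytes on disk; the field the patch threads through

    Stats(long rawDataSize, long totalFileSize) {
      this.rawDataSize = rawDataSize;
      this.totalFileSize = totalFileSize;
    }
  }

  // Prefer the logical raw data size; if missing, fall back to the on-disk
  // file size scaled by an assumed deserialization factor (compressed bytes
  // grow when deserialized). Returns 0 when neither source is available.
  static long estimateDataSize(Stats stats, double deserFactor) {
    long ds = stats.rawDataSize;
    if (ds <= 0) {
      ds = stats.totalFileSize;
      if (ds <= 0) {
        return 0; // no basic stats at all; a caller would degrade the stats state
      }
      ds = (long) (ds * deserFactor);
    }
    return ds;
  }

  public static void main(String[] args) {
    System.out.println(estimateDataSize(new Stats(4096, 1024), 2.0)); // 4096: raw size wins
    System.out.println(estimateDataSize(new Stats(0, 1024), 2.0));    // 2048: file size * factor
    System.out.println(estimateDataSize(new Stats(0, 0), 2.0));       // 0: nothing known
  }
}

Because getTotalFileSize() now returns the aggregated currentFileSize, the same fallback also works for partitioned tables: BasicStats(List<BasicStats>) sums per-partition file sizes via StatsUtils.getSumIgnoreNegatives(), and StatsUtils.collectStatistics() threads the result into the new Statistics constructor argument.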