Index: common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java	(revision 8e7f23f34b2ce7328c9d571a13c336f0c8cdecb6)
+++ common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java	(date 1640770144000)
@@ -78,6 +78,45 @@
     return fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER);
   }
 
+  /**
+   * Finds the extra subdirectories that a UNION ALL query creates one level below
+   * the dynamic-partition depth under {@code path}.
+   *
+   * @param path root temporary output path to search under
+   * @param level number of dynamic partition columns; the glob descends level + 1 levels
+   * @param fs file system {@code path} belongs to
+   * @return the matching directories, or null when {@code level} is negative or no
+   *         directory matched (mirrors getFileStatusRecurse's null contract)
+   * @throws IOException if globbing the file system fails
+   */
+  public static FileStatus[] getUnionAllFileStatusRecurse(Path path, int level, FileSystem fs)
+      throws IOException {
+    if (level < 0) {
+      return null;
+    }
+
+    // Construct a path pattern (e.g., /*/*) with level + 1 wildcards: one per dynamic
+    // partition column plus one extra for the union subdirectory.
+    StringBuilder sb = new StringBuilder(path.toUri().getPath());
+    for (int i = 0; i <= level; i++) {
+      sb.append(Path.SEPARATOR).append("*");
+    }
+    Path pathPattern = new Path(path, sb.toString());
+    FileStatus[] statuses = fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER);
+    if (statuses == null) {
+      return null;
+    }
+
+    // Only directories at this depth are union subdirectories; ignore plain files.
+    // NOTE: the original raw "List" here would not even compile cleanly -- toArray on a
+    // raw List returns Object[] (JLS 4.8) -- so the element type must be explicit.
+    List<FileStatus> results = new ArrayList<>();
+    for (FileStatus status : statuses) {
+      if (status.isDirectory()) {
+        results.add(status);
+      }
+    }
+    return results.isEmpty() ? null : results.toArray(new FileStatus[0]);
+  }
+
 public static int getNumBitVectorsForNDVEstimation(Configuration conf) throws Exception {
   int numBitVectors;
   float percentageError = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_NDV_ERROR);
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java	(revision 8e7f23f34b2ce7328c9d571a13c336f0c8cdecb6)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java	(date 1640770143000)
@@ -1409,6 +1409,22 @@
     FileStatus[] statuses = HiveStatsUtils.getFileStatusRecurse(
         tmpPath, ((dpCtx == null) ? 1 : dpCtx.getNumDPCols()), fs);
     if(statuses != null && statuses.length > 0) {
+      // A UNION ALL query writes its output one directory level deeper than the
+      // dynamic-partition glob matched; when the tree holds more directories than
+      // the glob found, pick up those union subdirectories as well.
+      // NOTE(review): getContentSummary() walks the whole output tree and may be
+      // costly on large outputs -- confirm this is acceptable on this path.
+      ContentSummary summary = fs.getContentSummary(tmpPath);
+      if (summary.getDirectoryCount() > statuses.length) {
+        FileStatus[] unionAllStatuses = HiveStatsUtils.getUnionAllFileStatusRecurse(
+            tmpPath, ((dpCtx == null) ? 1 : dpCtx.getNumDPCols()), fs);
+        if (unionAllStatuses != null && unionAllStatuses.length > 0) {
+          FileStatus[] merged = new FileStatus[statuses.length + unionAllStatuses.length];
+          System.arraycopy(statuses, 0, merged, 0, statuses.length);
+          System.arraycopy(unionAllStatuses, 0, merged, statuses.length, unionAllStatuses.length);
+          statuses = merged;
+        }
+      }
+
       PerfLogger perfLogger = SessionState.getPerfLogger();
       perfLogger.PerfLogBegin("FileSinkOperator", "RemoveTempOrDuplicateFiles");
       // remove any tmp file or double-committed output files