From 24fe061147646f6859cea51bacebed32dfcdcf62 Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Wed, 10 Feb 2016 15:51:04 -0800 Subject: [PATCH] HIVE-13040 : Handle empty bucket creations more efficiently --- .../java/org/apache/hadoop/hive/ql/exec/Utilities.java | 4 ++-- ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java | 5 +++-- .../apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java | 17 +++++++---------- .../clientpositive/dynpart_sort_opt_vectorization.q.out | 4 ++-- .../test/results/clientpositive/union_fast_stats.q.out | 16 ++++++++-------- .../org/apache/hadoop/hive/shims/Hadoop23Shims.java | 2 +- 6 files changed, 23 insertions(+), 25 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 7a62ff9..ab0635e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -1480,7 +1480,7 @@ public static void removeTempOrDuplicateFiles(FileSystem fs, Path path) throws I taskIDToFile = removeTempOrDuplicateFiles(items, fs); // if the table is bucketed and enforce bucketing, we should check and generate all buckets - if (dpCtx.getNumBuckets() > 0 && taskIDToFile != null) { + if (dpCtx.getNumBuckets() > 0 && taskIDToFile != null && !"tez".equalsIgnoreCase(hconf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname))) { // refresh the file list items = fs.listStatus(parts[i].getPath()); // get the missing buckets and generate empty buckets @@ -1500,7 +1500,7 @@ public static void removeTempOrDuplicateFiles(FileSystem fs, Path path) throws I FileStatus[] items = fs.listStatus(path); taskIDToFile = removeTempOrDuplicateFiles(items, fs); if(taskIDToFile != null && taskIDToFile.size() > 0 && conf != null && conf.getTable() != null - && (conf.getTable().getNumBuckets() > taskIDToFile.size())) { + && (conf.getTable().getNumBuckets() > taskIDToFile.size()) && !"tez".equalsIgnoreCase(hconf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname))) { // get the missing buckets and generate empty buckets for non-dynamic partition String taskID1 = taskIDToFile.keySet().iterator().next(); Path bucketPath = taskIDToFile.values().iterator().next().getPath(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java index 520ae74..b3299db 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java @@ -106,6 +106,7 @@ private AcidUtils() { Pattern.compile("[0-9]+_[0-9]+"); public static final PathFilter hiddenFileFilter = new PathFilter(){ + @Override public boolean accept(Path p){ String name = p.getName(); return !name.startsWith("_") && !name.startsWith("."); @@ -605,7 +606,7 @@ private static void getChildState(FileStatus child, HdfsFileStatusWithId childWi // it is possible that the cleaner is running and removing these original files, // in which case recursing through them could cause us to get an error. originalDirectories.add(child); - } else { + } else if (child.getLen() != 0){ original.add(createOriginalObj(childWithId, child)); } } @@ -616,7 +617,7 @@ public static HdfsFileStatusWithId createOriginalObj( } private static class HdfsFileStatusWithoutId implements HdfsFileStatusWithId { - private FileStatus fs; + private final FileStatus fs; public HdfsFileStatusWithoutId(FileStatus fs) { this.fs = fs; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java index 3fb6a86..b0f8c8b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java @@ -110,23 +110,20 @@ public void close(Reporter reporter) throws IOException { @Override public void close(boolean b) throws IOException { - // if we haven't written any rows, we need to create a file with a - // generic schema. if (writer == null) { - // a row with no columns - ObjectInspector inspector = ObjectInspectorFactory. - getStandardStructObjectInspector(new ArrayList(), - new ArrayList()); - options.inspector(inspector); - writer = OrcFile.createWriter(path, options); + // we are closing a file without writing any data in it + FileSystem fs = options.getFileSystem() == null ? + path.getFileSystem(options.getConfiguration()) : options.getFileSystem(); + fs.createNewFile(path); + return; } writer.close(); } @Override public SerDeStats getStats() { - stats.setRawDataSize(writer.getRawDataSize()); - stats.setRowCount(writer.getNumberOfRows()); + stats.setRawDataSize(null == writer ? 0 : writer.getRawDataSize()); + stats.setRowCount(null == writer ? 0 : writer.getNumberOfRows()); return stats; } } diff --git a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out index be2b61e..d03bfe4 100644 --- a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out +++ b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out @@ -1104,7 +1104,7 @@ Partition Parameters: numFiles 8 numRows 6 rawDataSize 120 - totalSize 2400 + totalSize 2004 #### A masked pattern was here #### # Storage Information @@ -1186,7 +1186,7 @@ Partition Parameters: numFiles 8 numRows 6 rawDataSize 120 - totalSize 2400 + totalSize 2004 #### A masked pattern was here #### # Storage Information diff --git a/ql/src/test/results/clientpositive/union_fast_stats.q.out b/ql/src/test/results/clientpositive/union_fast_stats.q.out index a02ff04..e908ec0 100644 --- a/ql/src/test/results/clientpositive/union_fast_stats.q.out +++ b/ql/src/test/results/clientpositive/union_fast_stats.q.out @@ -117,10 +117,10 @@ Retention: 0 Table Type: MANAGED_TABLE Table Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 4 + numFiles 3 numRows 15 rawDataSize 3483 - totalSize 4211 + totalSize 4003 #### A masked pattern was here #### # Storage Information @@ -170,10 +170,10 @@ Retention: 0 Table Type: MANAGED_TABLE Table Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 4 + numFiles 3 numRows 15 - rawDataSize 3651 - totalSize 4211 + rawDataSize 3483 + totalSize 4003 #### A masked pattern was here #### # Storage Information @@ -235,10 +235,10 @@ Retention: 0 Table Type: MANAGED_TABLE Table Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 5 + numFiles 4 numRows 20 - rawDataSize 4720 - totalSize 5568 + rawDataSize 4552 + totalSize 5360 #### A masked pattern was here #### # Storage Information diff --git a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java index 31060a2..2e7c7a9 100644 --- a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java +++ b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java @@ -164,7 +164,7 @@ public RecordReader getRecordReader(InputSplit split, Iterator it = result.iterator(); while (it.hasNext()) { FileStatus stat = it.next(); - if (!stat.isFile()) { + if (!stat.isFile() || stat.getLen() == 0) { it.remove(); } } -- 1.7.12.4 (Apple Git-37)