From 75a63523ed05a18996e6460b0cc0f09b5de04117 Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Wed, 10 Feb 2016 15:51:04 -0800 Subject: [PATCH] HIVE-13040 : Handle empty bucket creations more efficiently --- .../apache/hadoop/hive/ql/exec/StatsNoJobTask.java | 31 ++++++++------ .../org/apache/hadoop/hive/ql/exec/Utilities.java | 4 +- .../org/apache/hadoop/hive/ql/io/AcidUtils.java | 5 ++- .../hadoop/hive/ql/io/orc/OrcInputFormat.java | 12 +++--- .../hadoop/hive/ql/io/orc/OrcOutputFormat.java | 17 ++++---- .../dynpart_sort_opt_vectorization.q.out | 4 +- .../tez/dynpart_sort_opt_vectorization.q.out | 8 ++-- .../tez/dynpart_sort_optimization.q.out | 4 +- .../clientpositive/tez/union_fast_stats.q.out | 14 +++---- .../clientpositive/tez/vector_outer_join1.q.out | 48 +++++++++++----------- .../clientpositive/tez/vector_outer_join4.q.out | 48 +++++++++++----------- .../results/clientpositive/union_fast_stats.q.out | 16 ++++---- .../apache/hadoop/hive/shims/Hadoop23Shims.java | 2 +- 13 files changed, 109 insertions(+), 104 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java index 14eacdf..6dca180 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java @@ -125,7 +125,7 @@ public String getName() { class StatsCollection implements Runnable { - private Partition partn; + private final Partition partn; public StatsCollection(Partition part) { this.partn = part; @@ -150,7 +150,7 @@ public void run() { boolean statsAvailable = false; for(FileStatus file: fileList) { if (!file.isDir()) { - InputFormat inputFormat = (InputFormat) ReflectionUtil.newInstance( + InputFormat inputFormat = ReflectionUtil.newInstance( partn.getInputFormatClass(), jc); InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { partn.getLocation() }); @@ -195,7 +195,7 @@ public void run() { "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e)); // Before updating the partition params, if any partition params is null - // and if statsReliable is true then updatePartition() function will fail + // and if statsReliable is true then updatePartition() function will fail // the task by returning 1 if (work.isStatsReliable()) { partUpdates.put(tPart.getSd().getLocation(), null); @@ -246,22 +246,27 @@ private int aggregateStats(ExecutorService threadPool) { boolean statsAvailable = false; for(FileStatus file: fileList) { if (!file.isDir()) { - InputFormat inputFormat = (InputFormat) ReflectionUtil.newInstance( + InputFormat inputFormat = ReflectionUtil.newInstance( table.getInputFormatClass(), jc); InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { table .getDataLocation().toString() }); - org.apache.hadoop.mapred.RecordReader recordReader = - inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL); - StatsProvidingRecordReader statsRR; - if (recordReader instanceof StatsProvidingRecordReader) { - statsRR = (StatsProvidingRecordReader) recordReader; - numRows += statsRR.getStats().getRowCount(); - rawDataSize += statsRR.getStats().getRawDataSize(); - fileSize += file.getLen(); + if (file.getLen() == 0) { numFiles += 1; statsAvailable = true; + } else { + org.apache.hadoop.mapred.RecordReader recordReader = + inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL); + StatsProvidingRecordReader statsRR; + if (recordReader instanceof StatsProvidingRecordReader) { + statsRR = (StatsProvidingRecordReader) recordReader; + numRows += statsRR.getStats().getRowCount(); + rawDataSize += statsRR.getStats().getRawDataSize(); + fileSize += file.getLen(); + numFiles += 1; + statsAvailable = true; + } + recordReader.close(); } - recordReader.close(); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 7a62ff9..ab0635e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -1480,7 +1480,7 @@ public static void removeTempOrDuplicateFiles(FileSystem fs, Path path) throws I taskIDToFile = removeTempOrDuplicateFiles(items, fs); // if the table is bucketed and enforce bucketing, we should check and generate all buckets - if (dpCtx.getNumBuckets() > 0 && taskIDToFile != null) { + if (dpCtx.getNumBuckets() > 0 && taskIDToFile != null && !"tez".equalsIgnoreCase(hconf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname))) { // refresh the file list items = fs.listStatus(parts[i].getPath()); // get the missing buckets and generate empty buckets @@ -1500,7 +1500,7 @@ public static void removeTempOrDuplicateFiles(FileSystem fs, Path path) throws I FileStatus[] items = fs.listStatus(path); taskIDToFile = removeTempOrDuplicateFiles(items, fs); if(taskIDToFile != null && taskIDToFile.size() > 0 && conf != null && conf.getTable() != null - && (conf.getTable().getNumBuckets() > taskIDToFile.size())) { + && (conf.getTable().getNumBuckets() > taskIDToFile.size()) && !"tez".equalsIgnoreCase(hconf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname))) { // get the missing buckets and generate empty buckets for non-dynamic partition String taskID1 = taskIDToFile.keySet().iterator().next(); Path bucketPath = taskIDToFile.values().iterator().next().getPath(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java index 520ae74..b3299db 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java @@ -106,6 +106,7 @@ private AcidUtils() { Pattern.compile("[0-9]+_[0-9]+"); public static final PathFilter hiddenFileFilter = new PathFilter(){ + @Override public boolean accept(Path p){ String name = p.getName(); return !name.startsWith("_") && !name.startsWith("."); @@ -605,7 +606,7 @@ private static void getChildState(FileStatus child, HdfsFileStatusWithId childWi // it is possible that the cleaner is running and removing these original files, // in which case recursing through them could cause us to get an error. originalDirectories.add(child); - } else { + } else if (child.getLen() != 0){ original.add(createOriginalObj(childWithId, child)); } } @@ -616,7 +617,7 @@ public static HdfsFileStatusWithId createOriginalObj( } private static class HdfsFileStatusWithoutId implements HdfsFileStatusWithId { - private FileStatus fs; + private final FileStatus fs; public HdfsFileStatusWithoutId(FileStatus fs) { this.fs = fs; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index f36f707..74b3826 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -859,11 +859,13 @@ public BISplitStrategy(Context context, FileSystem fs, List splits = Lists.newArrayList(); for (HdfsFileStatusWithId file : fileStatuses) { FileStatus fileStatus = file.getFileStatus(); - String[] hosts = SHIMS.getLocationsWithOffset(fs, fileStatus).firstEntry().getValue() - .getHosts(); - OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), file.getFileId(), 0, - fileStatus.getLen(), hosts, null, isOriginal, true, deltas, -1); - splits.add(orcSplit); + if (fileStatus.getLen() != 0) { + String[] hosts = SHIMS.getLocationsWithOffset(fs, fileStatus).firstEntry().getValue() + .getHosts(); + OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), file.getFileId(), 0, + fileStatus.getLen(), hosts, null, isOriginal, true, deltas, -1); + splits.add(orcSplit); + } } // add uncovered ACID delta splits diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java index 3fb6a86..b0f8c8b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java @@ -110,23 +110,20 @@ public void close(Reporter reporter) throws IOException { @Override public void close(boolean b) throws IOException { - // if we haven't written any rows, we need to create a file with a - // generic schema. if (writer == null) { - // a row with no columns - ObjectInspector inspector = ObjectInspectorFactory. - getStandardStructObjectInspector(new ArrayList(), - new ArrayList()); - options.inspector(inspector); - writer = OrcFile.createWriter(path, options); + // we are closing a file without writing any data in it + FileSystem fs = options.getFileSystem() == null ? + path.getFileSystem(options.getConfiguration()) : options.getFileSystem(); + fs.createNewFile(path); + return; } writer.close(); } @Override public SerDeStats getStats() { - stats.setRawDataSize(writer.getRawDataSize()); - stats.setRowCount(writer.getNumberOfRows()); + stats.setRawDataSize(null == writer ? 0 : writer.getRawDataSize()); + stats.setRowCount(null == writer ? 0 : writer.getNumberOfRows()); return stats; } } diff --git a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out index be2b61e..d03bfe4 100644 --- a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out +++ b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out @@ -1104,7 +1104,7 @@ Partition Parameters: numFiles 8 numRows 6 rawDataSize 120 - totalSize 2400 + totalSize 2004 #### A masked pattern was here #### # Storage Information @@ -1186,7 +1186,7 @@ Partition Parameters: numFiles 8 numRows 6 rawDataSize 120 - totalSize 2400 + totalSize 2004 #### A masked pattern was here #### # Storage Information diff --git a/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out index 79558d5..a90e3f6 100644 --- a/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out +++ b/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out @@ -1161,10 +1161,10 @@ Table: over1k_part_buck_orc #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 8 + numFiles 4 numRows 6 rawDataSize 120 - totalSize 2400 + totalSize 2004 #### A masked pattern was here #### # Storage Information @@ -1243,10 +1243,10 @@ Table: over1k_part_buck_sort_orc #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 8 + numFiles 4 numRows 6 rawDataSize 120 - totalSize 2400 + totalSize 2004 #### A masked pattern was here #### # Storage Information diff --git a/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out b/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out index fbeea6b..5292106 100644 --- a/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out +++ b/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out @@ -1074,7 +1074,7 @@ Table: over1k_part_buck #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 8 + numFiles 4 numRows 6 rawDataSize 156 totalSize 162 @@ -1156,7 +1156,7 @@ Table: over1k_part_buck_sort #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 8 + numFiles 4 numRows 6 rawDataSize 156 totalSize 162 diff --git a/ql/src/test/results/clientpositive/tez/union_fast_stats.q.out b/ql/src/test/results/clientpositive/tez/union_fast_stats.q.out index 41c0d71..c703e84 100644 --- a/ql/src/test/results/clientpositive/tez/union_fast_stats.q.out +++ b/ql/src/test/results/clientpositive/tez/union_fast_stats.q.out @@ -120,7 +120,7 @@ Table Parameters: numFiles 4 numRows 0 rawDataSize 0 - totalSize 4211 + totalSize 4003 #### A masked pattern was here #### # Storage Information @@ -171,9 +171,9 @@ Table Type: MANAGED_TABLE Table Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} numFiles 4 - numRows 15 - rawDataSize 3651 - totalSize 4211 + numRows 0 + rawDataSize 0 + totalSize 4003 #### A masked pattern was here #### # Storage Information @@ -236,9 +236,9 @@ Table Type: MANAGED_TABLE Table Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} numFiles 5 - numRows 20 - rawDataSize 4720 - totalSize 5568 + numRows 5 + rawDataSize 1069 + totalSize 5360 #### A masked pattern was here #### # Storage Information diff --git a/ql/src/test/results/clientpositive/tez/vector_outer_join1.q.out b/ql/src/test/results/clientpositive/tez/vector_outer_join1.q.out index d962621..4e2e62c 100644 --- a/ql/src/test/results/clientpositive/tez/vector_outer_join1.q.out +++ b/ql/src/test/results/clientpositive/tez/vector_outer_join1.q.out @@ -184,11 +184,11 @@ STAGE PLANS: Map Operator Tree: TableScan alias: c - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Left Outer Join0 to 1 @@ -198,11 +198,11 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23 input vertices: 1 Map 2 - Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE HybridGraceHashJoin: true File Output Operator compressed: false - Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -212,16 +212,16 @@ STAGE PLANS: Map Operator Tree: TableScan alias: c - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col2 (type: int) sort order: + Map-reduce partition columns: _col2 (type: int) - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: tinyint), _col1 (type: smallint), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean) Execution mode: vectorized @@ -296,11 +296,11 @@ STAGE PLANS: Map Operator Tree: TableScan alias: c - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctinyint (type: tinyint) outputColumnNames: _col0 - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Left Outer Join0 to 1 @@ -310,11 +310,11 @@ STAGE PLANS: outputColumnNames: _col0 input vertices: 1 Map 2 - Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE HybridGraceHashJoin: true File Output Operator compressed: false - Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -324,16 +324,16 @@ STAGE PLANS: Map Operator Tree: TableScan alias: c - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctinyint (type: tinyint) outputColumnNames: _col0 - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: tinyint) sort order: + Map-reduce partition columns: _col0 (type: tinyint) - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Stage: Stage-0 @@ -500,11 +500,11 @@ STAGE PLANS: Map Operator Tree: TableScan alias: c - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctinyint (type: tinyint), cint (type: int) outputColumnNames: _col0, _col1 - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Left Outer Join0 to 1 @@ -514,7 +514,7 @@ STAGE PLANS: outputColumnNames: _col0 input vertices: 1 Map 3 - Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE HybridGraceHashJoin: true Map Join Operator condition map: @@ -525,7 +525,7 @@ STAGE PLANS: outputColumnNames: _col0 input vertices: 1 Map 4 - Statistics: Num rows: 17 Data size: 4417 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 17 Data size: 4214 Basic stats: COMPLETE Column stats: NONE HybridGraceHashJoin: true Group By Operator aggregations: count(), sum(_col0) @@ -541,31 +541,31 @@ STAGE PLANS: Map Operator Tree: TableScan alias: c - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cint (type: int) outputColumnNames: _col0 - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: int) sort order: + Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Map 4 Map Operator Tree: TableScan alias: c - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctinyint (type: tinyint) outputColumnNames: _col0 - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: tinyint) sort order: + Map-reduce partition columns: _col0 (type: tinyint) - Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Reducer 2 Execution mode: vectorized diff --git a/ql/src/test/results/clientpositive/tez/vector_outer_join4.q.out b/ql/src/test/results/clientpositive/tez/vector_outer_join4.q.out index 9db8e00..a6690b6 100644 --- a/ql/src/test/results/clientpositive/tez/vector_outer_join4.q.out +++ b/ql/src/test/results/clientpositive/tez/vector_outer_join4.q.out @@ -214,11 +214,11 @@ STAGE PLANS: Map Operator Tree: TableScan alias: c - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Left Outer Join0 to 1 @@ -228,11 +228,11 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23 input vertices: 1 Map 2 - Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE HybridGraceHashJoin: true File Output Operator compressed: false - Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -242,16 +242,16 @@ STAGE PLANS: Map Operator Tree: TableScan alias: c - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col2 (type: int) sort order: + Map-reduce partition columns: _col2 (type: int) - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: tinyint), _col1 (type: smallint), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean) Execution mode: vectorized @@ -361,11 +361,11 @@ STAGE PLANS: Map Operator Tree: TableScan alias: c - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctinyint (type: tinyint) outputColumnNames: _col0 - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Left Outer Join0 to 1 @@ -375,11 +375,11 @@ STAGE PLANS: outputColumnNames: _col0 input vertices: 1 Map 2 - Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE HybridGraceHashJoin: true File Output Operator compressed: false - Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -389,16 +389,16 @@ STAGE PLANS: Map Operator Tree: TableScan alias: c - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctinyint (type: tinyint) outputColumnNames: _col0 - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: tinyint) sort order: + Map-reduce partition columns: _col0 (type: tinyint) - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Stage: Stage-0 @@ -870,11 +870,11 @@ STAGE PLANS: Map Operator Tree: TableScan alias: c - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctinyint (type: tinyint), cint (type: int) outputColumnNames: _col0, _col1 - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Left Outer Join0 to 1 @@ -884,7 +884,7 @@ STAGE PLANS: outputColumnNames: _col0 input vertices: 1 Map 3 - Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE HybridGraceHashJoin: true Map Join Operator condition map: @@ -894,7 +894,7 @@ STAGE PLANS: 1 _col0 (type: tinyint) input vertices: 1 Map 4 - Statistics: Num rows: 36 Data size: 8476 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 36 Data size: 8273 Basic stats: COMPLETE Column stats: NONE HybridGraceHashJoin: true Group By Operator aggregations: count() @@ -910,31 +910,31 @@ STAGE PLANS: Map Operator Tree: TableScan alias: c - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cint (type: int) outputColumnNames: _col0 - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: int) sort order: + Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Map 4 Map Operator Tree: TableScan alias: c - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctinyint (type: tinyint) outputColumnNames: _col0 - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: tinyint) sort order: + Map-reduce partition columns: _col0 (type: tinyint) - Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Reducer 2 Execution mode: vectorized diff --git a/ql/src/test/results/clientpositive/union_fast_stats.q.out b/ql/src/test/results/clientpositive/union_fast_stats.q.out index a02ff04..e908ec0 100644 --- a/ql/src/test/results/clientpositive/union_fast_stats.q.out +++ b/ql/src/test/results/clientpositive/union_fast_stats.q.out @@ -117,10 +117,10 @@ Retention: 0 Table Type: MANAGED_TABLE Table Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 4 + numFiles 3 numRows 15 rawDataSize 3483 - totalSize 4211 + totalSize 4003 #### A masked pattern was here #### # Storage Information @@ -170,10 +170,10 @@ Retention: 0 Table Type: MANAGED_TABLE Table Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 4 + numFiles 3 numRows 15 - rawDataSize 3651 - totalSize 4211 + rawDataSize 3483 + totalSize 4003 #### A masked pattern was here #### # Storage Information @@ -235,10 +235,10 @@ Retention: 0 Table Type: MANAGED_TABLE Table Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 5 + numFiles 4 numRows 20 - rawDataSize 4720 - totalSize 5568 + rawDataSize 4552 + totalSize 5360 #### A masked pattern was here #### # Storage Information diff --git a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java index 31060a2..2e7c7a9 100644 --- a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java +++ b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java @@ -164,7 +164,7 @@ public RecordReader getRecordReader(InputSplit split, Iterator it = result.iterator(); while (it.hasNext()) { FileStatus stat = it.next(); - if (!stat.isFile()) { + if (!stat.isFile() || stat.getLen() == 0) { it.remove(); } } -- 1.7.12.4 (Apple Git-37)