From 0a65aecc15445f7403ac41e1c6de4210d93e0cd5 Mon Sep 17 00:00:00 2001
From: Ashutosh Chauhan
Date: Wed, 10 Feb 2016 15:51:04 -0800
Subject: [PATCH] HIVE-13040 : Handle empty bucket creations more efficiently

Empty buckets are now written as zero-length files instead of ORC files
containing only an empty generic schema. Zero-length files are then
skipped when generating splits (OrcInputFormat, Hadoop23Shims), when
gathering file statistics (StatsNoJobTask), and when listing original
files for ACID reads (AcidUtils). On Tez, empty buckets are no longer
created at all in removeTempOrDuplicateFiles (Utilities). Golden files
are updated for the new file counts and sizes.

---
 .../apache/hadoop/hive/ql/exec/StatsNoJobTask.java | 31 ++++++++------
 .../org/apache/hadoop/hive/ql/exec/Utilities.java  |  4 +-
 .../org/apache/hadoop/hive/ql/io/AcidUtils.java    |  5 ++-
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java      | 12 +++---
 .../hadoop/hive/ql/io/orc/OrcOutputFormat.java     | 17 ++++----
 .../apache/hadoop/hive/ql/io/TestAcidUtils.java    | 42 +++++++++----------
 .../hive/ql/io/orc/TestInputOutputFormat.java      | 31 +++++++-------
 .../dynpart_sort_opt_vectorization.q.out           |  4 +-
 .../tez/dynpart_sort_opt_vectorization.q.out       |  8 ++--
 .../tez/dynpart_sort_optimization.q.out            |  4 +-
 .../clientpositive/tez/union_fast_stats.q.out      | 14 +++---
 .../clientpositive/tez/vector_outer_join1.q.out    | 48 +++++++++++-----------
 .../clientpositive/tez/vector_outer_join4.q.out    | 48 +++++++++++-----------
 .../results/clientpositive/union_fast_stats.q.out  | 16 ++++----
 .../apache/hadoop/hive/shims/Hadoop23Shims.java    |  2 +-
 15 files changed, 146 insertions(+), 140 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java
index 14eacdf..6dca180 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java
@@ -125,7 +125,7 @@ public String getName() {
 
   class StatsCollection implements Runnable {
 
-    private Partition partn;
+    private final Partition partn;
 
     public StatsCollection(Partition part) {
       this.partn = part;
@@ -150,7 +150,7 @@ public void run() {
         boolean statsAvailable = false;
         for(FileStatus file: fileList) {
           if (!file.isDir()) {
-            InputFormat inputFormat = (InputFormat) ReflectionUtil.newInstance(
+            InputFormat inputFormat = ReflectionUtil.newInstance(
                 partn.getInputFormatClass(), jc);
             InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0,
                 new String[] { partn.getLocation() });
@@ -195,7 +195,7 @@ public void run() {
             "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
         // Before updating the partition params, if any partition params is null
-        // and if statsReliable is true then updatePartition() function will fail 
+        // and if statsReliable is true then updatePartition() function will fail
         // the task by returning 1
         if (work.isStatsReliable()) {
           partUpdates.put(tPart.getSd().getLocation(), null);
         }
@@ -246,22 +246,27 @@ private int aggregateStats(ExecutorService threadPool) {
       boolean statsAvailable = false;
       for(FileStatus file: fileList) {
         if (!file.isDir()) {
-          InputFormat inputFormat = (InputFormat) ReflectionUtil.newInstance(
+          InputFormat inputFormat = ReflectionUtil.newInstance(
               table.getInputFormatClass(), jc);
           InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { table
               .getDataLocation().toString() });
-          org.apache.hadoop.mapred.RecordReader recordReader =
-              inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
-          StatsProvidingRecordReader statsRR;
-          if (recordReader instanceof StatsProvidingRecordReader) {
-            statsRR = (StatsProvidingRecordReader) recordReader;
-            numRows += statsRR.getStats().getRowCount();
-            rawDataSize += statsRR.getStats().getRawDataSize();
-            fileSize += file.getLen();
+          if (file.getLen() == 0) {
             numFiles += 1;
             statsAvailable = true;
+          } else {
+            org.apache.hadoop.mapred.RecordReader recordReader =
+                inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
+            StatsProvidingRecordReader statsRR;
+            if (recordReader instanceof StatsProvidingRecordReader) {
+              statsRR = (StatsProvidingRecordReader) recordReader;
+              numRows += statsRR.getStats().getRowCount();
+              rawDataSize += statsRR.getStats().getRawDataSize();
+              fileSize += file.getLen();
+              numFiles += 1;
+              statsAvailable = true;
+            }
+            recordReader.close();
           }
-          recordReader.close();
         }
       }
 
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
index 7a62ff9..ab0635e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
@@ -1480,7 +1480,7 @@ public static void removeTempOrDuplicateFiles(FileSystem fs, Path path) throws I
         taskIDToFile = removeTempOrDuplicateFiles(items, fs);
         // if the table is bucketed and enforce bucketing, we should check and generate all buckets
-        if (dpCtx.getNumBuckets() > 0 && taskIDToFile != null) {
+        if (dpCtx.getNumBuckets() > 0 && taskIDToFile != null && !"tez".equalsIgnoreCase(hconf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname))) {
           // refresh the file list
           items = fs.listStatus(parts[i].getPath());
           // get the missing buckets and generate empty buckets
@@ -1500,7 +1500,7 @@ public static void removeTempOrDuplicateFiles(FileSystem fs, Path path) throws I
       FileStatus[] items = fs.listStatus(path);
       taskIDToFile = removeTempOrDuplicateFiles(items, fs);
       if(taskIDToFile != null && taskIDToFile.size() > 0 && conf != null && conf.getTable() != null
-          && (conf.getTable().getNumBuckets() > taskIDToFile.size())) {
+          && (conf.getTable().getNumBuckets() > taskIDToFile.size()) && !"tez".equalsIgnoreCase(hconf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname))) {
         // get the missing buckets and generate empty buckets for non-dynamic partition
         String taskID1 = taskIDToFile.keySet().iterator().next();
         Path bucketPath = taskIDToFile.values().iterator().next().getPath();
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
index 520ae74..b3299db 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
@@ -106,6 +106,7 @@ private AcidUtils() {
       Pattern.compile("[0-9]+_[0-9]+");
 
   public static final PathFilter hiddenFileFilter = new PathFilter(){
+    @Override
     public boolean accept(Path p){
       String name = p.getName();
       return !name.startsWith("_") && !name.startsWith(".");
@@ -605,7 +606,7 @@ private static void getChildState(FileStatus child, HdfsFileStatusWithId childWi
       // it is possible that the cleaner is running and removing these original files,
       // in which case recursing through them could cause us to get an error.
       originalDirectories.add(child);
-    } else {
+    } else if (child.getLen() != 0){
       original.add(createOriginalObj(childWithId, child));
     }
   }
@@ -616,7 +617,7 @@ public static HdfsFileStatusWithId createOriginalObj(
   }
 
   private static class HdfsFileStatusWithoutId implements HdfsFileStatusWithId {
-    private FileStatus fs;
+    private final FileStatus fs;
 
     public HdfsFileStatusWithoutId(FileStatus fs) {
       this.fs = fs;
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index f36f707..74b3826 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -859,11 +859,13 @@ public BISplitStrategy(Context context, FileSystem fs,
       List splits = Lists.newArrayList();
       for (HdfsFileStatusWithId file : fileStatuses) {
         FileStatus fileStatus = file.getFileStatus();
-        String[] hosts = SHIMS.getLocationsWithOffset(fs, fileStatus).firstEntry().getValue()
-            .getHosts();
-        OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), file.getFileId(), 0,
-            fileStatus.getLen(), hosts, null, isOriginal, true, deltas, -1);
-        splits.add(orcSplit);
+        if (fileStatus.getLen() != 0) {
+          String[] hosts = SHIMS.getLocationsWithOffset(fs, fileStatus).firstEntry().getValue()
+              .getHosts();
+          OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), file.getFileId(), 0,
+              fileStatus.getLen(), hosts, null, isOriginal, true, deltas, -1);
+          splits.add(orcSplit);
+        }
       }
 
       // add uncovered ACID delta splits
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java
index 3fb6a86..b0f8c8b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java
@@ -110,23 +110,20 @@ public void close(Reporter reporter) throws IOException {
 
     @Override
     public void close(boolean b) throws IOException {
-      // if we haven't written any rows, we need to create a file with a
-      // generic schema.
       if (writer == null) {
-        // a row with no columns
-        ObjectInspector inspector = ObjectInspectorFactory.
-            getStandardStructObjectInspector(new ArrayList(),
-                new ArrayList());
-        options.inspector(inspector);
-        writer = OrcFile.createWriter(path, options);
+        // we are closing a file without writing any data in it
+        FileSystem fs = options.getFileSystem() == null ?
+            path.getFileSystem(options.getConfiguration()) : options.getFileSystem();
+        fs.createNewFile(path);
+        return;
       }
       writer.close();
     }
 
     @Override
     public SerDeStats getStats() {
-      stats.setRawDataSize(writer.getRawDataSize());
-      stats.setRowCount(writer.getNumberOfRows());
+      stats.setRawDataSize(null == writer ? 0 : writer.getRawDataSize());
+      stats.setRowCount(null == writer ? 0 : writer.getNumberOfRows());
       return stats;
     }
   }
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java
index f87dd14..b5abc40 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java
@@ -91,12 +91,12 @@ public void testParsing() throws Exception {
   public void testOriginal() throws Exception {
     Configuration conf = new Configuration();
     MockFileSystem fs = new MockFileSystem(conf,
-        new MockFile("mock:/tbl/part1/000000_0", 500, new byte[0]),
-        new MockFile("mock:/tbl/part1/000001_1", 500, new byte[0]),
-        new MockFile("mock:/tbl/part1/000002_0", 500, new byte[0]),
-        new MockFile("mock:/tbl/part1/random", 500, new byte[0]),
-        new MockFile("mock:/tbl/part1/_done", 0, new byte[0]),
-        new MockFile("mock:/tbl/part1/subdir/000000_0", 0, new byte[0]));
+        new MockFile("mock:/tbl/part1/000000_0", 500, new byte[1]),
+        new MockFile("mock:/tbl/part1/000001_1", 500, new byte[1]),
+        new MockFile("mock:/tbl/part1/000002_0", 500, new byte[1]),
+        new MockFile("mock:/tbl/part1/random", 500, new byte[1]),
+        new MockFile("mock:/tbl/part1/_done", 0, new byte[1]),
+        new MockFile("mock:/tbl/part1/subdir/000000_0", 0, new byte[1]));
     AcidUtils.Directory dir =
         AcidUtils.getAcidState(new MockPath(fs, "/tbl/part1"), conf,
             new ValidReadTxnList("100:"));
@@ -117,17 +117,17 @@ public void testOriginal() throws Exception {
   public void testOriginalDeltas() throws Exception {
     Configuration conf = new Configuration();
     MockFileSystem fs = new MockFileSystem(conf,
-        new MockFile("mock:/tbl/part1/000000_0", 500, new byte[0]),
-        new MockFile("mock:/tbl/part1/000001_1", 500, new byte[0]),
-        new MockFile("mock:/tbl/part1/000002_0", 500, new byte[0]),
-        new MockFile("mock:/tbl/part1/random", 500, new byte[0]),
-        new MockFile("mock:/tbl/part1/_done", 0, new byte[0]),
-        new MockFile("mock:/tbl/part1/subdir/000000_0", 0, new byte[0]),
-        new MockFile("mock:/tbl/part1/delta_025_025/bucket_0", 0, new byte[0]),
-        new MockFile("mock:/tbl/part1/delta_029_029/bucket_0", 0, new byte[0]),
-        new MockFile("mock:/tbl/part1/delta_025_030/bucket_0", 0, new byte[0]),
-        new MockFile("mock:/tbl/part1/delta_050_100/bucket_0", 0, new byte[0]),
-        new MockFile("mock:/tbl/part1/delta_101_101/bucket_0", 0, new byte[0]));
+        new MockFile("mock:/tbl/part1/000000_0", 500, new byte[1]),
+        new MockFile("mock:/tbl/part1/000001_1", 500, new byte[1]),
+        new MockFile("mock:/tbl/part1/000002_0", 500, new byte[1]),
+        new MockFile("mock:/tbl/part1/random", 500, new byte[1]),
+        new MockFile("mock:/tbl/part1/_done", 0, new byte[1]),
+        new MockFile("mock:/tbl/part1/subdir/000000_0", 0, new byte[1]),
+        new MockFile("mock:/tbl/part1/delta_025_025/bucket_0", 0, new byte[1]),
+        new MockFile("mock:/tbl/part1/delta_029_029/bucket_0", 0, new byte[1]),
+        new MockFile("mock:/tbl/part1/delta_025_030/bucket_0", 0, new byte[1]),
+        new MockFile("mock:/tbl/part1/delta_050_100/bucket_0", 0, new byte[1]),
+        new MockFile("mock:/tbl/part1/delta_101_101/bucket_0", 0, new byte[1]));
     AcidUtils.Directory dir =
         AcidUtils.getAcidState(new TestInputOutputFormat.MockPath(fs,
             "mock:/tbl/part1"), conf, new ValidReadTxnList("100:"));
@@ -220,10 +220,10 @@ public void testBestBase() throws Exception {
   public void testObsoleteOriginals() throws Exception {
     Configuration conf = new Configuration();
    MockFileSystem fs = new MockFileSystem(conf,
-        new MockFile("mock:/tbl/part1/base_10/bucket_0", 500, new byte[0]),
-        new MockFile("mock:/tbl/part1/base_5/bucket_0", 500, new byte[0]),
-        new MockFile("mock:/tbl/part1/000000_0", 500, new byte[0]),
-        new MockFile("mock:/tbl/part1/000001_1", 500, new byte[0]));
+        new MockFile("mock:/tbl/part1/base_10/bucket_0", 500, new byte[1]),
+        new MockFile("mock:/tbl/part1/base_5/bucket_0", 500, new byte[1]),
+        new MockFile("mock:/tbl/part1/000000_0", 500, new byte[1]),
+        new MockFile("mock:/tbl/part1/000001_1", 500, new byte[1]));
     Path part = new MockPath(fs, "/tbl/part1");
     AcidUtils.Directory dir =
         AcidUtils.getAcidState(part, conf, new ValidReadTxnList("150:"));
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 9b1d7ae..da1e299 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -529,11 +529,11 @@ public void testSplitStrategySelection() throws Exception {
   public void testFileGenerator() throws Exception {
     OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
     MockFileSystem fs = new MockFileSystem(conf,
-        new MockFile("mock:/a/b/part-00", 1000, new byte[0]),
-        new MockFile("mock:/a/b/part-01", 1000, new byte[0]),
-        new MockFile("mock:/a/b/_part-02", 1000, new byte[0]),
-        new MockFile("mock:/a/b/.part-03", 1000, new byte[0]),
-        new MockFile("mock:/a/b/part-04", 1000, new byte[0]));
+        new MockFile("mock:/a/b/part-00", 1000, new byte[1]),
+        new MockFile("mock:/a/b/part-01", 1000, new byte[1]),
+        new MockFile("mock:/a/b/_part-02", 1000, new byte[1]),
+        new MockFile("mock:/a/b/.part-03", 1000, new byte[1]),
+        new MockFile("mock:/a/b/part-04", 1000, new byte[1]));
     OrcInputFormat.FileGenerator gen =
       new OrcInputFormat.FileGenerator(context, fs,
           new MockPath(fs, "mock:/a/b"), false);
@@ -560,14 +560,14 @@ public void testEtlCombinedStrategy() throws Exception {
     conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_DIRECTORY_BATCH_MS.varname, "1000000");
     OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
     MockFileSystem fs = new MockFileSystem(conf,
-        new MockFile("mock:/a/1/part-00", 1000, new byte[0]),
-        new MockFile("mock:/a/1/part-01", 1000, new byte[0]),
-        new MockFile("mock:/a/2/part-00", 1000, new byte[0]),
-        new MockFile("mock:/a/2/part-01", 1000, new byte[0]),
-        new MockFile("mock:/a/3/base_0/1", 1000, new byte[0]),
-        new MockFile("mock:/a/4/base_0/1", 1000, new byte[0]),
-        new MockFile("mock:/a/5/base_0/1", 1000, new byte[0]),
-        new MockFile("mock:/a/5/delta_0_25/1", 1000, new byte[0])
+        new MockFile("mock:/a/1/part-00", 1000, new byte[1]),
+        new MockFile("mock:/a/1/part-01", 1000, new byte[1]),
+        new MockFile("mock:/a/2/part-00", 1000, new byte[1]),
+        new MockFile("mock:/a/2/part-01", 1000, new byte[1]),
+        new MockFile("mock:/a/3/base_0/1", 1000, new byte[1]),
+        new MockFile("mock:/a/4/base_0/1", 1000, new byte[1]),
+        new MockFile("mock:/a/5/base_0/1", 1000, new byte[1]),
+        new MockFile("mock:/a/5/delta_0_25/1", 1000, new byte[1])
     );
 
     OrcInputFormat.CombinedCtx combineCtx = new OrcInputFormat.CombinedCtx();
@@ -575,7 +575,7 @@ public void testEtlCombinedStrategy() throws Exception {
     SplitStrategy ss = createOrCombineStrategy(context, fs, "mock:/a/1", combineCtx);
     assertNull(ss);
     assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
-    OrcInputFormat.ETLSplitStrategy etlSs = (OrcInputFormat.ETLSplitStrategy)combineCtx.combined;
+    OrcInputFormat.ETLSplitStrategy etlSs = combineCtx.combined;
    assertEquals(2, etlSs.files.size());
     assertTrue(etlSs.isOriginal);
     assertEquals(1, etlSs.dirs.size());
@@ -591,7 +591,7 @@ public void testEtlCombinedStrategy() throws Exception {
     assertEquals(4, etlSs.files.size());
     assertEquals(2, etlSs.dirs.size());
     assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
-    etlSs = (OrcInputFormat.ETLSplitStrategy)combineCtx.combined;
+    etlSs = combineCtx.combined;
     assertEquals(1, etlSs.files.size());
     assertFalse(etlSs.isOriginal);
     assertEquals(1, etlSs.dirs.size());
@@ -1426,6 +1426,7 @@ public void testSplitGenFailure() throws IOException {
     org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer =
         outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
             properties, Reporter.NULL);
+    writer.write(new OrcSerde().serialize(null,null));
     writer.close(true);
     InputFormat in = new OrcInputFormat();
     fs.setPermission(testFilePath, FsPermission.createImmutable((short) 0333));
diff --git a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out
index be2b61e..d03bfe4 100644
--- a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out
+++ b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out
@@ -1104,7 +1104,7 @@ Partition Parameters:
 	numFiles            	8
 	numRows             	6
 	rawDataSize         	120
-	totalSize           	2400
+	totalSize           	2004
 #### A masked pattern was here ####
 
 # Storage Information
@@ -1186,7 +1186,7 @@ Partition Parameters:
 	numFiles            	8
 	numRows             	6
 	rawDataSize         	120
-	totalSize           	2400
+	totalSize           	2004
 #### A masked pattern was here ####
 
 # Storage Information
diff --git a/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out
index 79558d5..a90e3f6 100644
--- a/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out
+++ b/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out
@@ -1161,10 +1161,10 @@ Table:              	over1k_part_buck_orc
 #### A masked pattern was here ####
 Partition Parameters:
 	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\"}
-	numFiles            	8
+	numFiles            	4
 	numRows             	6
 	rawDataSize         	120
-	totalSize           	2400
+	totalSize           	2004
 #### A masked pattern was here ####
 
 # Storage Information
@@ -1243,10 +1243,10 @@ Table:              	over1k_part_buck_sort_orc
 #### A masked pattern was here ####
 Partition Parameters:
 	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\"}
-	numFiles            	8
+	numFiles            	4
 	numRows             	6
 	rawDataSize         	120
-	totalSize           	2400
+	totalSize           	2004
 #### A masked pattern was here ####
 
 # Storage Information
diff --git a/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out b/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out
index fbeea6b..5292106 100644
--- a/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out
+++ b/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out
@@ -1074,7 +1074,7 @@ Table:              	over1k_part_buck
 #### A masked pattern was here ####
 Partition Parameters:
 	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\"}
-	numFiles            	8
+	numFiles            	4
 	numRows             	6
 	rawDataSize         	156
 	totalSize           	162
@@ -1156,7 +1156,7 @@ Table:              	over1k_part_buck_sort
 #### A masked pattern was here ####
 Partition Parameters:
 	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\"}
-	numFiles            	8
+	numFiles            	4
 	numRows             	6
 	rawDataSize         	156
 	totalSize           	162
diff --git a/ql/src/test/results/clientpositive/tez/union_fast_stats.q.out b/ql/src/test/results/clientpositive/tez/union_fast_stats.q.out
index 41c0d71..c703e84 100644
--- a/ql/src/test/results/clientpositive/tez/union_fast_stats.q.out
+++ b/ql/src/test/results/clientpositive/tez/union_fast_stats.q.out
@@ -120,7 +120,7 @@ Table Parameters:
 	numFiles            	4
 	numRows             	0
 	rawDataSize         	0
-	totalSize           	4211
+	totalSize           	4003
 #### A masked pattern was here ####
 
 # Storage Information
@@ -171,9 +171,9 @@ Table Type:         	MANAGED_TABLE
 Table Parameters:
 	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\"}
 	numFiles            	4
-	numRows             	15
-	rawDataSize         	3651
-	totalSize           	4211
+	numRows             	0
+	rawDataSize         	0
+	totalSize           	4003
 #### A masked pattern was here ####
 
 # Storage Information
@@ -236,9 +236,9 @@ Table Type:         	MANAGED_TABLE
 Table Parameters:
 	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\"}
 	numFiles            	5
-	numRows             	20
-	rawDataSize         	4720
-	totalSize           	5568
+	numRows             	5
+	rawDataSize         	1069
+	totalSize           	5360
 #### A masked pattern was here ####
 
 # Storage Information
diff --git a/ql/src/test/results/clientpositive/tez/vector_outer_join1.q.out b/ql/src/test/results/clientpositive/tez/vector_outer_join1.q.out
index d962621..4e2e62c 100644
--- a/ql/src/test/results/clientpositive/tez/vector_outer_join1.q.out
+++ b/ql/src/test/results/clientpositive/tez/vector_outer_join1.q.out
@@ -184,11 +184,11 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean)
                     outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                     Map Join Operator
                       condition map:
                           Left Outer Join0 to 1
@@ -198,11 +198,11 @@ STAGE PLANS:
                       outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23
                       input vertices:
                         1 Map 2
-                      Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE
                       HybridGraceHashJoin: true
                       File Output Operator
                         compressed: false
-                        Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE
                         table:
                             input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                             output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -212,16 +212,16 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean)
                     outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                     Reduce Output Operator
                       key expressions: _col2 (type: int)
                       sort order: +
                       Map-reduce partition columns: _col2 (type: int)
-                      Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                       value expressions: _col0 (type: tinyint), _col1 (type: smallint), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean)
             Execution mode: vectorized
@@ -296,11 +296,11 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ctinyint (type: tinyint)
                     outputColumnNames: _col0
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                     Map Join Operator
                       condition map:
                           Left Outer Join0 to 1
@@ -310,11 +310,11 @@ STAGE PLANS:
                       outputColumnNames: _col0
                       input vertices:
                         1 Map 2
-                      Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE
                       HybridGraceHashJoin: true
                       File Output Operator
                         compressed: false
-                        Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE
                         table:
                             input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                             output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -324,16 +324,16 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ctinyint (type: tinyint)
                     outputColumnNames: _col0
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                     Reduce Output Operator
                       key expressions: _col0 (type: tinyint)
                       sort order: +
                       Map-reduce partition columns: _col0 (type: tinyint)
-                      Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
             Execution mode: vectorized
 
   Stage: Stage-0
@@ -500,11 +500,11 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ctinyint (type: tinyint), cint (type: int)
                     outputColumnNames: _col0, _col1
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                     Map Join Operator
                       condition map:
                           Left Outer Join0 to 1
@@ -514,7 +514,7 @@ STAGE PLANS:
                       outputColumnNames: _col0
                       input vertices:
                         1 Map 3
-                      Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE
                       HybridGraceHashJoin: true
                       Map Join Operator
                         condition map:
@@ -525,7 +525,7 @@ STAGE PLANS:
                         outputColumnNames: _col0
                         input vertices:
                           1 Map 4
-                        Statistics: Num rows: 17 Data size: 4417 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 17 Data size: 4214 Basic stats: COMPLETE Column stats: NONE
                         HybridGraceHashJoin: true
                         Group By Operator
                           aggregations: count(), sum(_col0)
@@ -541,31 +541,31 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: cint (type: int)
                     outputColumnNames: _col0
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                     Reduce Output Operator
                       key expressions: _col0 (type: int)
                       sort order: +
                       Map-reduce partition columns: _col0 (type: int)
-                      Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
             Execution mode: vectorized
         Map 4
             Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ctinyint (type: tinyint)
                     outputColumnNames: _col0
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                     Reduce Output Operator
                       key expressions: _col0 (type: tinyint)
                       sort order: +
                       Map-reduce partition columns: _col0 (type: tinyint)
-                      Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
             Execution mode: vectorized
         Reducer 2
             Execution mode: vectorized
diff --git a/ql/src/test/results/clientpositive/tez/vector_outer_join4.q.out b/ql/src/test/results/clientpositive/tez/vector_outer_join4.q.out
index 9db8e00..a6690b6 100644
--- a/ql/src/test/results/clientpositive/tez/vector_outer_join4.q.out
+++ b/ql/src/test/results/clientpositive/tez/vector_outer_join4.q.out
@@ -214,11 +214,11 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean)
                     outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
-                    Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                     Map Join Operator
                       condition map:
                           Left Outer Join0 to 1
@@ -228,11 +228,11 @@ STAGE PLANS:
                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23
                       input vertices:
                         1 Map 2
-                      Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE
                       HybridGraceHashJoin: true
                       File Output Operator
                         compressed: false
-                        Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE
                         table:
                             input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                             output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -242,16 +242,16 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean)
                     outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
-                    Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                     Reduce Output Operator
                       key expressions: _col2 (type: int)
                       sort order: +
                       Map-reduce partition columns: _col2 (type: int)
-                      Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                       value expressions: _col0 (type: tinyint), _col1 (type: smallint), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean)
             Execution mode: vectorized
@@ -361,11 +361,11 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ctinyint (type: tinyint)
                     outputColumnNames: _col0
-                    Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                     Map Join Operator
                       condition map:
                           Left Outer Join0 to 1
@@ -375,11 +375,11 @@ STAGE PLANS:
                       outputColumnNames: _col0
                       input vertices:
                         1 Map 2
-                      Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE
                       HybridGraceHashJoin: true
                       File Output Operator
                         compressed: false
-                        Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE
                         table:
                             input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                             output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -389,16 +389,16 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ctinyint (type: tinyint)
                     outputColumnNames: _col0
-                    Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                     Reduce Output Operator
                       key expressions: _col0 (type: tinyint)
                       sort order: +
                       Map-reduce partition columns: _col0 (type: tinyint)
-                      Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
             Execution mode: vectorized
 
   Stage: Stage-0
@@ -870,11 +870,11 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ctinyint (type: tinyint), cint (type: int)
                     outputColumnNames: _col0, _col1
-                    Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                     Map Join Operator
                       condition map:
                           Left Outer Join0 to 1
@@ -884,7 +884,7 @@ STAGE PLANS:
                       outputColumnNames: _col0
                       input vertices:
                         1 Map 3
-                      Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE
                       HybridGraceHashJoin: true
                       Map Join Operator
                         condition map:
@@ -894,7 +894,7 @@ STAGE PLANS:
                           1 _col0 (type: tinyint)
                         input vertices:
                           1 Map 4
-                        Statistics: Num rows: 36 Data size: 8476 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 36 Data size: 8273 Basic stats: COMPLETE Column stats: NONE
                         HybridGraceHashJoin: true
                         Group By Operator
                           aggregations: count()
@@ -910,31 +910,31 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: cint (type: int)
                     outputColumnNames: _col0
-                    Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                     Reduce Output Operator
                       key expressions: _col0 (type: int)
                       sort order: +
                       Map-reduce partition columns: _col0 (type: int)
-                      Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
             Execution mode: vectorized
         Map 4
            Map Operator Tree:
                 TableScan
                   alias: c
-                  Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ctinyint (type: tinyint)
                     outputColumnNames: _col0
-                    Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                     Reduce Output Operator
                       key expressions: _col0 (type: tinyint)
                       sort order: +
                       Map-reduce partition columns: _col0 (type: tinyint)
-                      Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
             Execution mode: vectorized
        Reducer 2
            Execution mode: vectorized
diff --git a/ql/src/test/results/clientpositive/union_fast_stats.q.out b/ql/src/test/results/clientpositive/union_fast_stats.q.out
index a02ff04..e908ec0 100644
--- a/ql/src/test/results/clientpositive/union_fast_stats.q.out
+++ b/ql/src/test/results/clientpositive/union_fast_stats.q.out
@@ -117,10 +117,10 @@ Retention:          	0
 Table Type:         	MANAGED_TABLE
 Table Parameters:
 	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\"}
-	numFiles            	4
+	numFiles            	3
 	numRows             	15
 	rawDataSize         	3483
-	totalSize           	4211
+	totalSize           	4003
 #### A masked pattern was here ####
 
 # Storage Information
@@ -170,10 +170,10 @@ Retention:          	0
 Table Type:         	MANAGED_TABLE
 Table Parameters:
 	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\"}
-	numFiles            	4
+	numFiles            	3
 	numRows             	15
-	rawDataSize         	3651
-	totalSize           	4211
+	rawDataSize         	3483
+	totalSize           	4003
 #### A masked pattern was here ####
 
 # Storage Information
@@ -235,10 +235,10 @@ Retention:          	0
 Table Type:         	MANAGED_TABLE
 Table Parameters:
 	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\"}
-	numFiles            	5
+	numFiles            	4
 	numRows             	20
-	rawDataSize         	4720
-	totalSize           	5568
+	rawDataSize         	4552
+	totalSize           	5360
 #### A masked pattern was here ####
 
 # Storage Information
diff --git a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java
index 31060a2..9a3a31c 100644
--- a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java
+++ b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java
@@ -164,7 +164,7 @@ public RecordReader getRecordReader(InputSplit split,
     Iterator it = result.iterator();
     while (it.hasNext()) {
       FileStatus stat = it.next();
-      if (!stat.isFile()) {
+      if (!stat.isFile() || (stat.getLen() == 0 && !stat.getPath().toUri().getScheme().equals("nullscan"))) {
         it.remove();
       }
     }
-- 
1.7.12.4 (Apple Git-37)