diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SetSparkReducerParallelism.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SetSparkReducerParallelism.java index e808a4f..1999747 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SetSparkReducerParallelism.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SetSparkReducerParallelism.java @@ -18,10 +18,13 @@ package org.apache.hadoop.hive.ql.optimizer.spark; +import java.util.Collection; +import java.util.EnumSet; import java.util.List; import java.util.Set; import java.util.Stack; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.common.ObjectPair; @@ -50,6 +53,8 @@ import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; import org.apache.hadoop.hive.ql.stats.StatsUtils; +import static org.apache.hadoop.hive.ql.plan.ReduceSinkDesc.ReducerTraits.UNIFORM; + /** * SetSparkReducerParallelism determines how many reducers should * be run for a given reduce sink, clone from SetReducerParallelism. @@ -120,41 +125,64 @@ public Object process(Node nd, Stack stack, } } - long numberOfBytes = 0; - - if (useOpStats) { - // we need to add up all the estimates from the siblings of this reduce sink - for (Operator sibling - : sink.getChildOperators().get(0).getParentOperators()) { - if (sibling.getStatistics() != null) { - numberOfBytes = StatsUtils.safeAdd(numberOfBytes, sibling.getStatistics().getDataSize()); - if (LOG.isDebugEnabled()) { - LOG.debug("Sibling " + sibling + " has stats: " + sibling.getStatistics()); - } - } else { - LOG.warn("No stats available from: " + sibling); - } - } - } else if (parentSinks.isEmpty()) { - // Not using OP stats and this is the first sink in the path, meaning that - // we should use TS stats to infer parallelism - for (Operator sibling - : sink.getChildOperators().get(0).getParentOperators()) { - Set sources = - OperatorUtils.findOperatorsUpstream(sibling, TableScanOperator.class); - for (TableScanOperator source : sources) { - if (source.getStatistics() != null) { - numberOfBytes = StatsUtils.safeAdd(numberOfBytes, source.getStatistics().getDataSize()); + if (useOpStats || parentSinks.isEmpty()) { + long numberOfBytes = 0; + if (useOpStats) { + // we need to add up all the estimates from the siblings of this reduce sink + for (Operator sibling + : sink.getChildOperators().get(0).getParentOperators()) { + if (sibling.getStatistics() != null) { + numberOfBytes = StatsUtils.safeAdd(numberOfBytes, sibling.getStatistics().getDataSize()); if (LOG.isDebugEnabled()) { - LOG.debug("Table source " + source + " has stats: " + source.getStatistics()); + LOG.debug("Sibling " + sibling + " has stats: " + sibling.getStatistics()); } } else { - LOG.warn("No stats available from table source: " + source); + LOG.warn("No stats available from: " + sibling); } } + } else { + // Not using OP stats and this is the first sink in the path, meaning that + // we should use TS stats to infer parallelism + for (Operator sibling + : sink.getChildOperators().get(0).getParentOperators()) { + Set sources = + OperatorUtils.findOperatorsUpstream(sibling, TableScanOperator.class); + for (TableScanOperator source : sources) { + if (source.getStatistics() != null) { + numberOfBytes = StatsUtils.safeAdd(numberOfBytes, source.getStatistics().getDataSize()); + if (LOG.isDebugEnabled()) { + LOG.debug("Table source " + source + " has stats: " + source.getStatistics()); + } + } else { + LOG.warn("No stats available from table source: " + source); + } + } + } + LOG.debug("Gathered stats for sink " + sink + ". Total size is " + + numberOfBytes + " bytes."); + } + + // Divide it by 2 so that we can have more reducers + long bytesPerReducer = context.getConf().getLongVar(HiveConf.ConfVars.BYTESPERREDUCER) / 2; + int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, + maxReducers, false); + + getSparkMemoryAndCores(context); + if (sparkMemoryAndCores != null && + sparkMemoryAndCores.getFirst() > 0 && sparkMemoryAndCores.getSecond() > 0) { + // warn the user if bytes per reducer is much larger than memory per task + if ((double) sparkMemoryAndCores.getFirst() / bytesPerReducer < 0.5) { + LOG.warn("Average load of a reducer is much larger than its available memory. " + + "Consider decreasing hive.exec.reducers.bytes.per.reducer"); + } + + // If there are more cores, use the number of cores + numReducers = Math.max(numReducers, sparkMemoryAndCores.getSecond()); } - LOG.debug("Gathered stats for sink " + sink + ". Total size is " - + numberOfBytes + " bytes."); + numReducers = Math.min(numReducers, maxReducers); + LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers + + " (calculated)"); + desc.setNumReducers(numReducers); } else { // Use the maximum parallelism from all parent reduce sinks int numberOfReducers = 0; @@ -164,30 +192,14 @@ public Object process(Node nd, Stack stack, desc.setNumReducers(numberOfReducers); LOG.debug("Set parallelism for sink " + sink + " to " + numberOfReducers + " based on its parents"); - return false; } - - // Divide it by 2 so that we can have more reducers - long bytesPerReducer = context.getConf().getLongVar(HiveConf.ConfVars.BYTESPERREDUCER) / 2; - int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, - maxReducers, false); - - getSparkMemoryAndCores(context); - if (sparkMemoryAndCores != null && - sparkMemoryAndCores.getFirst() > 0 && sparkMemoryAndCores.getSecond() > 0) { - // warn the user if bytes per reducer is much larger than memory per task - if ((double) sparkMemoryAndCores.getFirst() / bytesPerReducer < 0.5) { - LOG.warn("Average load of a reducer is much larger than its available memory. " + - "Consider decreasing hive.exec.reducers.bytes.per.reducer"); - } - - // If there are more cores, use the number of cores - numReducers = Math.max(numReducers, sparkMemoryAndCores.getSecond()); + final Collection keyCols = + ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getKeyCols()); + final Collection partCols = + ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getPartitionCols()); + if (keyCols != null && keyCols.equals(partCols)) { + desc.setReducerTraits(EnumSet.of(UNIFORM)); } - numReducers = Math.min(numReducers, maxReducers); - LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers + - " (calculated)"); - desc.setNumReducers(numReducers); } } else { LOG.info("Number of reducers for sink " + sink + " was already determined to be: " + desc.getNumReducers()); diff --git a/ql/src/test/queries/clientpositive/ppd_join_filter.q b/ql/src/test/queries/clientpositive/ppd_join_filter.q index 418f4c5..e909b3d 100644 --- a/ql/src/test/queries/clientpositive/ppd_join_filter.q +++ b/ql/src/test/queries/clientpositive/ppd_join_filter.q @@ -2,6 +2,8 @@ set hive.mapred.mode=nonstrict; set hive.optimize.ppd=true; set hive.ppd.remove.duplicatefilters=false; +-- SORT_QUERY_RESULTS + explain extended select a.key, b.k2, b.k3 from src a join ( diff --git a/ql/src/test/results/clientpositive/spark/groupby3.q.out b/ql/src/test/results/clientpositive/spark/groupby3.q.out index 23871ba..a34e89e1 100644 --- a/ql/src/test/results/clientpositive/spark/groupby3.q.out +++ b/ql/src/test/results/clientpositive/spark/groupby3.q.out @@ -148,4 +148,4 @@ POSTHOOK: query: SELECT dest1.* FROM dest1 POSTHOOK: type: QUERY POSTHOOK: Input: default@dest1 #### A masked pattern was here #### -130091.0 260.182 256.10355987055016 98.0 0.0 142.9268095075238 143.06995106518906 20428.072876000002 20469.010897795593 +130091.0 260.182 256.10355987055016 98.0 0.0 142.92680950752379 143.06995106518903 20428.07287599999 20469.010897795582 diff --git a/ql/src/test/results/clientpositive/spark/union_remove_10.q.out b/ql/src/test/results/clientpositive/spark/union_remove_10.q.out index 7ae8479..18438d2 100644 --- a/ql/src/test/results/clientpositive/spark/union_remove_10.q.out +++ b/ql/src/test/results/clientpositive/spark/union_remove_10.q.out @@ -232,7 +232,7 @@ Retention: 0 Table Type: MANAGED_TABLE Table Parameters: numFiles 4 - totalSize 350 + totalSize 348 #### A masked pattern was here #### # Storage Information diff --git a/ql/src/test/results/clientpositive/spark/union_remove_13.q.out b/ql/src/test/results/clientpositive/spark/union_remove_13.q.out index 0d0d841..e83f927 100644 --- a/ql/src/test/results/clientpositive/spark/union_remove_13.q.out +++ b/ql/src/test/results/clientpositive/spark/union_remove_13.q.out @@ -246,7 +246,7 @@ Retention: 0 Table Type: MANAGED_TABLE Table Parameters: numFiles 3 - totalSize 271 + totalSize 269 #### A masked pattern was here #### # Storage Information diff --git a/ql/src/test/results/clientpositive/spark/union_remove_15.q.out b/ql/src/test/results/clientpositive/spark/union_remove_15.q.out index 96ee309..320d869 100644 --- a/ql/src/test/results/clientpositive/spark/union_remove_15.q.out +++ b/ql/src/test/results/clientpositive/spark/union_remove_15.q.out @@ -178,7 +178,7 @@ Table Parameters: numPartitions 2 numRows 0 rawDataSize 0 - totalSize 336 + totalSize 332 #### A masked pattern was here #### # Storage Information diff --git a/ql/src/test/results/clientpositive/spark/union_remove_16.q.out b/ql/src/test/results/clientpositive/spark/union_remove_16.q.out index 8ea2b00..04b668b 100644 --- a/ql/src/test/results/clientpositive/spark/union_remove_16.q.out +++ b/ql/src/test/results/clientpositive/spark/union_remove_16.q.out @@ -220,7 +220,7 @@ Table Parameters: numPartitions 2 numRows 0 rawDataSize 0 - totalSize 336 + totalSize 332 #### A masked pattern was here #### # Storage Information diff --git a/ql/src/test/results/clientpositive/spark/union_remove_7.q.out b/ql/src/test/results/clientpositive/spark/union_remove_7.q.out index 7341d98..0fabbac 100644 --- a/ql/src/test/results/clientpositive/spark/union_remove_7.q.out +++ b/ql/src/test/results/clientpositive/spark/union_remove_7.q.out @@ -157,7 +157,7 @@ Retention: 0 Table Type: MANAGED_TABLE Table Parameters: numFiles 4 - totalSize 336 + totalSize 332 #### A masked pattern was here #### # Storage Information diff --git a/ql/src/test/results/clientpositive/spark/union_remove_8.q.out b/ql/src/test/results/clientpositive/spark/union_remove_8.q.out index 06ba030..c96f591 100644 --- a/ql/src/test/results/clientpositive/spark/union_remove_8.q.out +++ b/ql/src/test/results/clientpositive/spark/union_remove_8.q.out @@ -182,7 +182,7 @@ Retention: 0 Table Type: MANAGED_TABLE Table Parameters: numFiles 4 - totalSize 350 + totalSize 348 #### A masked pattern was here #### # Storage Information diff --git a/ql/src/test/results/clientpositive/spark/union_remove_9.q.out b/ql/src/test/results/clientpositive/spark/union_remove_9.q.out index 183169b..550b2ad 100644 --- a/ql/src/test/results/clientpositive/spark/union_remove_9.q.out +++ b/ql/src/test/results/clientpositive/spark/union_remove_9.q.out @@ -240,7 +240,7 @@ Retention: 0 Table Type: MANAGED_TABLE Table Parameters: numFiles 4 - totalSize 350 + totalSize 348 #### A masked pattern was here #### # Storage Information diff --git a/ql/src/test/results/clientpositive/spark/vector_between_in.q.out b/ql/src/test/results/clientpositive/spark/vector_between_in.q.out index 976d9d9..a752256 100644 --- a/ql/src/test/results/clientpositive/spark/vector_between_in.q.out +++ b/ql/src/test/results/clientpositive/spark/vector_between_in.q.out @@ -1114,7 +1114,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: boolean) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE @@ -1254,7 +1254,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: boolean) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE @@ -1394,7 +1394,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: boolean) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE @@ -1534,7 +1534,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: boolean) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/spark/vector_cast_constant.q.out b/ql/src/test/results/clientpositive/spark/vector_cast_constant.q.out index c69bc81..f64a6af 100644 --- a/ql/src/test/results/clientpositive/spark/vector_cast_constant.q.out +++ b/ql/src/test/results/clientpositive/spark/vector_cast_constant.q.out @@ -161,7 +161,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 1049 Data size: 311170 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/spark/vector_count_distinct.q.out b/ql/src/test/results/clientpositive/spark/vector_count_distinct.q.out index 9af0786..81b0e15 100644 --- a/ql/src/test/results/clientpositive/spark/vector_count_distinct.q.out +++ b/ql/src/test/results/clientpositive/spark/vector_count_distinct.q.out @@ -1281,7 +1281,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 2000 Data size: 3504000 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/spark/vector_decimal_aggregate.q.out b/ql/src/test/results/clientpositive/spark/vector_decimal_aggregate.q.out index 9994f2b..9ec5a09 100644 --- a/ql/src/test/results/clientpositive/spark/vector_decimal_aggregate.q.out +++ b/ql/src/test/results/clientpositive/spark/vector_decimal_aggregate.q.out @@ -85,7 +85,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 12288 Data size: 2165060 Basic stats: COMPLETE Column stats: NONE @@ -247,7 +247,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 12288 Data size: 2165060 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/spark/vector_distinct_2.q.out b/ql/src/test/results/clientpositive/spark/vector_distinct_2.q.out index aff53a6..3d13156 100644 --- a/ql/src/test/results/clientpositive/spark/vector_distinct_2.q.out +++ b/ql/src/test/results/clientpositive/spark/vector_distinct_2.q.out @@ -156,7 +156,7 @@ STAGE PLANS: sort order: ++ Map-reduce partition columns: _col0 (type: tinyint), _col1 (type: string) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 2000 Data size: 918712 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/spark/vector_groupby_3.q.out b/ql/src/test/results/clientpositive/spark/vector_groupby_3.q.out index 83f8604..1761205 100644 --- a/ql/src/test/results/clientpositive/spark/vector_groupby_3.q.out +++ b/ql/src/test/results/clientpositive/spark/vector_groupby_3.q.out @@ -158,7 +158,7 @@ STAGE PLANS: sort order: ++ Map-reduce partition columns: _col0 (type: tinyint), _col1 (type: string) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 2000 Data size: 918712 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/spark/vector_mapjoin_reduce.q.out b/ql/src/test/results/clientpositive/spark/vector_mapjoin_reduce.q.out index 558a2d0..e4f9162 100644 --- a/ql/src/test/results/clientpositive/spark/vector_mapjoin_reduce.q.out +++ b/ql/src/test/results/clientpositive/spark/vector_mapjoin_reduce.q.out @@ -159,7 +159,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 100 Data size: 11999 Basic stats: COMPLETE Column stats: NONE @@ -436,7 +436,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 100 Data size: 11999 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/spark/vector_orderby_5.q.out b/ql/src/test/results/clientpositive/spark/vector_orderby_5.q.out index dc394c8..6af68f7 100644 --- a/ql/src/test/results/clientpositive/spark/vector_orderby_5.q.out +++ b/ql/src/test/results/clientpositive/spark/vector_orderby_5.q.out @@ -159,7 +159,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: boolean) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 2000 Data size: 918712 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/spark/vector_string_concat.q.out b/ql/src/test/results/clientpositive/spark/vector_string_concat.q.out index 17c79a5..92634d7 100644 --- a/ql/src/test/results/clientpositive/spark/vector_string_concat.q.out +++ b/ql/src/test/results/clientpositive/spark/vector_string_concat.q.out @@ -369,7 +369,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: string) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkStringOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 2000 Data size: 918712 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/spark/vectorization_12.q.out b/ql/src/test/results/clientpositive/spark/vectorization_12.q.out index c17043b..a707ac9 100644 --- a/ql/src/test/results/clientpositive/spark/vectorization_12.q.out +++ b/ql/src/test/results/clientpositive/spark/vectorization_12.q.out @@ -121,11 +121,10 @@ STAGE PLANS: sort order: ++++ Map-reduce partition columns: _col0 (type: double), _col1 (type: bigint), _col2 (type: string), _col3 (type: boolean) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator keyColumns: [0, 1, 2, 3] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [0, 1, 2, 3] valueColumns: [4, 5, 6, 7, 8] Statistics: Num rows: 5006 Data size: 153682 Basic stats: COMPLETE Column stats: NONE value expressions: _col4 (type: bigint), _col5 (type: struct), _col6 (type: struct), _col7 (type: bigint), _col8 (type: struct) diff --git a/ql/src/test/results/clientpositive/spark/vectorization_13.q.out b/ql/src/test/results/clientpositive/spark/vectorization_13.q.out index bae304b..6ecd0b5 100644 --- a/ql/src/test/results/clientpositive/spark/vectorization_13.q.out +++ b/ql/src/test/results/clientpositive/spark/vectorization_13.q.out @@ -123,11 +123,10 @@ STAGE PLANS: sort order: +++++ Map-reduce partition columns: _col0 (type: boolean), _col1 (type: tinyint), _col2 (type: timestamp), _col3 (type: float), _col4 (type: string) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator keyColumns: [0, 1, 2, 3, 4] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [0, 1, 2, 3, 4] valueColumns: [5, 6, 7, 8, 9, 10] Statistics: Num rows: 2730 Data size: 83809 Basic stats: COMPLETE Column stats: NONE value expressions: _col5 (type: tinyint), _col6 (type: double), _col7 (type: struct), _col8 (type: struct), _col9 (type: float), _col10 (type: tinyint) @@ -477,7 +476,7 @@ STAGE PLANS: sort order: +++++ Map-reduce partition columns: _col0 (type: boolean), _col1 (type: tinyint), _col2 (type: timestamp), _col3 (type: float), _col4 (type: string) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 2730 Data size: 83809 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/spark/vectorization_14.q.out b/ql/src/test/results/clientpositive/spark/vectorization_14.q.out index 9d52abe..15926be 100644 --- a/ql/src/test/results/clientpositive/spark/vectorization_14.q.out +++ b/ql/src/test/results/clientpositive/spark/vectorization_14.q.out @@ -124,11 +124,10 @@ STAGE PLANS: sort order: +++++ Map-reduce partition columns: _col0 (type: string), _col1 (type: float), _col2 (type: double), _col3 (type: timestamp), _col4 (type: boolean) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator keyColumns: [0, 1, 2, 3, 4] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [0, 1, 2, 3, 4] valueColumns: [5, 6, 7, 8, 9, 10] Statistics: Num rows: 606 Data size: 18603 Basic stats: COMPLETE Column stats: NONE value expressions: _col5 (type: struct), _col6 (type: float), _col7 (type: struct), _col8 (type: bigint), _col9 (type: struct), _col10 (type: struct) diff --git a/ql/src/test/results/clientpositive/spark/vectorization_15.q.out b/ql/src/test/results/clientpositive/spark/vectorization_15.q.out index cc9ae1d..9a78ee5 100644 --- a/ql/src/test/results/clientpositive/spark/vectorization_15.q.out +++ b/ql/src/test/results/clientpositive/spark/vectorization_15.q.out @@ -119,11 +119,10 @@ STAGE PLANS: sort order: +++++++ Map-reduce partition columns: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator keyColumns: [0, 1, 2, 3, 4, 5, 6] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [0, 1, 2, 3, 4, 5, 6] valueColumns: [7, 8, 9, 10, 11, 12] Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE value expressions: _col7 (type: struct), _col8 (type: double), _col9 (type: struct), _col10 (type: struct), _col11 (type: struct), _col12 (type: struct) diff --git a/ql/src/test/results/clientpositive/spark/vectorization_16.q.out b/ql/src/test/results/clientpositive/spark/vectorization_16.q.out index d5235aa..eb65dc3 100644 --- a/ql/src/test/results/clientpositive/spark/vectorization_16.q.out +++ b/ql/src/test/results/clientpositive/spark/vectorization_16.q.out @@ -96,11 +96,10 @@ STAGE PLANS: sort order: +++ Map-reduce partition columns: _col0 (type: double), _col1 (type: string), _col2 (type: timestamp) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator keyColumns: [0, 1, 2] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [0, 1, 2] valueColumns: [3, 4, 5] Statistics: Num rows: 4096 Data size: 125745 Basic stats: COMPLETE Column stats: NONE value expressions: _col3 (type: bigint), _col4 (type: struct), _col5 (type: double) diff --git a/ql/src/test/results/clientpositive/spark/vectorization_9.q.out b/ql/src/test/results/clientpositive/spark/vectorization_9.q.out index d5235aa..eb65dc3 100644 --- a/ql/src/test/results/clientpositive/spark/vectorization_9.q.out +++ b/ql/src/test/results/clientpositive/spark/vectorization_9.q.out @@ -96,11 +96,10 @@ STAGE PLANS: sort order: +++ Map-reduce partition columns: _col0 (type: double), _col1 (type: string), _col2 (type: timestamp) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator keyColumns: [0, 1, 2] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [0, 1, 2] valueColumns: [3, 4, 5] Statistics: Num rows: 4096 Data size: 125745 Basic stats: COMPLETE Column stats: NONE value expressions: _col3 (type: bigint), _col4 (type: struct), _col5 (type: double) diff --git a/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out b/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out index 4d3e41a..bfb2e4c 100644 --- a/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out +++ b/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out @@ -2209,7 +2209,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: smallint) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 2503 Data size: 76841 Basic stats: COMPLETE Column stats: NONE @@ -2487,7 +2487,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: double) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 2654 Data size: 81476 Basic stats: COMPLETE Column stats: NONE @@ -2809,7 +2809,7 @@ STAGE PLANS: sort order: ++ Map-reduce partition columns: _col0 (type: timestamp), _col1 (type: string) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE @@ -3212,7 +3212,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: boolean) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 10239 Data size: 314333 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/spark/vectorized_ptf.q.out b/ql/src/test/results/clientpositive/spark/vectorized_ptf.q.out index 4972677..7e6ffe8 100644 --- a/ql/src/test/results/clientpositive/spark/vectorized_ptf.q.out +++ b/ql/src/test/results/clientpositive/spark/vectorized_ptf.q.out @@ -368,11 +368,10 @@ STAGE PLANS: sort order: + Map-reduce partition columns: p_partkey (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator keyColumns: [0] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [0] valueColumns: [1, 2, 5] Statistics: Num rows: 26 Data size: 16042 Basic stats: COMPLETE Column stats: NONE value expressions: p_name (type: string), p_mfgr (type: string), p_size (type: int) @@ -410,11 +409,10 @@ STAGE PLANS: sort order: + Map-reduce partition columns: p_partkey (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator keyColumns: [0] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [0] valueColumns: [] Statistics: Num rows: 26 Data size: 16042 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized @@ -1460,11 +1458,10 @@ STAGE PLANS: sort order: + Map-reduce partition columns: p_partkey (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator keyColumns: [0] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [0] valueColumns: [] Statistics: Num rows: 26 Data size: 16042 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized @@ -1638,11 +1635,10 @@ STAGE PLANS: sort order: + Map-reduce partition columns: p_partkey (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator keyColumns: [0] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [0] valueColumns: [] Statistics: Num rows: 26 Data size: 16042 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized @@ -2996,11 +2992,10 @@ STAGE PLANS: sort order: + Map-reduce partition columns: p_partkey (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator keyColumns: [0] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [0] valueColumns: [] Statistics: Num rows: 26 Data size: 16042 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized @@ -4184,11 +4179,10 @@ STAGE PLANS: sort order: + Map-reduce partition columns: p_mfgr (type: string) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkStringOperator keyColumns: [2] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [2] valueColumns: [1, 5] Statistics: Num rows: 26 Data size: 16042 Basic stats: COMPLETE Column stats: NONE value expressions: p_name (type: string), p_size (type: int) @@ -4496,11 +4490,10 @@ STAGE PLANS: sort order: + Map-reduce partition columns: p_mfgr (type: string) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkStringOperator keyColumns: [2] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [2] valueColumns: [1, 5] Statistics: Num rows: 26 Data size: 16042 Basic stats: COMPLETE Column stats: NONE value expressions: p_name (type: string), p_size (type: int) @@ -4803,11 +4796,10 @@ STAGE PLANS: sort order: ++ Map-reduce partition columns: p_mfgr (type: string), p_name (type: string) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator keyColumns: [2, 1] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [2, 1] valueColumns: [5] Statistics: Num rows: 26 Data size: 16042 Basic stats: COMPLETE Column stats: NONE value expressions: p_size (type: int) @@ -5087,11 +5079,10 @@ STAGE PLANS: sort order: ++ Map-reduce partition columns: p_mfgr (type: string), p_name (type: string) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator keyColumns: [2, 1] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [2, 1] valueColumns: [5] Statistics: Num rows: 26 Data size: 16042 Basic stats: COMPLETE Column stats: NONE value expressions: p_size (type: int) @@ -5413,11 +5404,10 @@ STAGE PLANS: sort order: ++ Map-reduce partition columns: p_mfgr (type: string), p_name (type: string) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator keyColumns: [2, 1] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [2, 1] valueColumns: [5] Statistics: Num rows: 26 Data size: 16042 Basic stats: COMPLETE Column stats: NONE value expressions: p_size (type: int) @@ -5709,11 +5699,10 @@ STAGE PLANS: sort order: ++ Map-reduce partition columns: p_mfgr (type: string), p_name (type: string) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkMultiKeyOperator keyColumns: [2, 1] native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - partitionColumns: [2, 1] valueColumns: [5] Statistics: Num rows: 26 Data size: 16042 Basic stats: COMPLETE Column stats: NONE value expressions: p_size (type: int) diff --git a/ql/src/test/results/clientpositive/spark/vectorized_shufflejoin.q.out b/ql/src/test/results/clientpositive/spark/vectorized_shufflejoin.q.out index 18c7db1..4d8620f 100644 --- a/ql/src/test/results/clientpositive/spark/vectorized_shufflejoin.q.out +++ b/ql/src/test/results/clientpositive/spark/vectorized_shufflejoin.q.out @@ -51,7 +51,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE @@ -92,7 +92,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator + className: VectorReduceSinkLongOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE