diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 8fc834efb4..2c4d758f5e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -3172,7 +3172,7 @@ private boolean validateAggregationDesc(AggregationDesc aggDesc, GroupByDesc.Mod } // The planner seems to pull this one out. - if (aggDesc.getDistinct()) { + if (groupByMode != GroupByDesc.Mode.HASH && aggDesc.getDistinct()) { setExpressionIssue("Aggregation Function", "DISTINCT not supported"); return false; } diff --git ql/src/test/queries/clientpositive/vectorized_distinct_gby.q ql/src/test/queries/clientpositive/vectorized_distinct_gby.q index a64a60718f..07d7062762 100644 --- ql/src/test/queries/clientpositive/vectorized_distinct_gby.q +++ ql/src/test/queries/clientpositive/vectorized_distinct_gby.q @@ -18,3 +18,15 @@ select sum(distinct a), count(distinct a) from dtest; explain vectorization detail select sum(distinct cint), count(distinct cint), avg(distinct cint), std(distinct cint) from alltypesorc; select sum(distinct cint), count(distinct cint), avg(distinct cint), std(distinct cint) from alltypesorc; + + +explain vectorization detail +select ctinyint, count(distinct cint), sum(( CASE WHEN ( ( cstring1 LIKE +'test%1' ) OR ( cstring1 LIKE 'test%2' ) ) THEN 1 ELSE 0 END )) AS s,max(( CASE +WHEN ( ( cstring1 LIKE 'test%3' ) OR ( cstring1 LIKE '%test%5' ) ) THEN cstring1 ELSE +'XXXXX' END )) AS maxVal from alltypesorc group by ctinyint; + +select ctinyint, count(distinct cint), sum(( CASE WHEN ( ( cstring1 LIKE +'test%1' ) OR ( cstring1 LIKE 'test%2' ) ) THEN 1 ELSE 0 END )) AS s,max(( CASE +WHEN ( ( cstring1 LIKE 'test%3' ) OR ( cstring1 LIKE '%test%5' ) ) THEN cstring1 ELSE +'XXXXX' END )) AS maxVal from alltypesorc group by ctinyint; \ No newline at end of file diff --git ql/src/test/results/clientpositive/vectorized_distinct_gby.q.out ql/src/test/results/clientpositive/vectorized_distinct_gby.q.out index 1a9c3bd7fa..913fcdc16b 100644 --- ql/src/test/results/clientpositive/vectorized_distinct_gby.q.out +++ ql/src/test/results/clientpositive/vectorized_distinct_gby.q.out @@ -41,13 +41,28 @@ STAGE PLANS: TableScan alias: dtest Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:a:int, 1:b:int, 2:ROW__ID:struct] Select Operator expressions: a (type: int) outputColumnNames: a + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0] Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(DISTINCT a), count(DISTINCT a) bucketGroup: true + Group By Vectorization: + aggregators: VectorUDAFSumLong(col 0:int) -> bigint, VectorUDAFCount(col 0:int) -> bigint + className: VectorGroupByOperator + groupByMode: HASH + keyExpressions: col 0:int + native: false + vectorProcessingMode: HASH + projectedOutputColumnNums: [0, 1] keys: a (type: int) minReductionHashAggr: 0.99 mode: hash @@ -57,13 +72,28 @@ STAGE PLANS: key expressions: _col0 (type: int) null sort order: z sort order: + + Reduce Sink Vectorization: + className: VectorReduceSinkOperator + native: false + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false, No DISTINCT columns IS false Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized Map Vectorization: enabled: true enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [DECIMAL_64] + featureSupportInUse: [DECIMAL_64] inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat - notVectorizedReason: Aggregation Function expression for GROUPBY operator: DISTINCT not supported - vectorized: false + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + rowBatchContext: + dataColumnCount: 2 + includeColumns: [0] + dataColumns: a:int, b:int + partitionColumnCount: 0 + scratchColumnTypeNames: [] Reduce Vectorization: enabled: false enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true @@ -122,12 +152,28 @@ STAGE PLANS: TableScan alias: alltypesorc Statistics: Num rows: 12288 Data size: 36696 Basic stats: COMPLETE Column stats: COMPLETE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:ctinyint:tinyint, 1:csmallint:smallint, 2:cint:int, 3:cbigint:bigint, 4:cfloat:float, 5:cdouble:double, 6:cstring1:string, 7:cstring2:string, 8:ctimestamp1:timestamp, 9:ctimestamp2:timestamp, 10:cboolean1:boolean, 11:cboolean2:boolean, 12:ROW__ID:struct] Select Operator expressions: cint (type: int), UDFToDouble(cint) (type: double), (UDFToDouble(cint) * UDFToDouble(cint)) (type: double) outputColumnNames: _col0, _col1, _col2 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [2, 13, 16] + selectExpressions: CastLongToDouble(col 2:int) -> 13:double, DoubleColMultiplyDoubleColumn(col 14:double, col 15:double)(children: CastLongToDouble(col 2:int) -> 14:double, CastLongToDouble(col 2:int) -> 15:double) -> 16:double Statistics: Num rows: 12288 Data size: 36696 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(DISTINCT _col0), count(DISTINCT _col0), sum(DISTINCT _col2), sum(DISTINCT _col1) + Group By Vectorization: + aggregators: VectorUDAFSumLong(col 2:int) -> bigint, VectorUDAFCount(col 2:int) -> bigint, VectorUDAFSumDouble(col 16:double) -> double, VectorUDAFSumDouble(col 13:double) -> double + className: VectorGroupByOperator + groupByMode: HASH + keyExpressions: col 2:int, col 16:double, col 13:double + native: false + vectorProcessingMode: HASH + projectedOutputColumnNums: [0, 1, 2, 3] keys: _col0 (type: int), _col2 (type: double), _col1 (type: double) minReductionHashAggr: 0.99 mode: hash @@ -137,13 +183,28 @@ STAGE PLANS: key expressions: _col0 (type: int), _col1 (type: double), _col2 (type: double) null sort order: zzz sort order: +++ + Reduce Sink Vectorization: + className: VectorReduceSinkOperator + native: false + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false, No DISTINCT columns IS false Statistics: Num rows: 12288 Data size: 601608 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized Map Vectorization: enabled: true enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [DECIMAL_64] + featureSupportInUse: [DECIMAL_64] inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat - notVectorizedReason: Aggregation Function expression for GROUPBY operator: DISTINCT not supported - vectorized: false + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + rowBatchContext: + dataColumnCount: 12 + includeColumns: [2] + dataColumns: ctinyint:tinyint, csmallint:smallint, cint:int, cbigint:bigint, cfloat:float, cdouble:double, cstring1:string, cstring2:string, ctimestamp1:timestamp, ctimestamp2:timestamp, cboolean1:boolean, cboolean2:boolean + partitionColumnCount: 0 + scratchColumnTypeNames: [double, double, double, double] Reduce Vectorization: enabled: false enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true @@ -181,3 +242,256 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@alltypesorc #### A masked pattern was here #### -3482841611 6082 -572647.4204209142 6.153814687328982E8 +PREHOOK: query: explain vectorization detail +select ctinyint, count(distinct cint), sum(( CASE WHEN ( ( cstring1 LIKE +'test%1' ) OR ( cstring1 LIKE 'test%2' ) ) THEN 1 ELSE 0 END )) AS s,max(( CASE +WHEN ( ( cstring1 LIKE 'test%3' ) OR ( cstring1 LIKE '%test%5' ) ) THEN cstring1 ELSE +'XXXXX' END )) AS maxVal from alltypesorc group by ctinyint +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: explain vectorization detail +select ctinyint, count(distinct cint), sum(( CASE WHEN ( ( cstring1 LIKE +'test%1' ) OR ( cstring1 LIKE 'test%2' ) ) THEN 1 ELSE 0 END )) AS s,max(( CASE +WHEN ( ( cstring1 LIKE 'test%3' ) OR ( cstring1 LIKE '%test%5' ) ) THEN cstring1 ELSE +'XXXXX' END )) AS maxVal from alltypesorc group by ctinyint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: alltypesorc + Statistics: Num rows: 12288 Data size: 935842 Basic stats: COMPLETE Column stats: COMPLETE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:ctinyint:tinyint, 1:csmallint:smallint, 2:cint:int, 3:cbigint:bigint, 4:cfloat:float, 5:cdouble:double, 6:cstring1:string, 7:cstring2:string, 8:ctimestamp1:timestamp, 9:ctimestamp2:timestamp, 10:cboolean1:boolean, 11:cboolean2:boolean, 12:ROW__ID:struct] + Select Operator + expressions: ctinyint (type: tinyint), cint (type: int), CASE WHEN (((cstring1 like 'test%1') or (cstring1 like 'test%2'))) THEN (1) ELSE (0) END (type: int), CASE WHEN (((cstring1 like 'test%3') or (cstring1 like '%test%5'))) THEN (cstring1) ELSE ('XXXXX') END (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 2, 16, 20] + selectExpressions: IfExprLongScalarLongScalar(col 15:boolean, val 1, val 0)(children: ColOrCol(col 13:boolean, col 14:boolean)(children: SelectStringColLikeStringScalar(col 6:string) -> 13:boolean, SelectStringColLikeStringScalar(col 6:string) -> 14:boolean) -> 15:boolean) -> 16:int, IfExprStringGroupColumnStringScalar(col 19:boolean, col 6:string, val XXXXX)(children: ColOrCol(col 17:boolean, col 18:boolean)(children: SelectStringColLikeStringScalar(col 6:string) -> 17:boolean, SelectStringColLikeStringScalar(col 6:string) -> 18:boolean) -> 19:boolean) -> 20:string + Statistics: Num rows: 12288 Data size: 935842 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(DISTINCT _col1), sum(_col2), max(_col3) + Group By Vectorization: + aggregators: VectorUDAFCount(col 2:int) -> bigint, VectorUDAFSumLong(col 16:int) -> bigint, VectorUDAFMaxString(col 20:string) -> string + className: VectorGroupByOperator + groupByMode: HASH + keyExpressions: col 0:tinyint, col 2:int + native: false + vectorProcessingMode: HASH + projectedOutputColumnNums: [0, 1, 2] + keys: _col0 (type: tinyint), _col1 (type: int) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 6144 Data size: 1265496 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: tinyint), _col1 (type: int) + null sort order: zz + sort order: ++ + Map-reduce partition columns: _col0 (type: tinyint) + Reduce Sink Vectorization: + className: VectorReduceSinkOperator + native: false + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false, No DISTINCT columns IS false + Statistics: Num rows: 6144 Data size: 1265496 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col3 (type: bigint), _col4 (type: string) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [DECIMAL_64] + featureSupportInUse: [DECIMAL_64] + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + rowBatchContext: + dataColumnCount: 12 + includeColumns: [0, 2, 6] + dataColumns: ctinyint:tinyint, csmallint:smallint, cint:int, cbigint:bigint, cfloat:float, cdouble:double, cstring1:string, cstring2:string, ctimestamp1:timestamp, ctimestamp2:timestamp, cboolean1:boolean, cboolean2:boolean + partitionColumnCount: 0 + scratchColumnTypeNames: [bigint, bigint, bigint, bigint, bigint, bigint, bigint, string] + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: count(DISTINCT KEY._col1:0._col0), sum(VALUE._col1), max(VALUE._col2) + keys: KEY._col0 (type: tinyint) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 131 Data size: 26596 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 131 Data size: 26596 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select ctinyint, count(distinct cint), sum(( CASE WHEN ( ( cstring1 LIKE +'test%1' ) OR ( cstring1 LIKE 'test%2' ) ) THEN 1 ELSE 0 END )) AS s,max(( CASE +WHEN ( ( cstring1 LIKE 'test%3' ) OR ( cstring1 LIKE '%test%5' ) ) THEN cstring1 ELSE +'XXXXX' END )) AS maxVal from alltypesorc group by ctinyint +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ctinyint, count(distinct cint), sum(( CASE WHEN ( ( cstring1 LIKE +'test%1' ) OR ( cstring1 LIKE 'test%2' ) ) THEN 1 ELSE 0 END )) AS s,max(( CASE +WHEN ( ( cstring1 LIKE 'test%3' ) OR ( cstring1 LIKE '%test%5' ) ) THEN cstring1 ELSE +'XXXXX' END )) AS maxVal from alltypesorc group by ctinyint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +-64 3 0 XXXXX +-63 3 0 XXXXX +-62 3 0 XXXXX +-61 3 0 XXXXX +-60 3 0 XXXXX +-59 3 0 XXXXX +-58 3 0 XXXXX +-57 3 0 XXXXX +-56 3 0 XXXXX +-55 3 0 XXXXX +-54 3 0 XXXXX +-53 3 0 XXXXX +-52 3 0 XXXXX +-51 1009 0 XXXXX +-50 3 0 XXXXX +-49 3 0 XXXXX +-48 3 0 XXXXX +-47 3 0 XXXXX +-46 3 0 XXXXX +-45 3 0 XXXXX +-44 3 0 XXXXX +-43 3 0 XXXXX +-42 3 0 XXXXX +-41 3 0 XXXXX +-40 3 0 XXXXX +-39 3 0 XXXXX +-38 3 0 XXXXX +-37 3 0 XXXXX +-36 3 0 XXXXX +-35 3 0 XXXXX +-34 3 0 XXXXX +-33 3 0 XXXXX +-32 3 0 XXXXX +-31 3 0 XXXXX +-30 3 0 XXXXX +-29 3 0 XXXXX +-28 3 0 XXXXX +-27 3 0 XXXXX +-26 3 0 XXXXX +-25 3 0 XXXXX +-24 3 0 XXXXX +-23 3 0 XXXXX +-22 3 0 XXXXX +-21 3 0 XXXXX +-20 3 0 XXXXX +-19 3 0 XXXXX +-18 3 0 XXXXX +-17 3 0 XXXXX +-16 3 0 XXXXX +-15 3 0 XXXXX +-14 3 0 XXXXX +-13 3 0 XXXXX +-12 3 0 XXXXX +-11 3 0 XXXXX +-10 3 0 XXXXX +-9 3 0 XXXXX +-8 3 0 XXXXX +-7 3 0 XXXXX +-6 3 0 XXXXX +-5 3 0 XXXXX +-4 3 0 XXXXX +-3 3 0 XXXXX +-2 3 0 XXXXX +-1 3 0 XXXXX +0 3 0 XXXXX +1 3 0 XXXXX +2 3 0 XXXXX +3 3 0 XXXXX +4 3 0 XXXXX +5 3 0 XXXXX +6 3 0 XXXXX +7 3 0 XXXXX +8 1011 0 XXXXX +9 3 0 XXXXX +10 3 0 XXXXX +11 1011 0 XXXXX +12 3 0 XXXXX +13 3 0 XXXXX +14 3 0 XXXXX +15 3 0 XXXXX +16 3 0 XXXXX +17 3 0 XXXXX +18 3 0 XXXXX +19 3 0 XXXXX +20 3 0 XXXXX +21 3 0 XXXXX +22 3 0 XXXXX +23 3 0 XXXXX +24 3 0 XXXXX +25 3 0 XXXXX +26 3 0 XXXXX +27 3 0 XXXXX +28 3 0 XXXXX +29 3 0 XXXXX +30 3 0 XXXXX +31 3 0 XXXXX +32 3 0 XXXXX +33 3 0 XXXXX +34 3 0 XXXXX +35 3 0 XXXXX +36 3 0 XXXXX +37 3 0 XXXXX +38 3 0 XXXXX +39 3 0 XXXXX +40 3 0 XXXXX +41 3 0 XXXXX +42 3 0 XXXXX +43 3 0 XXXXX +44 3 0 XXXXX +45 3 0 XXXXX +46 3 0 XXXXX +47 3 0 XXXXX +48 3 0 XXXXX +49 3 0 XXXXX +50 3 0 XXXXX +51 3 0 XXXXX +52 3 0 XXXXX +53 3 0 XXXXX +54 3 0 XXXXX +55 3 0 XXXXX +56 3 0 XXXXX +57 3 0 XXXXX +58 3 0 XXXXX +59 3 0 XXXXX +60 3 0 XXXXX +61 3 0 XXXXX +62 3 0 XXXXX +NULL 3066 0 XXXXX