diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index ed84c0f..1e7dce3 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -303,6 +303,7 @@ minitez.query.files.shared=acid_globallimit.q,\ vector_varchar_4.q,\ vector_varchar_mapjoin1.q,\ vector_varchar_simple.q,\ + vector_when_case_null.q,\ vectorization_0.q,\ vectorization_1.q,\ vectorization_10.q,\ diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java index 4d2430f..9b90f37 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java @@ -649,12 +649,21 @@ public static String displayBytes(byte[] bytes, int start, int length) { public static void debugDisplayOneRow(VectorizedRowBatch batch, int index, String prefix) { StringBuilder sb = new StringBuilder(); sb.append(prefix + " row " + index + " "); - for (int column = 0; column < batch.cols.length; column++) { + for (int p = 0; p < batch.projectionSize; p++) { + int column = batch.projectedColumns[p]; + if (p == column) { + sb.append("(col " + p + ") "); + } else { + sb.append("(proj col " + p + " col " + column + ") "); + } ColumnVector colVector = batch.cols[column]; if (colVector == null) { - sb.append("(null colVector " + column + ")"); + sb.append("(null ColumnVector)"); } else { boolean isRepeating = colVector.isRepeating; + if (isRepeating) { + sb.append("(repeating)"); + } index = (isRepeating ? 0 : index); if (colVector.noNulls || !colVector.isNull[index]) { if (colVector instanceof LongColumnVector) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFArgDesc.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFArgDesc.java index 6abfe63..e735bc1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFArgDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFArgDesc.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; @@ -52,6 +53,17 @@ public VectorUDFArgDesc() { */ public void setConstant(ExprNodeConstantDesc expr) { isConstant = true; + if (expr != null) { + if (expr.getTypeInfo().getCategory() == Category.PRIMITIVE) { + PrimitiveCategory primitiveCategory = ((PrimitiveTypeInfo) expr.getTypeInfo()) + .getPrimitiveCategory(); + if (primitiveCategory == PrimitiveCategory.VOID) { + // Otherwise we'd create a NullWritable and that isn't what we want. + expr = null; + } + } + } + constExpr = expr; } diff --git ql/src/test/queries/clientpositive/vector_when_case_null.q ql/src/test/queries/clientpositive/vector_when_case_null.q new file mode 100644 index 0000000..a423b60 --- /dev/null +++ ql/src/test/queries/clientpositive/vector_when_case_null.q @@ -0,0 +1,14 @@ +set hive.explain.user=false; +SET hive.vectorized.execution.enabled=true; +SET hive.auto.convert.join=true; +set hive.fetch.task.conversion=none; + +-- SORT_QUERY_RESULTS + +create table count_case_groupby (key string, bool boolean) STORED AS orc; +insert into table count_case_groupby values ('key1', true),('key2', false),('key3', NULL),('key4', false),('key5',NULL); + +explain +SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key; + +SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key; \ No newline at end of file diff --git ql/src/test/results/clientpositive/tez/vector_select_null2.q.out ql/src/test/results/clientpositive/tez/vector_select_null2.q.out new file mode 100644 index 0000000..f9dad4e --- /dev/null +++ ql/src/test/results/clientpositive/tez/vector_select_null2.q.out @@ -0,0 +1,95 @@ +PREHOOK: query: -- SORT_QUERY_RESULTS + +create table count_case_groupby (key string, bool boolean) STORED AS orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@count_case_groupby +POSTHOOK: query: -- SORT_QUERY_RESULTS + +create table count_case_groupby (key string, bool boolean) STORED AS orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@count_case_groupby +PREHOOK: query: insert into table count_case_groupby values ('key1', true),('key2', false),('key3', NULL),('key4', false),('key5',NULL) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@count_case_groupby +POSTHOOK: query: insert into table count_case_groupby values ('key1', true),('key2', false),('key3', NULL),('key4', false),('key5',NULL) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@count_case_groupby +POSTHOOK: Lineage: count_case_groupby.bool EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: count_case_groupby.key SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: explain +SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: explain +SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: count_case_groupby + Statistics: Num rows: 5 Data size: 452 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), CASE WHEN (bool) THEN (1) WHEN ((not bool)) THEN (0) ELSE (null) END (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 452 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1) + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 452 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 5 Data size: 452 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@count_case_groupby +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@count_case_groupby +#### A masked pattern was here #### +key1 1 +key2 1 +key3 0 +key4 1 +key5 0 diff --git ql/src/test/results/clientpositive/tez/vector_when_case_null.q.out ql/src/test/results/clientpositive/tez/vector_when_case_null.q.out new file mode 100644 index 0000000..ecba224 --- /dev/null +++ ql/src/test/results/clientpositive/tez/vector_when_case_null.q.out @@ -0,0 +1,97 @@ +PREHOOK: query: -- SORT_QUERY_RESULTS + +create table count_case_groupby (key string, bool boolean) STORED AS orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@count_case_groupby +POSTHOOK: query: -- SORT_QUERY_RESULTS + +create table count_case_groupby (key string, bool boolean) STORED AS orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@count_case_groupby +PREHOOK: query: insert into table count_case_groupby values ('key1', true),('key2', false),('key3', NULL),('key4', false),('key5',NULL) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@count_case_groupby +POSTHOOK: query: insert into table count_case_groupby values ('key1', true),('key2', false),('key3', NULL),('key4', false),('key5',NULL) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@count_case_groupby +POSTHOOK: Lineage: count_case_groupby.bool EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: count_case_groupby.key SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: explain +SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: explain +SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: count_case_groupby + Statistics: Num rows: 5 Data size: 452 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), bool (type: boolean) + outputColumnNames: key, bool + Statistics: Num rows: 5 Data size: 452 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(CASE WHEN (bool) THEN (1) WHEN ((not bool)) THEN (0) ELSE (null) END) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 452 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 5 Data size: 452 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Execution mode: vectorized + Reducer 2 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@count_case_groupby +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@count_case_groupby +#### A masked pattern was here #### +key1 1 +key2 1 +key3 0 +key4 1 +key5 0 diff --git ql/src/test/results/clientpositive/vector_when_case_null.q.out ql/src/test/results/clientpositive/vector_when_case_null.q.out new file mode 100644 index 0000000..679fddc --- /dev/null +++ ql/src/test/results/clientpositive/vector_when_case_null.q.out @@ -0,0 +1,90 @@ +PREHOOK: query: -- SORT_QUERY_RESULTS + +create table count_case_groupby (key string, bool boolean) STORED AS orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@count_case_groupby +POSTHOOK: query: -- SORT_QUERY_RESULTS + +create table count_case_groupby (key string, bool boolean) STORED AS orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@count_case_groupby +PREHOOK: query: insert into table count_case_groupby values ('key1', true),('key2', false),('key3', NULL),('key4', false),('key5',NULL) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@count_case_groupby +POSTHOOK: query: insert into table count_case_groupby values ('key1', true),('key2', false),('key3', NULL),('key4', false),('key5',NULL) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@count_case_groupby +POSTHOOK: Lineage: count_case_groupby.bool EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: count_case_groupby.key SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: explain +SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: explain +SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: count_case_groupby + Statistics: Num rows: 5 Data size: 452 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), bool (type: boolean) + outputColumnNames: key, bool + Statistics: Num rows: 5 Data size: 452 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(CASE WHEN (bool) THEN (1) WHEN ((not bool)) THEN (0) ELSE (null) END) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 452 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 5 Data size: 452 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@count_case_groupby +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, COUNT(CASE WHEN bool THEN 1 WHEN NOT bool THEN 0 ELSE NULL END) AS cnt_bool0_ok FROM count_case_groupby GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@count_case_groupby +#### A masked pattern was here #### +key1 1 +key2 1 +key3 0 +key4 1 +key5 0