From 3baf507544bee936a1502976156d341de8073ba1 Mon Sep 17 00:00:00 2001 From: Gopal V Date: Thu, 26 May 2016 18:19:55 -0700 Subject: [PATCH] Fast Vector MapJoin Long hashtable has to handle all integral types --- .../fast/VectorMapJoinFastLongHashTable.java | 2 +- .../VectorMapJoinOptimizedLongCommon.java | 2 +- .../hadoop/hive/ql/plan/VectorMapJoinDesc.java | 27 ++++- .../queries/clientpositive/vectorized_mapjoin2.q | 21 ++++ .../clientpositive/tez/vectorized_mapjoin2.q.out | 135 +++++++++++++++++++++ .../clientpositive/vectorized_mapjoin2.q.out | 132 ++++++++++++++++++++ 6 files changed, 316 insertions(+), 3 deletions(-) create mode 100644 ql/src/test/queries/clientpositive/vectorized_mapjoin2.q create mode 100644 ql/src/test/results/clientpositive/tez/vectorized_mapjoin2.q.out create mode 100644 ql/src/test/results/clientpositive/vectorized_mapjoin2.q.out diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java index 0a502e0..78b55a1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java @@ -259,7 +259,7 @@ public VectorMapJoinFastLongHashTable( super(initialCapacity, loadFactor, writeBuffersSize); this.isOuterJoin = isOuterJoin; this.hashTableKeyType = hashTableKeyType; - PrimitiveTypeInfo[] primitiveTypeInfos = { TypeInfoFactory.longTypeInfo }; + PrimitiveTypeInfo[] primitiveTypeInfos = { hashTableKeyType.getPrimitiveTypeInfo() }; keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(primitiveTypeInfos); allocateBucketArray(); useMinMax = minMaxEnabled; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongCommon.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongCommon.java index a84de89..0eabc44 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongCommon.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongCommon.java @@ -161,7 +161,7 @@ public VectorMapJoinOptimizedLongCommon( min = Long.MAX_VALUE; max = Long.MIN_VALUE; this.hashTableKeyType = hashTableKeyType; - // PrimitiveTypeInfo[] primitiveTypeInfos = { TypeInfoFactory.longTypeInfo }; + // PrimitiveTypeInfo[] primitiveTypeInfos = { hashTableKeyType.getPrimitiveTypeInfo() }; // keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(primitiveTypeInfos); keyBinarySortableSerializeWrite = new BinarySortableSerializeWrite(1); output = new Output(); diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java index e1bf1f4..8ea230f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java @@ -18,6 +18,9 @@ package org.apache.hadoop.hive.ql.plan; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; + /** * VectorGroupByDesc. * @@ -51,7 +54,29 @@ INT, LONG, STRING, - MULTI_KEY + MULTI_KEY; + + public PrimitiveTypeInfo getPrimitiveTypeInfo() { + switch (this) { + case BOOLEAN: + return TypeInfoFactory.booleanTypeInfo; + case BYTE: + return TypeInfoFactory.byteTypeInfo; + case INT: + return TypeInfoFactory.intTypeInfo; + case LONG: + return TypeInfoFactory.longTypeInfo; + case NONE: + return TypeInfoFactory.voidTypeInfo; + case SHORT: + return TypeInfoFactory.shortTypeInfo; + case STRING: + return TypeInfoFactory.stringTypeInfo; + case MULTI_KEY: + default: + return null; + } + } } private HashTableImplementationType hashTableImplementationType; diff --git ql/src/test/queries/clientpositive/vectorized_mapjoin2.q ql/src/test/queries/clientpositive/vectorized_mapjoin2.q new file mode 100644 index 0000000..137acbc --- /dev/null +++ ql/src/test/queries/clientpositive/vectorized_mapjoin2.q @@ -0,0 +1,21 @@ +set hive.explain.user=false; +SET hive.vectorized.execution.enabled=true; +SET hive.vectorized.execution.mapjoin.native.enabled=true; +set hive.cbo.enable=true; +set hive.fetch.task.conversion=none; +SET hive.auto.convert.join=true; +SET hive.auto.convert.join.noconditionaltask=true; +SET hive.auto.convert.join.noconditionaltask.size=1000000000; +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.mapjoin.hybridgrace.hashtable=false; +set hive.vectorized.execution.mapjoin.native.fast.hashtable.enabled=true; + +create temporary table x (a int) stored as orc; +create temporary table y (b int) stored as orc; +insert into x values(1); +insert into y values(1); + +explain +select count(1) from x, y where a = b; + +select count(1) from x, y where a = b; diff --git ql/src/test/results/clientpositive/tez/vectorized_mapjoin2.q.out ql/src/test/results/clientpositive/tez/vectorized_mapjoin2.q.out new file mode 100644 index 0000000..a3c0b31 --- /dev/null +++ ql/src/test/results/clientpositive/tez/vectorized_mapjoin2.q.out @@ -0,0 +1,135 @@ +PREHOOK: query: create temporary table x (a int) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@x +POSTHOOK: query: create temporary table x (a int) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@x +PREHOOK: query: create temporary table y (b int) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@y +POSTHOOK: query: create temporary table y (b int) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@y +PREHOOK: query: insert into x values(1) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@x +POSTHOOK: query: insert into x values(1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@x +POSTHOOK: Lineage: x.a EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: insert into y values(1) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@y +POSTHOOK: query: insert into y values(1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__2 +POSTHOOK: Output: default@y +POSTHOOK: Lineage: y.b EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: explain +select count(1) from x, y where a = b +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from x, y where a = b +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Map 3 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: x + Statistics: Num rows: 45 Data size: 181 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 45 Data size: 181 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 45 Data size: 181 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + input vertices: + 1 Map 3 + Statistics: Num rows: 49 Data size: 199 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Map 3 + Map Operator Tree: + TableScan + alias: y + Statistics: Num rows: 45 Data size: 181 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: b is not null (type: boolean) + Statistics: Num rows: 45 Data size: 181 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: b (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 45 Data size: 181 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 45 Data size: 181 Basic stats: COMPLETE Column stats: NONE + Execution mode: vectorized + Reducer 2 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1) from x, y where a = b +PREHOOK: type: QUERY +PREHOOK: Input: default@x +PREHOOK: Input: default@y +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from x, y where a = b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@x +POSTHOOK: Input: default@y +#### A masked pattern was here #### +1 diff --git ql/src/test/results/clientpositive/vectorized_mapjoin2.q.out ql/src/test/results/clientpositive/vectorized_mapjoin2.q.out new file mode 100644 index 0000000..73b61a2 --- /dev/null +++ ql/src/test/results/clientpositive/vectorized_mapjoin2.q.out @@ -0,0 +1,132 @@ +PREHOOK: query: create temporary table x (a int) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@x +POSTHOOK: query: create temporary table x (a int) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@x +PREHOOK: query: create temporary table y (b int) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@y +POSTHOOK: query: create temporary table y (b int) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@y +PREHOOK: query: insert into x values(1) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@x +POSTHOOK: query: insert into x values(1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@x +POSTHOOK: Lineage: x.a EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: insert into y values(1) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@y +POSTHOOK: query: insert into y values(1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__2 +POSTHOOK: Output: default@y +POSTHOOK: Lineage: y.b EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: explain +select count(1) from x, y where a = b +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from x, y where a = b +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-5 is a root stage + Stage-2 depends on stages: Stage-5 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-5 + Map Reduce Local Work + Alias -> Map Local Tables: + $hdt$_0:$hdt$_0:x + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + $hdt$_0:$hdt$_0:x + TableScan + alias: x + Statistics: Num rows: 45 Data size: 181 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 45 Data size: 181 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 45 Data size: 181 Basic stats: COMPLETE Column stats: NONE + HashTable Sink Operator + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: y + Statistics: Num rows: 45 Data size: 181 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: b is not null (type: boolean) + Statistics: Num rows: 45 Data size: 181 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: b (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 45 Data size: 181 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + Statistics: Num rows: 49 Data size: 199 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1) from x, y where a = b +PREHOOK: type: QUERY +PREHOOK: Input: default@x +PREHOOK: Input: default@y +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from x, y where a = b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@x +POSTHOOK: Input: default@y +#### A masked pattern was here #### +1 -- 2.4.0