diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java
index 044bba1..fb01087 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java
@@ -52,7 +52,10 @@
 import org.apache.hadoop.hive.ql.metadata.Hive;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.metadata.PartitionIterable;
+import org.apache.hadoop.hive.ql.metadata.PrimaryKeyInfo;
 import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.metadata.UniqueConstraint;
+import org.apache.hadoop.hive.ql.metadata.UniqueConstraint.UniqueConstraintCol;
 import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
 import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ExprNodeConverter;
 import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
@@ -81,6 +84,7 @@
   private final ImmutableMap hivePartitionColsMap;
   private final ImmutableList hiveVirtualCols;
   private final int noOfNonVirtualCols;
+  private final List<ImmutableBitSet> keys;
   private final List referentialConstraints;
 
   final HiveConf hiveConf;
@@ -112,6 +116,7 @@ public RelOptHiveTable(RelOptSchema calciteSchema, String qualifiedTblName,
     this.partitionCache = partitionCache;
     this.colStatsCache = colStatsCache;
     this.noColsMissingStats = noColsMissingStats;
+    this.keys = generateKeys();
     this.referentialConstraints = generateReferentialConstraints();
   }
 
@@ -152,7 +157,13 @@ public RelOptHiveTable copy(RelDataType newRowType) {
   }
 
   @Override
-  public boolean isKey(ImmutableBitSet arg0) {
+  public boolean isKey(ImmutableBitSet columns) {
+    // A column set is a key when it covers every column of some declared key.
+    for (ImmutableBitSet key : keys) {
+      if (columns.contains(key)) {
+        return true;
+      }
+    }
     return false;
   }
 
@@ -161,6 +172,64 @@ public boolean isKey(ImmutableBitSet arg0) {
     return referentialConstraints;
   }
 
+  /**
+   * Collects the keys of this table as bit sets of column positions in
+   * {@code rowType}: first the primary key, then every unique constraint, as
+   * returned by {@code getReliablePrimaryKeys} and
+   * {@code getReliableUniqueConstraints}.
+   *
+   * @return one bit set per key, or an empty list when a constrained column is
+   *         missing from the row type
+   * @throws RuntimeException wrapping the {@link HiveException} raised while
+   *         fetching the constraints
+   */
+  private List<ImmutableBitSet> generateKeys() {
+    final List<String> fieldNames = rowType.getFieldNames();
+    // First PK
+    final PrimaryKeyInfo pki;
+    try {
+      pki = Hive.get().getReliablePrimaryKeys(
+          hiveTblMetadata.getDbName(), hiveTblMetadata.getTableName());
+    } catch (HiveException e) {
+      throw new RuntimeException(e);
+    }
+    ImmutableList.Builder<ImmutableBitSet> builder = ImmutableList.builder();
+    if (!pki.getColNames().isEmpty()) {
+      ImmutableBitSet.Builder pkBits = ImmutableBitSet.builder();
+      for (String pkColName : pki.getColNames().values()) {
+        // indexOf yields -1 when the column is absent from the row type
+        int pkPos = fieldNames.indexOf(pkColName);
+        if (pkPos < 0) {
+          LOG.error("Column for primary key definition " + pkColName + " not found");
+          return ImmutableList.of();
+        }
+        pkBits.set(pkPos);
+      }
+      builder.add(pkBits.build());
+    }
+    // Then UKs
+    final UniqueConstraint uki;
+    try {
+      uki = Hive.get().getReliableUniqueConstraints(
+          hiveTblMetadata.getDbName(), hiveTblMetadata.getTableName());
+    } catch (HiveException e) {
+      throw new RuntimeException(e);
+    }
+    for (List<UniqueConstraintCol> ukCols : uki.getUniqueConstraints().values()) {
+      ImmutableBitSet.Builder ukBits = ImmutableBitSet.builder();
+      for (UniqueConstraintCol ukCol : ukCols) {
+        int ukPos = fieldNames.indexOf(ukCol.colName);
+        if (ukPos < 0) {
+          LOG.error("Column for unique constraint definition " + ukCol.colName + " not found");
+          return ImmutableList.of();
+        }
+        ukBits.set(ukPos);
+      }
+      builder.add(ukBits.build());
+    }
+    return builder.build();
+  }
+
   private List generateReferentialConstraints() {
     final ForeignKeyInfo fki;
     try {
diff --git ql/src/test/queries/clientpositive/groupby_join_pushdown.q ql/src/test/queries/clientpositive/groupby_join_pushdown.q
index a6e2568..d0bf0fb 100644
--- ql/src/test/queries/clientpositive/groupby_join_pushdown.q +++ ql/src/test/queries/clientpositive/groupby_join_pushdown.q @@ -54,3 +54,20 @@ SELECT sum(f.cint), f.ctinyint FROM alltypesorc f JOIN alltypesorc g ON(f.ctinyint = g.ctinyint) GROUP BY f.ctinyint, g.ctinyint; +ALTER TABLE alltypesorc ADD CONSTRAINT pk_alltypesorc_1 PRIMARY KEY (ctinyint) DISABLE RELY; + +-- COLUMNS ARE UNIQUE, OPTIMIZATION IS NOT TRIGGERED +explain +SELECT sum(f.cint), f.ctinyint +FROM alltypesorc f JOIN alltypesorc g ON(f.ctinyint = g.ctinyint) +GROUP BY f.ctinyint, g.ctinyint; + +ALTER TABLE alltypesorc DROP CONSTRAINT pk_alltypesorc_1; + +ALTER TABLE alltypesorc ADD CONSTRAINT uk_alltypesorc_1 UNIQUE (ctinyint) DISABLE RELY; + +-- COLUMNS ARE UNIQUE, OPTIMIZATION IS NOT TRIGGERED +explain +SELECT sum(f.cint), f.ctinyint +FROM alltypesorc f JOIN alltypesorc g ON(f.ctinyint = g.ctinyint) +GROUP BY f.ctinyint, g.ctinyint; diff --git ql/src/test/results/clientpositive/groupby_join_pushdown.q.out ql/src/test/results/clientpositive/groupby_join_pushdown.q.out index 95d499a..2a2fdd7 100644 --- ql/src/test/results/clientpositive/groupby_join_pushdown.q.out +++ ql/src/test/results/clientpositive/groupby_join_pushdown.q.out @@ -1454,3 +1454,223 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: ALTER TABLE alltypesorc ADD CONSTRAINT pk_alltypesorc_1 PRIMARY KEY (ctinyint) DISABLE RELY +PREHOOK: type: ALTERTABLE_ADDCONSTRAINT +POSTHOOK: query: ALTER TABLE alltypesorc ADD CONSTRAINT pk_alltypesorc_1 PRIMARY KEY (ctinyint) DISABLE RELY +POSTHOOK: type: ALTERTABLE_ADDCONSTRAINT +PREHOOK: query: explain +SELECT sum(f.cint), f.ctinyint +FROM alltypesorc f JOIN alltypesorc g ON(f.ctinyint = g.ctinyint) +GROUP BY f.ctinyint, g.ctinyint +PREHOOK: type: QUERY +POSTHOOK: query: explain +SELECT sum(f.cint), f.ctinyint +FROM alltypesorc f JOIN alltypesorc g ON(f.ctinyint = g.ctinyint) +GROUP BY f.ctinyint, g.ctinyint +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 
depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: f + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ctinyint (type: tinyint), cint (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: tinyint) + sort order: + + Map-reduce partition columns: _col0 (type: tinyint) + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + TableScan + alias: g + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ctinyint (type: tinyint) + outputColumnNames: _col0 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: tinyint) + sort order: + + Map-reduce partition columns: _col0 (type: tinyint) + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: tinyint) + 1 _col0 (type: tinyint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 13516 Data size: 2906160 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col1) + keys: _col0 (type: tinyint), _col2 (type: tinyint) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 13516 Data size: 2906160 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + 
Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: tinyint), _col1 (type: tinyint) + sort order: ++ + Map-reduce partition columns: _col0 (type: tinyint), _col1 (type: tinyint) + Statistics: Num rows: 13516 Data size: 2906160 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + keys: KEY._col0 (type: tinyint), KEY._col1 (type: tinyint) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: bigint), _col0 (type: tinyint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: ALTER TABLE alltypesorc DROP CONSTRAINT pk_alltypesorc_1 +PREHOOK: type: ALTERTABLE_DROPCONSTRAINT +POSTHOOK: query: ALTER TABLE alltypesorc DROP CONSTRAINT pk_alltypesorc_1 +POSTHOOK: type: ALTERTABLE_DROPCONSTRAINT +PREHOOK: query: ALTER TABLE alltypesorc ADD CONSTRAINT uk_alltypesorc_1 UNIQUE (ctinyint) DISABLE RELY +PREHOOK: type: ALTERTABLE_ADDCONSTRAINT +POSTHOOK: query: ALTER TABLE alltypesorc ADD CONSTRAINT uk_alltypesorc_1 UNIQUE (ctinyint) DISABLE RELY +POSTHOOK: type: ALTERTABLE_ADDCONSTRAINT +PREHOOK: query: explain +SELECT sum(f.cint), f.ctinyint +FROM alltypesorc f JOIN alltypesorc g ON(f.ctinyint = g.ctinyint) +GROUP BY f.ctinyint, g.ctinyint +PREHOOK: type: QUERY 
+POSTHOOK: query: explain +SELECT sum(f.cint), f.ctinyint +FROM alltypesorc f JOIN alltypesorc g ON(f.ctinyint = g.ctinyint) +GROUP BY f.ctinyint, g.ctinyint +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: f + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ctinyint is not null (type: boolean) + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ctinyint (type: tinyint), cint (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: tinyint) + sort order: + + Map-reduce partition columns: _col0 (type: tinyint) + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + TableScan + alias: g + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ctinyint is not null (type: boolean) + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ctinyint (type: tinyint) + outputColumnNames: _col0 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: tinyint) + sort order: + + Map-reduce partition columns: _col0 (type: tinyint) + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: tinyint) + 1 _col0 (type: tinyint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 13516 Data size: 
2906160 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col1) + keys: _col0 (type: tinyint), _col2 (type: tinyint) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 13516 Data size: 2906160 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: tinyint), _col1 (type: tinyint) + sort order: ++ + Map-reduce partition columns: _col0 (type: tinyint), _col1 (type: tinyint) + Statistics: Num rows: 13516 Data size: 2906160 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + keys: KEY._col0 (type: tinyint), KEY._col1 (type: tinyint) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: bigint), _col0 (type: tinyint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +