diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java index 3573d07..05ebe06 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java @@ -605,9 +605,9 @@ private void genObject(int aliasNum, boolean allLeftFirst, boolean allLeftNull) innerJoin(skip, left, right); } else if (type == JoinDesc.LEFT_SEMI_JOIN) { if (innerJoin(skip, left, right)) { - // if left-semi-join found a match, skipping the rest of the rows in the - // rhs table of the semijoin - done = true; + // if left-semi-join found a match and we do not have any additional predicates, + // skipping the rest of the rows in the rhs table of the semijoin + done = !needsPostEvaluation; } } else if (type == JoinDesc.LEFT_OUTER_JOIN || (type == JoinDesc.FULL_OUTER_JOIN && rightNull)) { @@ -641,6 +641,7 @@ private void genObject(int aliasNum, boolean allLeftFirst, boolean allLeftNull) // This is only executed for outer joins with residual filters boolean forward = createForwardJoinObject(skipVectors[numAliases - 1]); producedRow |= forward; + done = (type == JoinDesc.LEFT_SEMI_JOIN) && forward; if (!rightNull && (type == JoinDesc.RIGHT_OUTER_JOIN || type == JoinDesc.FULL_OUTER_JOIN)) { if (forward) { diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 46493ac..16d7438 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -2114,27 +2114,10 @@ private RelNode genJoinRelNode(RelNode leftRel, String leftTableAlias, RelNode r RelNode topRel = null; RowResolver topRR = null; if (leftSemiJoin) { - List<RelDataTypeField> sysFieldList = new ArrayList<RelDataTypeField>(); - List<RexNode> leftJoinKeys = new ArrayList<RexNode>(); - List<RexNode> rightJoinKeys = new ArrayList<RexNode>(); - - RexNode nonEquiConds = 
RelOptUtil.splitJoinCondition(sysFieldList, leftRel, rightRel, - calciteJoinCond, leftJoinKeys, rightJoinKeys, null, null); - - if (!nonEquiConds.isAlwaysTrue()) { - throw new SemanticException("Non equality condition not supported in Semi-Join" - + nonEquiConds); - } - RelNode[] inputRels = new RelNode[] { leftRel, rightRel }; - final List<Integer> leftKeys = new ArrayList<Integer>(); - final List<Integer> rightKeys = new ArrayList<Integer>(); - calciteJoinCond = HiveCalciteUtil.projectNonColumnEquiConditions( - HiveRelFactories.HIVE_PROJECT_FACTORY, inputRels, leftJoinKeys, rightJoinKeys, 0, - leftKeys, rightKeys); topRel = HiveSemiJoin.getSemiJoin(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), - inputRels[0], inputRels[1], calciteJoinCond, ImmutableIntList.copyOf(leftKeys), - ImmutableIntList.copyOf(rightKeys)); + leftRel, rightRel, calciteJoinCond, ImmutableIntList.of(), + ImmutableIntList.of()); // Create join RR: we need to check whether we need to update left RR in case // previous call to projectNonColumnEquiConditions updated it diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 5ccb69a..3525e50 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -8146,6 +8146,9 @@ private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, HashMap<Byte, List<ExprNodeDesc>> filterMap = new HashMap<Byte, List<ExprNodeDesc>>(); + // Only used for semijoin with residual predicates + List<ColumnInfo> topSelectInputColumns = new ArrayList<>(); + for (int pos = 0; pos < right.length; ++pos) { Operator input = right[pos] == null ?
left : right[pos]; if (input == null) { @@ -8165,7 +8168,10 @@ private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, Byte tag = (byte) rsDesc.getTag(); // check whether this input operator produces output - if (omitOpts != null && omitOpts.contains(pos)) { + // If it has residual, we do not skip this output, + // we will add a Select on top of the join + if (omitOpts != null && omitOpts.contains(pos) + && join.getPostJoinFilters().size() == 0) { exprMap.put(tag, valueDesc); filterMap.put(tag, filterDesc); rightOps[pos] = input; @@ -8209,6 +8215,11 @@ private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, valueDesc.add(desc); outputColumnNames.add(internalName); reversedExprs.put(internalName, tag); + + // Populate semijoin select if needed + if (omitOpts == null || !omitOpts.contains(pos)) { + topSelectInputColumns.add(info); + } } for (ASTNode cond : join.getFilters().get(tag)) { filterDesc.add(genExprNodeDesc(cond, inputRR)); @@ -8230,8 +8241,8 @@ private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, desc.setFilterMap(join.getFilterMap()); // Add filters that apply to more than one input if (join.getPostJoinFilters().size() != 0 && - (!join.getNoOuterJoin() || - HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PUSH_RESIDUAL_INNER))) { + (!join.getNoOuterJoin() || !join.getNoSemiJoin() + || HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PUSH_RESIDUAL_INNER))) { LOG.debug("Generate JOIN with post-filtering conditions"); List<ExprNodeDesc> residualFilterExprs = new ArrayList<ExprNodeDesc>(); for (ASTNode cond : join.getPostJoinFilters()) { @@ -8254,7 +8265,29 @@ private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, } desc.setNullSafes(nullsafes); } - return putOpInsertMap(joinOp, outputRR); + + Operator topOp = putOpInsertMap(joinOp, outputRR); + if (omitOpts != null && !omitOpts.isEmpty() + && join.getPostJoinFilters().size() != 0) { + // Adding a select operator to top of semijoin to ensure projection of only 
correct columns + final List<ExprNodeDesc> topSelectExprs = new ArrayList<>(); + final List<String> topSelectOutputColNames = new ArrayList<>(); + final RowResolver topSelectRR = new RowResolver(); + final Map<String, ExprNodeDesc> topSelectColExprMap = new HashMap<String, ExprNodeDesc>(); + for (ColumnInfo colInfo : topSelectInputColumns) { + ExprNodeDesc expr = new ExprNodeColumnDesc(colInfo); + topSelectExprs.add(expr); + topSelectOutputColNames.add(colInfo.getInternalName()); + topSelectColExprMap.put(colInfo.getInternalName(), expr); + topSelectRR.put(colInfo.getTabAlias(), colInfo.getInternalName(), colInfo); + } + final SelectDesc topSelect = new SelectDesc(topSelectExprs, topSelectOutputColNames); + topOp = putOpInsertMap(OperatorFactory.getAndMakeChild(topSelect, + new RowSchema(topSelectRR.getColumnInfos()), topOp), topSelectRR); + topOp.setColumnExprMap(topSelectColExprMap); + } + + return topOp; } private ExprNodeDesc[][] genJoinKeys(QBJoinTree joinTree, Operator[] inputs) @@ -8413,13 +8446,9 @@ private Operator genJoinOperator(QB qb, QBJoinTree joinTree, } omitOpts.add(pos); - // generate a selection operator for group-by keys only - srcOp = insertSelectForSemijoin(fields, srcOp); - // generate a groupby operator (HASH mode) for a map-side partial // aggregation for semijoin - srcOps[pos++] = genMapGroupByForSemijoin(qb, fields, srcOp, - GroupByDesc.Mode.HASH); + srcOps[pos++] = genMapGroupByForSemijoin(qb, srcOp, GroupByDesc.Mode.HASH); } else { srcOps[pos++] = srcOp; } @@ -8440,8 +8469,16 @@ private Operator genJoinOperator(QB qb, QBJoinTree joinTree, srcOps[i] = genJoinReduceSinkChild(qb, joinKeys[i], srcOps[i], srcs, joinTree.getNextTag()); } - JoinOperator joinOp = (JoinOperator) genJoinOperatorChildren(joinTree, - joinSrcOp, srcOps, omitOpts, joinKeys); + Operator topOp = genJoinOperatorChildren(joinTree, + joinSrcOp, srcOps, omitOpts, joinKeys); + JoinOperator joinOp; + if (topOp instanceof JoinOperator) { + joinOp = (JoinOperator) topOp; + } else { + // We might generate a Select operator on top of the join 
operator for + // semijoin + joinOp = (JoinOperator) topOp.getParentOperators().get(0); + } joinOp.getConf().setQBJoinTreeProps(joinTree); joinContext.put(joinOp, joinTree); @@ -8451,66 +8488,15 @@ private Operator genJoinOperator(QB qb, QBJoinTree joinTree, // Safety check for postconditions throw new SemanticException("Post-filtering conditions should have been added to the JOIN operator"); } - Operator op = joinOp; for(ASTNode condn : joinTree.getPostJoinFilters()) { - op = genFilterPlan(qb, condn, op, false); + topOp = genFilterPlan(qb, condn, topOp, false); } - return op; } - return joinOp; + return topOp; } - /** - * Construct a selection operator for semijoin that filter out all fields - * other than the group by keys. - * - * @param fields - * list of fields need to be output - * @param input - * input operator - * @return the selection operator. - * @throws SemanticException - */ - private Operator insertSelectForSemijoin(ArrayList<ASTNode> fields, - Operator input) throws SemanticException { - - RowResolver inputRR = opParseCtx.get(input).getRowResolver(); - ArrayList<ExprNodeDesc> colList = new ArrayList<ExprNodeDesc>(); - ArrayList<String> outputColumnNames = new ArrayList<String>(); - Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>(); - - RowResolver outputRR = new RowResolver(); - - // construct the list of columns that need to be projected - for (int i = 0; i < fields.size(); ++i) { - ASTNode field = fields.get(i); - ExprNodeDesc exprNode = genExprNodeDesc(field, inputRR); - String colName = getColumnInternalName(i); - outputColumnNames.add(colName); - ColumnInfo colInfo = new ColumnInfo(colName, exprNode.getTypeInfo(), "", false); - outputRR.putExpression(field, colInfo); - colList.add(exprNode); - colExprMap.put(colName, exprNode); - } - - // create selection operator - Operator output = putOpInsertMap(OperatorFactory.getAndMakeChild( - new SelectDesc(colList, outputColumnNames, false), - new RowSchema(outputRR.getColumnInfos()), input), outputRR); - - output.setColumnExprMap(colExprMap); - return output; - } - - 
private Operator genMapGroupByForSemijoin(QB qb, ArrayList<ASTNode> fields, // the - // ASTNode - // of - // the - // join - // key - // "tab.col" - Operator inputOperatorInfo, GroupByDesc.Mode mode) + private Operator genMapGroupByForSemijoin(QB qb, Operator inputOperatorInfo, GroupByDesc.Mode mode) throws SemanticException { RowResolver groupByInputRowResolver = opParseCtx.get(inputOperatorInfo) @@ -8520,28 +8506,21 @@ private Operator genMapGroupByForSemijoin(QB qb, ArrayList<ASTNode> fields, // t ArrayList<String> outputColumnNames = new ArrayList<String>(); ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>(); Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>(); - qb.getParseInfo(); - - groupByOutputRowResolver.setIsExprResolver(true); // join keys should only - // be columns but not be - // expressions - for (int i = 0; i < fields.size(); ++i) { + for (ColumnInfo col : groupByInputRowResolver.getColumnInfos()) { // get the group by keys to ColumnInfo - ASTNode colName = fields.get(i); - ExprNodeDesc grpByExprNode = genExprNodeDesc(colName, - groupByInputRowResolver); + ExprNodeColumnDesc grpByExprNode = new ExprNodeColumnDesc(col); groupByKeys.add(grpByExprNode); - + outputColumnNames.add(grpByExprNode.getColumn()); // generate output column names - String field = getColumnInternalName(i); - outputColumnNames.add(field); - ColumnInfo colInfo2 = new ColumnInfo(field, grpByExprNode.getTypeInfo(), - "", false); - groupByOutputRowResolver.putExpression(colName, colInfo2); - + String[] nm = groupByInputRowResolver.reverseLookup(col.getInternalName()); + String[] nm2 = groupByInputRowResolver.getAlternateMappings(col.getInternalName()); + groupByOutputRowResolver.put(nm[0], nm[1], col); + if (nm2 != null) { + groupByOutputRowResolver.addMappingOnly(nm2[0], nm2[1], col); + } // establish mapping from the output column to the input column - colExprMap.put(field, grpByExprNode); + colExprMap.put(col.getInternalName(), grpByExprNode); } // Generate group-by operator diff --git ql/src/test/queries/clientpositive/semijoin6.q 
ql/src/test/queries/clientpositive/semijoin6.q new file mode 100644 index 0000000..f061b34 --- /dev/null +++ ql/src/test/queries/clientpositive/semijoin6.q @@ -0,0 +1,46 @@ +set hive.mapred.mode=nonstrict; +-- SORT_QUERY_RESULTS + +create table tx1 (a integer,b integer); +insert into tx1 values (1, 105), (2, 203), (3, 300), (4, 400), (null, 400); + +create table tx2 (a int, b int); +insert into tx2 values (1, 105), (1, 1900), (2, 1995), (2, 1996), (4, 400), (4, null); + +explain +select * from tx1 u left semi join tx2 v on u.a=v.a; + +select * from tx1 u left semi join tx2 v on u.a=v.a; + +explain +select * from tx1 u left semi join tx2 v on u.b <> v.b; + +select * from tx1 u left semi join tx2 v on u.b <> v.b; + +explain +select * from tx1 u left semi join tx2 v on u.a=v.a and u.b <> v.b; + +select * from tx1 u left semi join tx2 v on u.a=v.a and u.b <> v.b; + +explain +select * from tx1 u left semi join tx2 v on u.a=v.a or u.b <> v.b; + +select * from tx1 u left semi join tx2 v on u.a=v.a or u.b <> v.b; + +explain +select * from tx1 u left semi join tx1 v on u.a=v.a; + +select * from tx1 u left semi join tx1 v on u.a=v.a; + +explain +select * from tx1 u left semi join tx2 v +on (u.a + v.b > 400) + and ((case when u.a > 3 then true when v.b > 1900 then true else false end) + or (coalesce(u.a) + coalesce(v.b) > 1900)) + and u.a = v.a; + +select * from tx1 u left semi join tx2 v +on (u.a + v.b > 400) + and ((case when u.a > 3 then true when v.b > 1900 then true else false end) + or (coalesce(u.a) + coalesce(v.b) > 1900)) + and u.a = v.a; diff --git ql/src/test/results/clientpositive/semijoin2.q.out ql/src/test/results/clientpositive/semijoin2.q.out index 2c3c5d8..090dda6 100644 --- ql/src/test/results/clientpositive/semijoin2.q.out +++ ql/src/test/results/clientpositive/semijoin2.q.out @@ -120,7 +120,7 @@ STAGE PLANS: Left Semi Join 0 to 1 keys: 0 UDFToInteger(_col105) (type: int), _col72 (type: timestamp) - 1 _col0 (type: int), _col1 (type: timestamp) + 1 -92 
(type: int), _col1 (type: timestamp) outputColumnNames: _col16, _col21, _col98 Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE File Output Operator diff --git ql/src/test/results/clientpositive/semijoin6.q.out ql/src/test/results/clientpositive/semijoin6.q.out new file mode 100644 index 0000000..9e776b1 --- /dev/null +++ ql/src/test/results/clientpositive/semijoin6.q.out @@ -0,0 +1,569 @@ +PREHOOK: query: create table tx1 (a integer,b integer) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tx1 +POSTHOOK: query: create table tx1 (a integer,b integer) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tx1 +PREHOOK: query: insert into tx1 values (1, 105), (2, 203), (3, 300), (4, 400), (null, 400) +PREHOOK: type: QUERY +PREHOOK: Output: default@tx1 +POSTHOOK: query: insert into tx1 values (1, 105), (2, 203), (3, 300), (4, 400), (null, 400) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@tx1 +POSTHOOK: Lineage: tx1.a EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: tx1.b EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: create table tx2 (a int, b int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tx2 +POSTHOOK: query: create table tx2 (a int, b int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tx2 +PREHOOK: query: insert into tx2 values (1, 105), (1, 1900), (2, 1995), (2, 1996), (4, 400), (4, null) +PREHOOK: type: QUERY +PREHOOK: Output: default@tx2 +POSTHOOK: query: insert into tx2 values (1, 105), (1, 1900), (2, 1995), (2, 1996), (4, 400), (4, null) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@tx2 +POSTHOOK: Lineage: tx2.a EXPRESSION 
[(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: tx2.b EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.a=v.a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.a=v.a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: u + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + TableScan + alias: v + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 6 Data 
size: 32 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Join Operator + condition map: + Left Semi Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 35 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 35 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from tx1 u left semi join tx2 v on u.a=v.a +PREHOOK: type: QUERY +PREHOOK: Input: default@tx1 +PREHOOK: Input: default@tx2 +#### A masked pattern was here #### +POSTHOOK: query: select * from tx1 u left semi join tx2 v on u.a=v.a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tx1 +POSTHOOK: Input: default@tx2 +#### A masked pattern was here #### +1 105 +2 203 +4 400 +Warning: Shuffle Join JOIN[7][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product +PREHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.b <> v.b +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.b <> v.b +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: u + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 
(type: int), _col1 (type: int) + TableScan + alias: v + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: b (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int) + Reduce Operator Tree: + Join Operator + condition map: + Left Semi Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2 + residual filter predicates: {(_col1 <> _col2)} + Statistics: Num rows: 30 Data size: 346 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 30 Data size: 346 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 30 Data size: 346 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join JOIN[7][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product +PREHOOK: query: select * from tx1 u left semi join tx2 v on u.b <> v.b +PREHOOK: type: QUERY +PREHOOK: Input: default@tx1 +PREHOOK: Input: default@tx2 +#### A masked pattern was here #### +POSTHOOK: query: select * from tx1 u left semi join tx2 v on u.b <> v.b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tx1 +POSTHOOK: Input: default@tx2 +#### A masked pattern was here #### +1 105 +2 203 +3 300 +4 
400 +NULL 400 +PREHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.a=v.a and u.b <> v.b +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.a=v.a and u.b <> v.b +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: u + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + TableScan + alias: v + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int), _col1 (type: int) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + Reduce Operator Tree: + Join Operator + condition map: + Left 
Semi Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1, _col3 + residual filter predicates: {(_col1 <> _col3)} + Statistics: Num rows: 6 Data size: 35 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 35 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 35 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from tx1 u left semi join tx2 v on u.a=v.a and u.b <> v.b +PREHOOK: type: QUERY +PREHOOK: Input: default@tx1 +PREHOOK: Input: default@tx2 +#### A masked pattern was here #### +POSTHOOK: query: select * from tx1 u left semi join tx2 v on u.a=v.a and u.b <> v.b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tx1 +POSTHOOK: Input: default@tx2 +#### A masked pattern was here #### +1 105 +2 203 +Warning: Shuffle Join JOIN[6][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product +PREHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.a=v.a or u.b <> v.b +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.a=v.a or u.b <> v.b +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: u + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 26 Basic stats: 
COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int) + TableScan + alias: v + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int) + Reduce Operator Tree: + Join Operator + condition map: + Left Semi Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3 + residual filter predicates: {((_col0 = _col2) or (_col1 <> _col3))} + Statistics: Num rows: 30 Data size: 346 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 30 Data size: 346 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 30 Data size: 346 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join JOIN[6][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product +PREHOOK: query: select * from tx1 u left semi join tx2 v on u.a=v.a or u.b <> v.b +PREHOOK: type: QUERY +PREHOOK: Input: default@tx1 +PREHOOK: Input: default@tx2 +#### A masked pattern was here #### +POSTHOOK: query: select * from tx1 u left semi join tx2 v on u.a=v.a or u.b <> v.b +POSTHOOK: type: QUERY +POSTHOOK: Input: 
default@tx1 +POSTHOOK: Input: default@tx2 +#### A masked pattern was here #### +1 105 +2 203 +3 300 +4 400 +NULL 400 +PREHOOK: query: explain +select * from tx1 u left semi join tx1 v on u.a=v.a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from tx1 u left semi join tx1 v on u.a=v.a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: u + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + TableScan + alias: v + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Join Operator + condition map: + Left Semi Join 0 to 1 
+ keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 28 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 28 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from tx1 u left semi join tx1 v on u.a=v.a +PREHOOK: type: QUERY +PREHOOK: Input: default@tx1 +#### A masked pattern was here #### +POSTHOOK: query: select * from tx1 u left semi join tx1 v on u.a=v.a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tx1 +#### A masked pattern was here #### +1 105 +2 203 +3 300 +4 400 +PREHOOK: query: explain +select * from tx1 u left semi join tx2 v +on (u.a + v.b > 400) + and ((case when u.a > 3 then true when v.b > 1900 then true else false end) + or (coalesce(u.a) + coalesce(v.b) > 1900)) + and u.a = v.a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from tx1 u left semi join tx2 v +on (u.a + v.b > 400) + and ((case when u.a > 3 then true when v.b > 1900 then true else false end) + or (coalesce(u.a) + coalesce(v.b) > 1900)) + and u.a = v.a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: u + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE 
+ Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 26 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + TableScan + alias: v + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int), _col1 (type: int) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 6 Data size: 32 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + Reduce Operator Tree: + Join Operator + condition map: + Left Semi Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1, _col3 + residual filter predicates: {((_col0 + _col3) > 400)} {(CASE WHEN ((_col0 > 3)) THEN (true) WHEN ((_col3 > 1900)) THEN (true) ELSE (false) END or ((COALESCE(_col0) + COALESCE(_col3)) > 1900))} + Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from tx1 u left semi join tx2 v +on (u.a + v.b > 400) + and ((case when u.a > 3 then true when v.b > 1900 then true else false end) + or (coalesce(u.a) + coalesce(v.b) > 1900)) + and u.a = v.a +PREHOOK: type: QUERY +PREHOOK: Input: default@tx1 +PREHOOK: Input: default@tx2 +#### A masked pattern was here #### +POSTHOOK: query: select * from tx1 u left semi join tx2 v +on (u.a + v.b > 400) + and ((case when u.a > 3 then true when v.b > 1900 then true else false end) + or (coalesce(u.a) + coalesce(v.b) > 1900)) + and u.a = v.a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tx1 +POSTHOOK: Input: default@tx2 +#### A masked pattern was here #### +1 105 +2 203 +4 400