diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index a4648be..eb44428 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -608,6 +608,7 @@ minillaplocal.query.files=\ schema_evol_text_vecrow_table.q,\ selectDistinctStar.q,\ semijoin.q,\ + semijoin6.q,\ semijoin_hint.q,\ smb_cache.q,\ special_character_in_tabnames_1.q,\ diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java index 3573d07..05ebe06 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java @@ -605,9 +605,9 @@ private void genObject(int aliasNum, boolean allLeftFirst, boolean allLeftNull) innerJoin(skip, left, right); } else if (type == JoinDesc.LEFT_SEMI_JOIN) { if (innerJoin(skip, left, right)) { - // if left-semi-join found a match, skipping the rest of the rows in the - // rhs table of the semijoin - done = true; + // if left-semi-join found a match and we do not have any additional predicates, + // skipping the rest of the rows in the rhs table of the semijoin + done = !needsPostEvaluation; } } else if (type == JoinDesc.LEFT_OUTER_JOIN || (type == JoinDesc.FULL_OUTER_JOIN && rightNull)) { @@ -641,6 +641,7 @@ private void genObject(int aliasNum, boolean allLeftFirst, boolean allLeftNull) // This is only executed for outer joins with residual filters boolean forward = createForwardJoinObject(skipVectors[numAliases - 1]); producedRow |= forward; + done = (type == JoinDesc.LEFT_SEMI_JOIN) && forward; if (!rightNull && (type == JoinDesc.RIGHT_OUTER_JOIN || type == JoinDesc.FULL_OUTER_JOIN)) { if (forward) { diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 46493ac..22cbadc 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -113,6 +113,7 @@ import org.apache.calcite.sql.SqlNode; import org.apache.calcite.sql.SqlOperator; import org.apache.calcite.sql.SqlWindow; +import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.sql.parser.SqlParserPos; import org.apache.calcite.sql.type.SqlTypeName; import org.apache.calcite.tools.Frameworks; @@ -2121,17 +2122,15 @@ private RelNode genJoinRelNode(RelNode leftRel, String leftTableAlias, RelNode r RexNode nonEquiConds = RelOptUtil.splitJoinCondition(sysFieldList, leftRel, rightRel, calciteJoinCond, leftJoinKeys, rightJoinKeys, null, null); - if (!nonEquiConds.isAlwaysTrue()) { - throw new SemanticException("Non equality condition not supported in Semi-Join" - + nonEquiConds); - } - RelNode[] inputRels = new RelNode[] { leftRel, rightRel }; final List leftKeys = new ArrayList(); final List rightKeys = new ArrayList(); - calciteJoinCond = HiveCalciteUtil.projectNonColumnEquiConditions( - HiveRelFactories.HIVE_PROJECT_FACTORY, inputRels, leftJoinKeys, rightJoinKeys, 0, - leftKeys, rightKeys); + RexNode remainingEquiCond = HiveCalciteUtil.projectNonColumnEquiConditions(HiveRelFactories.HIVE_PROJECT_FACTORY, + inputRels, leftJoinKeys, rightJoinKeys, 0, leftKeys, rightKeys); + calciteJoinCond = remainingEquiCond != null ? + RexUtil.composeConjunction(cluster.getRexBuilder(), + ImmutableList.of(remainingEquiCond, nonEquiConds), false) : + nonEquiConds; topRel = HiveSemiJoin.getSemiJoin(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), inputRels[0], inputRels[1], calciteJoinCond, ImmutableIntList.copyOf(leftKeys), ImmutableIntList.copyOf(rightKeys)); diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 5ccb69a..ecbb099 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -2854,6 +2854,12 @@ private void parseJoinCondition(QBJoinTree joinTree, ASTNode joinCond, switch (joinCond.getToken().getType()) { case HiveParser.KW_OR: + parseJoinCondPopulateAlias(joinTree, (ASTNode) joinCond.getChild(0), + new ArrayList(), new ArrayList(), + null, aliasToOpInfo); + parseJoinCondPopulateAlias(joinTree, (ASTNode) joinCond.getChild(1), + new ArrayList(), new ArrayList(), + null, aliasToOpInfo); joinTree.addPostJoinFilter(joinCond); break; @@ -8146,6 +8152,9 @@ private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, HashMap> filterMap = new HashMap>(); + // Only used for semijoin with residual predicates + List topSelectInputColumns = new ArrayList<>(); + for (int pos = 0; pos < right.length; ++pos) { Operator input = right[pos] == null ? left : right[pos]; if (input == null) { @@ -8165,7 +8174,10 @@ private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, Byte tag = (byte) rsDesc.getTag(); // check whether this input operator produces output - if (omitOpts != null && omitOpts.contains(pos)) { + // If it has residual, we do not skip this output, + // we will add a Select on top of the join + if (omitOpts != null && omitOpts.contains(pos) + && join.getPostJoinFilters().size() == 0) { exprMap.put(tag, valueDesc); filterMap.put(tag, filterDesc); rightOps[pos] = input; @@ -8209,6 +8221,11 @@ private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, valueDesc.add(desc); outputColumnNames.add(internalName); reversedExprs.put(internalName, tag); + + // Populate semijoin select if needed + if (omitOpts == null || !omitOpts.contains(pos)) { + topSelectInputColumns.add(info); + } } for (ASTNode cond : join.getFilters().get(tag)) { filterDesc.add(genExprNodeDesc(cond, inputRR)); @@ -8230,8 +8247,8 @@ private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, desc.setFilterMap(join.getFilterMap()); // Add filters that apply to more than one input if (join.getPostJoinFilters().size() != 0 && - (!join.getNoOuterJoin() || - HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PUSH_RESIDUAL_INNER))) { + (!join.getNoOuterJoin() || !join.getNoSemiJoin() + || HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PUSH_RESIDUAL_INNER))) { LOG.debug("Generate JOIN with post-filtering conditions"); List residualFilterExprs = new ArrayList(); for (ASTNode cond : join.getPostJoinFilters()) { @@ -8254,7 +8271,29 @@ private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, } desc.setNullSafes(nullsafes); } - return putOpInsertMap(joinOp, outputRR); + + Operator topOp = putOpInsertMap(joinOp, outputRR); + if (omitOpts != null && !omitOpts.isEmpty() + && join.getPostJoinFilters().size() != 0) { + // Adding a select operator to top of semijoin to ensure projection of only correct columns + final List topSelectExprs = new ArrayList<>(); + final List topSelectOutputColNames = new ArrayList<>(); + final RowResolver topSelectRR = new RowResolver(); + final Map topSelectColExprMap = new HashMap(); + for (ColumnInfo colInfo : topSelectInputColumns) { + ExprNodeDesc expr = new ExprNodeColumnDesc(colInfo); + topSelectExprs.add(expr); + topSelectOutputColNames.add(colInfo.getInternalName()); + topSelectColExprMap.put(colInfo.getInternalName(), expr); + topSelectRR.put(colInfo.getTabAlias(), colInfo.getInternalName(), colInfo); + } + final SelectDesc topSelect = new SelectDesc(topSelectExprs, topSelectOutputColNames); + topOp = putOpInsertMap(OperatorFactory.getAndMakeChild(topSelect, + new RowSchema(topSelectRR.getColumnInfos()), topOp), topSelectRR); + topOp.setColumnExprMap(topSelectColExprMap); + } + + return topOp; } private ExprNodeDesc[][] genJoinKeys(QBJoinTree joinTree, Operator[] inputs) @@ -8440,8 +8479,16 @@ private Operator genJoinOperator(QB qb, QBJoinTree joinTree, srcOps[i] = genJoinReduceSinkChild(qb, joinKeys[i], srcOps[i], srcs, joinTree.getNextTag()); } - JoinOperator joinOp = (JoinOperator) genJoinOperatorChildren(joinTree, - joinSrcOp, srcOps, omitOpts, joinKeys); + Operator topOp = genJoinOperatorChildren(joinTree, + joinSrcOp, srcOps, omitOpts, joinKeys); + JoinOperator joinOp; + if (topOp instanceof JoinOperator) { + joinOp = (JoinOperator) topOp; + } else { + // We might generate a Select operator on top of the join operator for + // semijoin + joinOp = (JoinOperator) topOp.getParentOperators().get(0); + } joinOp.getConf().setQBJoinTreeProps(joinTree); joinContext.put(joinOp, joinTree); @@ -8451,14 +8498,12 @@ private Operator genJoinOperator(QB qb, QBJoinTree joinTree, // Safety check for postconditions throw new SemanticException("Post-filtering conditions should have been added to the JOIN operator"); } - Operator op = joinOp; for(ASTNode condn : joinTree.getPostJoinFilters()) { - op = genFilterPlan(qb, condn, op, false); + topOp = genFilterPlan(qb, condn, topOp, false); } - return op; } - return joinOp; + return topOp; } /** @@ -8473,7 +8518,7 @@ private Operator genJoinOperator(QB qb, QBJoinTree joinTree, * @throws SemanticException */ private Operator insertSelectForSemijoin(ArrayList fields, - Operator input) throws SemanticException { + Operator input) throws SemanticException { RowResolver inputRR = opParseCtx.get(input).getRowResolver(); ArrayList colList = new ArrayList(); @@ -8485,13 +8530,32 @@ private Operator insertSelectForSemijoin(ArrayList fields, // construct the list of columns that need to be projected for (int i = 0; i < fields.size(); ++i) { ASTNode field = fields.get(i); - ExprNodeDesc exprNode = genExprNodeDesc(field, inputRR); + String[] nm; + String[] nm2; + ExprNodeDesc expr = genExprNodeDesc(field, inputRR); + if (expr instanceof ExprNodeColumnDesc) { + // In most of the cases, this is a column reference + ExprNodeColumnDesc columnExpr = (ExprNodeColumnDesc) expr; + nm = inputRR.reverseLookup(columnExpr.getColumn()); + nm2 = inputRR.getAlternateMappings(columnExpr.getColumn()); + } else { + // However, it can be a constant too. In that case, we need to track + // the column that it originated from in the input operator so we can + // propagate the aliases. + ExprNodeConstantDesc constantExpr = (ExprNodeConstantDesc) expr; + String inputCol = constantExpr.getFoldedFromCol(); + nm = inputRR.reverseLookup(inputCol); + nm2 = inputRR.getAlternateMappings(inputCol); + } String colName = getColumnInternalName(i); outputColumnNames.add(colName); - ColumnInfo colInfo = new ColumnInfo(colName, exprNode.getTypeInfo(), "", false); - outputRR.putExpression(field, colInfo); - colList.add(exprNode); - colExprMap.put(colName, exprNode); + ColumnInfo colInfo = new ColumnInfo(colName, expr.getTypeInfo(), "", false); + outputRR.put(nm[0], nm[1], colInfo); + if (nm2 != null) { + outputRR.addMappingOnly(nm2[0], nm2[1], colInfo); + } + colList.add(expr); + colExprMap.put(colName, expr); } // create selection operator @@ -8503,14 +8567,8 @@ private Operator insertSelectForSemijoin(ArrayList fields, return output; } - private Operator genMapGroupByForSemijoin(QB qb, ArrayList fields, // the - // ASTNode - // of - // the - // join - // key - // "tab.col" - Operator inputOperatorInfo, GroupByDesc.Mode mode) + private Operator genMapGroupByForSemijoin(QB qb, ArrayList fields, + Operator inputOperatorInfo, GroupByDesc.Mode mode) throws SemanticException { RowResolver groupByInputRowResolver = opParseCtx.get(inputOperatorInfo) @@ -8520,26 +8578,38 @@ private Operator genMapGroupByForSemijoin(QB qb, ArrayList fields, // t ArrayList outputColumnNames = new ArrayList(); ArrayList aggregations = new ArrayList(); Map colExprMap = new HashMap(); - qb.getParseInfo(); - - groupByOutputRowResolver.setIsExprResolver(true); // join keys should only - // be columns but not be - // expressions for (int i = 0; i < fields.size(); ++i) { // get the group by keys to ColumnInfo ASTNode colName = fields.get(i); - ExprNodeDesc grpByExprNode = genExprNodeDesc(colName, - groupByInputRowResolver); + String[] nm; + String[] nm2; + ExprNodeDesc grpByExprNode = genExprNodeDesc(colName, groupByInputRowResolver); + if (grpByExprNode instanceof ExprNodeColumnDesc) { + // In most of the cases, this is a column reference + ExprNodeColumnDesc columnExpr = (ExprNodeColumnDesc) grpByExprNode; + nm = groupByInputRowResolver.reverseLookup(columnExpr.getColumn()); + nm2 = groupByInputRowResolver.getAlternateMappings(columnExpr.getColumn()); + } else { + // However, it can be a constant too. In that case, we need to track + // the column that it originated from in the input operator so we can + // propagate the aliases. + ExprNodeConstantDesc constantExpr = (ExprNodeConstantDesc) grpByExprNode; + String inputCol = constantExpr.getFoldedFromCol(); + nm = groupByInputRowResolver.reverseLookup(inputCol); + nm2 = groupByInputRowResolver.getAlternateMappings(inputCol); + } groupByKeys.add(grpByExprNode); - // generate output column names String field = getColumnInternalName(i); outputColumnNames.add(field); ColumnInfo colInfo2 = new ColumnInfo(field, grpByExprNode.getTypeInfo(), "", false); + groupByOutputRowResolver.put(nm[0], nm[1], colInfo2); + if (nm2 != null) { + groupByOutputRowResolver.addMappingOnly(nm2[0], nm2[1], colInfo2); + } groupByOutputRowResolver.putExpression(colName, colInfo2); - // establish mapping from the output column to the input column colExprMap.put(field, grpByExprNode); } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java index 4ddf1d5..67ea32c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java @@ -21,7 +21,6 @@ import java.math.BigDecimal; import java.sql.Date; import java.sql.Timestamp; -import java.time.ZoneId; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -49,6 +48,7 @@ import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.ExpressionWalker; import org.apache.hadoop.hive.ql.lib.GraphWalker; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; @@ -57,7 +57,6 @@ import org.apache.hadoop.hive.ql.lib.RuleRegExp; import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.lib.ExpressionWalker; import org.apache.hadoop.hive.ql.optimizer.ConstantPropagateProcFactory; import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSubquerySemanticException; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.TypeConverter; @@ -708,7 +707,9 @@ private static ExprNodeDesc toExprNodeDesc(ColumnInfo colInfo) { inspector instanceof PrimitiveObjectInspector) { PrimitiveObjectInspector poi = (PrimitiveObjectInspector) inspector; Object constant = ((ConstantObjectInspector) inspector).getWritableConstantValue(); - return new ExprNodeConstantDesc(colInfo.getType(), poi.getPrimitiveJavaObject(constant)); + ExprNodeConstantDesc constantExpr = new ExprNodeConstantDesc(colInfo.getType(), poi.getPrimitiveJavaObject(constant)); + constantExpr.setFoldedFromCol(colInfo.getInternalName()); + return constantExpr; } // non-constant or non-primitive constants ExprNodeColumnDesc column = new ExprNodeColumnDesc(colInfo); diff --git ql/src/test/queries/clientpositive/semijoin6.q ql/src/test/queries/clientpositive/semijoin6.q new file mode 100644 index 0000000..f061b34 --- /dev/null +++ ql/src/test/queries/clientpositive/semijoin6.q @@ -0,0 +1,46 @@ +set hive.mapred.mode=nonstrict; +-- SORT_QUERY_RESULTS + +create table tx1 (a integer,b integer); +insert into tx1 values (1, 105), (2, 203), (3, 300), (4, 400), (null, 400); + +create table tx2 (a int, b int); +insert into tx2 values (1, 105), (1, 1900), (2, 1995), (2, 1996), (4, 400), (4, null); + +explain +select * from tx1 u left semi join tx2 v on u.a=v.a; + +select * from tx1 u left semi join tx2 v on u.a=v.a; + +explain +select * from tx1 u left semi join tx2 v on u.b <> v.b; + +select * from tx1 u left semi join tx2 v on u.b <> v.b; + +explain +select * from tx1 u left semi join tx2 v on u.a=v.a and u.b <> v.b; + +select * from tx1 u left semi join tx2 v on u.a=v.a and u.b <> v.b; + +explain +select * from tx1 u left semi join tx2 v on u.a=v.a or u.b <> v.b; + +select * from tx1 u left semi join tx2 v on u.a=v.a or u.b <> v.b; + +explain +select * from tx1 u left semi join tx1 v on u.a=v.a; + +select * from tx1 u left semi join tx1 v on u.a=v.a; + +explain +select * from tx1 u left semi join tx2 v +on (u.a + v.b > 400) + and ((case when u.a > 3 then true when v.b > 1900 then true else false end) + or (coalesce(u.a) + coalesce(v.b) > 1900)) + and u.a = v.a; + +select * from tx1 u left semi join tx2 v +on (u.a + v.b > 400) + and ((case when u.a > 3 then true when v.b > 1900 then true else false end) + or (coalesce(u.a) + coalesce(v.b) > 1900)) + and u.a = v.a; diff --git ql/src/test/results/clientpositive/llap/semijoin6.q.out ql/src/test/results/clientpositive/llap/semijoin6.q.out new file mode 100644 index 0000000..de826e6 --- /dev/null +++ ql/src/test/results/clientpositive/llap/semijoin6.q.out @@ -0,0 +1,658 @@ +PREHOOK: query: create table tx1 (a integer,b integer) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tx1 +POSTHOOK: query: create table tx1 (a integer,b integer) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tx1 +PREHOOK: query: insert into tx1 values (1, 105), (2, 203), (3, 300), (4, 400), (null, 400) +PREHOOK: type: QUERY +PREHOOK: Output: default@tx1 +POSTHOOK: query: insert into tx1 values (1, 105), (2, 203), (3, 300), (4, 400), (null, 400) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@tx1 +POSTHOOK: Lineage: tx1.a EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: tx1.b EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: create table tx2 (a int, b int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tx2 +POSTHOOK: query: create table tx2 (a int, b int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tx2 +PREHOOK: query: insert into tx2 values (1, 105), (1, 1900), (2, 1995), (2, 1996), (4, 400), (4, null) +PREHOOK: type: QUERY +PREHOOK: Output: default@tx2 +POSTHOOK: query: insert into tx2 values (1, 105), (1, 1900), (2, 1995), (2, 1996), (4, 400), (4, null) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@tx2 +POSTHOOK: Lineage: tx2.a EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: tx2.b EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.a=v.a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.a=v.a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: u + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + Execution mode: llap + LLAP IO: no inputs + Map 3 + Map Operator Tree: + TableScan + alias: v + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Left Semi Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from tx1 u left semi join tx2 v on u.a=v.a +PREHOOK: type: QUERY +PREHOOK: Input: default@tx1 +PREHOOK: Input: default@tx2 +#### A masked pattern was here #### +POSTHOOK: query: select * from tx1 u left semi join tx2 v on u.a=v.a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tx1 +POSTHOOK: Input: default@tx2 +#### A masked pattern was here #### +1 105 +2 203 +4 400 +Warning: Shuffle Join MERGEJOIN[11][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.b <> v.b +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.b <> v.b +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Map 3 (XPROD_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: u + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int) + Execution mode: llap + LLAP IO: no inputs + Map 3 + Map Operator Tree: + TableScan + alias: v + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: b (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Left Semi Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2 + residual filter predicates: {(_col1 <> _col2)} + Statistics: Num rows: 30 Data size: 390 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 30 Data size: 390 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 30 Data size: 390 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[11][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select * from tx1 u left semi join tx2 v on u.b <> v.b +PREHOOK: type: QUERY +PREHOOK: Input: default@tx1 +PREHOOK: Input: default@tx2 +#### A masked pattern was here #### +POSTHOOK: query: select * from tx1 u left semi join tx2 v on u.b <> v.b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tx1 +POSTHOOK: Input: default@tx2 +#### A masked pattern was here #### +1 105 +2 203 +3 300 +4 400 +NULL 400 +PREHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.a=v.a and u.b <> v.b +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.a=v.a and u.b <> v.b +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: u + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + Execution mode: llap + LLAP IO: no inputs + Map 3 + Map Operator Tree: + TableScan + alias: v + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int), _col1 (type: int) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Left Semi Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1, _col3 + residual filter predicates: {(_col1 <> _col3)} + Statistics: Num rows: 6 Data size: 52 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 52 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 52 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from tx1 u left semi join tx2 v on u.a=v.a and u.b <> v.b +PREHOOK: type: QUERY +PREHOOK: Input: default@tx1 +PREHOOK: Input: default@tx2 +#### A masked pattern was here #### +POSTHOOK: query: select * from tx1 u left semi join tx2 v on u.a=v.a and u.b <> v.b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tx1 +POSTHOOK: Input: default@tx2 +#### A masked pattern was here #### +1 105 +2 203 +Warning: Shuffle Join MERGEJOIN[11][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.a=v.a or u.b <> v.b +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from tx1 u left semi join tx2 v on u.a=v.a or u.b <> v.b +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Map 3 (XPROD_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: u + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int) + Execution mode: llap + LLAP IO: no inputs + Map 3 + Map Operator Tree: + TableScan + alias: v + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int), _col1 (type: int) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Left Semi Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3 + residual filter predicates: {((_col0 = _col2) or (_col1 <> _col3))} + Statistics: Num rows: 30 Data size: 510 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 30 Data size: 510 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 30 Data size: 510 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[11][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select * from tx1 u left semi join tx2 v on u.a=v.a or u.b <> v.b +PREHOOK: type: QUERY +PREHOOK: Input: default@tx1 +PREHOOK: Input: default@tx2 +#### A masked pattern was here #### +POSTHOOK: query: select * from tx1 u left semi join tx2 v on u.a=v.a or u.b <> v.b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tx1 +POSTHOOK: Input: default@tx2 +#### A masked pattern was here #### +1 105 +2 203 +3 300 +4 400 +NULL 400 +PREHOOK: query: explain +select * from tx1 u left semi join tx1 v on u.a=v.a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from tx1 u left semi join tx1 v on u.a=v.a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: u + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + Execution mode: llap + LLAP IO: no inputs + Map 3 + Map Operator Tree: + TableScan + alias: v + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Left Semi Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 44 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 44 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from tx1 u left semi join tx1 v on u.a=v.a +PREHOOK: type: QUERY +PREHOOK: Input: default@tx1 +#### A masked pattern was here #### +POSTHOOK: query: select * from tx1 u left semi join tx1 v on u.a=v.a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tx1 +#### A masked pattern was here #### +1 105 +2 203 +3 300 +4 400 +PREHOOK: query: explain +select * from tx1 u left semi join tx2 v +on (u.a + v.b > 400) + and ((case when u.a > 3 then true when v.b > 1900 then true else false end) + or (coalesce(u.a) + coalesce(v.b) > 1900)) + and u.a = v.a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from tx1 u left semi join tx2 v +on (u.a + v.b > 400) + and ((case when u.a > 3 then true when v.b > 1900 then true else false end) + or (coalesce(u.a) + coalesce(v.b) > 1900)) + and u.a = v.a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: u + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + Execution mode: llap + LLAP IO: no inputs + Map 3 + Map Operator Tree: + TableScan + alias: v + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col3 + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int), _col3 (type: int), _col3 (type: int), _col3 (type: int) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Left Semi Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1, _col3 + residual filter predicates: {((_col0 + _col3) > 400)} {(CASE WHEN ((_col0 > 3)) THEN (true) WHEN ((_col3 > 1900)) THEN (true) ELSE (false) END or ((COALESCE(_col0) + COALESCE(_col3)) > 1900))} + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from tx1 u left semi join tx2 v +on (u.a + v.b > 400) + and ((case when u.a > 3 then true when v.b > 1900 then true else false end) + or (coalesce(u.a) + coalesce(v.b) > 1900)) + and u.a = v.a +PREHOOK: type: QUERY +PREHOOK: Input: default@tx1 +PREHOOK: Input: default@tx2 +#### A masked pattern was here #### +POSTHOOK: query: select * from tx1 u left semi join tx2 v +on (u.a + v.b > 400) + and ((case when u.a > 3 then true when v.b > 1900 then true else false end) + or (coalesce(u.a) + coalesce(v.b) > 1900)) + and u.a = v.a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tx1 +POSTHOOK: Input: default@tx2 +#### A masked pattern was here #### +1 105 +2 203 +4 400