diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index c0ccbee4ec..9cd80ae7fd 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2160,7 +2160,8 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal "Whether Hive enables the optimization about converting common join into mapjoin based on the input file size. \n" + "If this parameter is on, and the sum of size for n-1 of the tables/partitions for a n-way join is smaller than the\n" + "specified size, the join is directly converted to a mapjoin (there is no conditional task)."), - + HIVE_CONVERT_ANTI_JOIN("hive.auto.convert.anti.join", false, + "Whether Hive enables the optimization about converting join with null filter to anti join"), HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD("hive.auto.convert.join.noconditionaltask.size", 10000000L, "If hive.auto.convert.join.noconditionaltask is off, this parameter does not take affect. \n" + diff --git a/parser/src/java/org/apache/hadoop/hive/ql/parse/FromClauseParser.g b/parser/src/java/org/apache/hadoop/hive/ql/parse/FromClauseParser.g index e6b6fd6149..3c6bd7eb2b 100644 --- a/parser/src/java/org/apache/hadoop/hive/ql/parse/FromClauseParser.g +++ b/parser/src/java/org/apache/hadoop/hive/ql/parse/FromClauseParser.g @@ -145,6 +145,7 @@ joinToken | KW_RIGHT (KW_OUTER)? KW_JOIN -> TOK_RIGHTOUTERJOIN | KW_FULL (KW_OUTER)? KW_JOIN -> TOK_FULLOUTERJOIN | KW_LEFT KW_SEMI KW_JOIN -> TOK_LEFTSEMIJOIN + | KW_ANTI KW_JOIN -> TOK_ANTIJOIN ; lateralView diff --git a/parser/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g b/parser/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g index ec054b8d0f..f5b592274f 100644 --- a/parser/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g +++ b/parser/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g @@ -32,6 +32,7 @@ tokens { TOK_STREAMTABLE; TOK_HINTARGLIST; TOK_LEFTSEMIJOIN; + TOK_ANTIJOIN; } @header { diff --git a/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g b/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g index 0f9caaecc9..fb0127d17f 100644 --- a/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g +++ b/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g @@ -300,6 +300,7 @@ TOK_USERSCRIPTCOLSCHEMA; TOK_RECORDREADER; TOK_RECORDWRITER; TOK_LEFTSEMIJOIN; +TOK_ANTIJOIN; TOK_LATERAL_VIEW; TOK_LATERAL_VIEW_OUTER; TOK_TABALIAS; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java index 7a7c8a5aa5..52ec42a1d3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java @@ -153,6 +153,8 @@ transient boolean hasLeftSemiJoin = false; + transient boolean hasAntiJoin = false; + protected transient int countAfterReport; protected transient int heartbeatInterval; protected static final int NOTSKIPBIGTABLE = -1; @@ -365,6 +367,8 @@ protected void initializeOp(Configuration hconf) throws HiveException { for( int i = 0; i < condn.length; i++ ) { if(condn[i].getType() == JoinDesc.LEFT_SEMI_JOIN) { hasLeftSemiJoin = true; + } else if(condn[i].getType() == JoinDesc.ANTI_JOIN) { + hasAntiJoin = true; } } @@ -524,6 +528,9 @@ private boolean createForwardJoinObject(boolean[] skip) throws HiveException { } } if (forward) { + if (hasAntiJoin) { + // TODO : 
Handle hasAntiJoin + } if (needsPostEvaluation) { forward = !JoinUtil.isFiltered(forwardCache, residualJoinFilters, residualJoinFiltersOIs); } @@ -620,10 +627,22 @@ private void genObject(int aliasNum, boolean allLeftFirst, boolean allLeftNull) boolean rightFirst = true; AbstractRowContainer.RowIterator> iter = aliasRes.rowIter(); int pos = 0; + + // Keep a copy of the skip vector and update the bit for current alias only in the loop. + System.arraycopy(prevSkip, 0, skip, 0, prevSkip.length); + + // For anti join, we should proceed to emit records if the right side is empty + if (!aliasRes.hasRows() && type == JoinDesc.ANTI_JOIN) { + skip[right] = true; + if (aliasNum == numAliases - 1) { + createForwardJoinObject(skipVectors[numAliases - 1]); + } else { + genObject(aliasNum + 1, allLeftFirst, allLeftNull); + } + } + for (List rightObj = iter.first(); !done && rightObj != null; rightObj = loopAgain ? rightObj : iter.next(), rightFirst = loopAgain = false, pos++) { - System.arraycopy(prevSkip, 0, skip, 0, prevSkip.length); - boolean rightNull = rightObj == dummyObj[aliasNum]; if (hasFilter(order[aliasNum])) { filterTags[aliasNum] = getFilterTag(rightObj); @@ -638,6 +657,12 @@ private void genObject(int aliasNum, boolean allLeftFirst, boolean allLeftNull) // skipping the rest of the rows in the rhs table of the semijoin done = !needsPostEvaluation; } + } else if (type == JoinDesc.ANTI_JOIN) { + if (innerJoin(skip, left, right)) { + // if anti join found a match then the condition is not matched for anti join, so we can skip rest of the + // record. But if there is some post evaluation we have to handle that. + done = !needsPostEvaluation; + } } else if (type == JoinDesc.LEFT_OUTER_JOIN || (type == JoinDesc.FULL_OUTER_JOIN && rightNull)) { int result = leftOuterJoin(skip, left, right); @@ -938,11 +963,17 @@ protected void checkAndGenObject() throws HiveException { if (noOuterJoin) { if (!alw.hasRows()) { - return; + if (i != 0 && condn[i-1].getType() == JoinDesc.ANTI_JOIN) { + //TODO : Can both anti join and outer join be present ? + hasEmpty = true; + } else { + return; + } } else if (!alw.isSingleRow()) { mayHasMoreThanOne = true; } } else { + //TODO : Do we need to handle anti join here ? if (!alw.hasRows()) { hasEmpty = true; alw.addRow(dummyObj[i]); @@ -966,6 +997,12 @@ protected void checkAndGenObject() throws HiveException { } } + // For anti join right side table should be empty. + // TODO : What if hasEmpty is set for outer join ? 
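+      // Returning here suppresses output for anti join when the right-side alias produced rows
+      // for this key; rows are forwarded below only when the right side was empty.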
+ if (hasAntiJoin && !hasEmpty) { + return; + } + if (!needsPostEvaluation && !hasEmpty && !mayHasMoreThanOne) { genAllOneUniqueJoinObject(); } else if (!needsPostEvaluation && !hasEmpty && !hasLeftSemiJoin) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java index 47ae047b24..c55eedf8ad 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java @@ -97,7 +97,7 @@ public void process(Object row, int tag) throws HiveException { List keyObject = (List) soi.getStructFieldData(row, sf); // Are we consuming too much memory if (alias == numAliases - 1 && !(handleSkewJoin && skewJoinKeyContext.currBigKeyTag >= 0) && - !hasLeftSemiJoin) { + !hasLeftSemiJoin && !hasAntiJoin) { if (sz == joinEmitInterval && !hasFilter(condn[alias-1].getLeft()) && !hasFilter(condn[alias-1].getRight())) { // The input is sorted by alias, so if we are already in the last join diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinAntiJoinGenerateResultOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinAntiJoinGenerateResultOperator.java new file mode 100644 index 0000000000..7e8c147daa --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinAntiJoinGenerateResultOperator.java @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin; + +import org.apache.hadoop.hive.ql.CompilationOpContext; +import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSet; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.VectorDesc; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; + +// TODO : This class is duplicate of semi join. Need to do a refactoring to merge it with semi join. +/** + * This class has methods for generating vectorized join results for Anti joins. + * The big difference between inner joins and anti joins is existence testing. + * Inner joins use a hash map to lookup the 1 or more small table values. 
+ * Anti joins are a specialized join for outputting big table rows whose key does not exist
+ * in the small table.
+ *
+ * No small table values are needed for anti join since only key existence matters. So,
+ * we use a hash set as the hash table. Hash sets just report whether a key exists. This
+ * is a big performance optimization.
+ */
+public abstract class VectorMapJoinAntiJoinGenerateResultOperator
+        extends VectorMapJoinGenerateResultOperator {
+
+  private static final long serialVersionUID = 1L;
+  private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinAntiJoinGenerateResultOperator.class.getName());
+
+  // Anti join specific members.
+
+  // An array of hash set results so we can do lookups on the whole batch before output result
+  // generation.
+  protected transient VectorMapJoinHashSetResult hashSetResults[];
+
+  // Pre-allocated member for storing the (physical) batch indexes of matching rows during a
+  // process call.
+  protected transient int[] allMatchs;
+
+  // Pre-allocated member for storing the (physical) batch index of rows that need to be spilled.
+  protected transient int[] spills;
+
+  // Pre-allocated member for storing index into the hashSetResults for each spilled row.
+  protected transient int[] spillHashMapResultIndices;
+
+  /** Kryo ctor. */
+  protected VectorMapJoinAntiJoinGenerateResultOperator() {
+    super();
+  }
+
+  public VectorMapJoinAntiJoinGenerateResultOperator(CompilationOpContext ctx) {
+    super(ctx);
+  }
+
+  public VectorMapJoinAntiJoinGenerateResultOperator(CompilationOpContext ctx, OperatorDesc conf,
+      VectorizationContext vContext, VectorDesc vectorDesc) throws HiveException {
+    super(ctx, conf, vContext, vectorDesc);
+  }
+
+  /*
+   * Setup our anti join specific members.
+   */
+  protected void commonSetup() throws HiveException {
+    super.commonSetup();
+
+    // Anti join specific.
+    VectorMapJoinHashSet baseHashSet = (VectorMapJoinHashSet) vectorMapJoinHashTable;
+
+    hashSetResults = new VectorMapJoinHashSetResult[VectorizedRowBatch.DEFAULT_SIZE];
+    for (int i = 0; i < hashSetResults.length; i++) {
+      hashSetResults[i] = baseHashSet.createHashSetResult();
+    }
+
+    allMatchs = new int[VectorizedRowBatch.DEFAULT_SIZE];
+
+    spills = new int[VectorizedRowBatch.DEFAULT_SIZE];
+    spillHashMapResultIndices = new int[VectorizedRowBatch.DEFAULT_SIZE];
+  }
+
+  //-----------------------------------------------------------------------------------------------
+
+  /*
+   * Anti join (hash set).
+   */
+
+  /**
+   * Generate the anti join output results for one vectorized row batch. The hash table result
+   * was reversed during the lookup for anti join, so here a match means the big table row has
+   * no match in the small table and can be emitted.
+   *
+   * @param batch
+   *          The big table batch with any matching and any non matching rows both as
+   *          selected in use.
+   * @param allMatchCount
+   *          Number of matches in allMatchs.
+   * @param spillCount
+   *          Number of spills in spills.
+   * @param hashTableResults
+   *          The array of all hash table results for the batch. We need the
+   *          VectorMapJoinHashTableResult for the spill information.
+   */
+  protected void finishAnti(VectorizedRowBatch batch,
+      int allMatchCount, int spillCount,
+      VectorMapJoinHashTableResult[] hashTableResults) throws HiveException, IOException {
+
+    // Get rid of spills before we start modifying the batch.
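+    // Rows routed to spilled hash partitions are saved for a later pass and removed from the
+    // batch here, so they are not emitted now.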
+ if (spillCount > 0) { + spillHashMapBatch(batch, hashTableResults, + spills, spillHashMapResultIndices, spillCount); + } + + /* + * Optimize by running value expressions only over the matched rows. + */ + if (allMatchCount > 0 && bigTableValueExpressions != null) { + performValueExpressions(batch, allMatchs, allMatchCount); + } + + batch.size = generateHashSetResults(batch, allMatchs, allMatchCount); + batch.selectedInUse = true; + } + + /** + * Generate the matching anti join output results of a vectorized row batch. + * + * @param batch + * The big table batch. + * @param allMatches + * A subset of the rows of the batch that are matches. + * @param allMatchCount + * Number of matches in allMatches. + */ + private int generateHashSetResults(VectorizedRowBatch batch, + int[] allMatches, int allMatchCount) { + int numSel = 0; + // Generate result within big table batch itself. + for (int i = 0; i < allMatchCount; i++) { + int batchIndex = allMatchs[i]; + // Use the big table row as output. + batch.selected[numSel++] = batchIndex; + } + return numSel; + } + + /** + * Generate the anti join output results for one vectorized row batch with a repeated key. + * + * @param batch + * The big table batch whose repeated key matches. + */ + protected int generateHashSetResultRepeatedAll(VectorizedRowBatch batch) { + if (batch.selectedInUse) { + // The selected array is already filled in as we want it. + } else { + int[] selected = batch.selected; + for (int i = 0; i < batch.size; i++) { + selected[i] = i; + } + batch.selectedInUse = true; + } + return batch.size; + } + + protected void finishAntiRepeated(VectorizedRowBatch batch, JoinUtil.JoinResult joinResult, + VectorMapJoinHashTableResult hashSetResult) throws HiveException, IOException { + switch (joinResult) { + case MATCH: + + if (bigTableValueExpressions != null) { + // Run our value expressions over whole batch. + for(VectorExpression ve: bigTableValueExpressions) { + ve.evaluate(batch); + } + } + + // Generate special repeated case. + batch.size = generateHashSetResultRepeatedAll(batch); + batch.selectedInUse = true; + break; + + case SPILL: + // Whole batch is spilled. + spillBatchRepeated(batch, hashSetResult); + batch.size = 0; + break; + + case NOMATCH: + // No match for entire batch. + batch.size = 0; + break; + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinAntiJoinLongOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinAntiJoinLongOperator.java new file mode 100644 index 0000000000..8424b14683 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinAntiJoinLongOperator.java @@ -0,0 +1,315 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin; + +import org.apache.hadoop.hive.ql.CompilationOpContext; +import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashSet; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.VectorDesc; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; + +// TODO : Duplicate codes need to merge with semi join. +// Single-Column Long hash table import. +// Single-Column Long specific imports. + +/* + * Specialized class for doing a vectorized map join that is an anti join on a Single-Column Long + * using a hash set. + */ +public class VectorMapJoinAntiJoinLongOperator extends VectorMapJoinAntiJoinGenerateResultOperator { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorMapJoinAntiJoinLongOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + // The above members are initialized by the constructor and must not be + // transient. + + // The hash map for this specialized class. + private transient VectorMapJoinLongHashSet hashSet; + + // Single-Column Long specific members. + // For integers, we have optional min/max filtering. + private transient boolean useMinMax; + private transient long min; + private transient long max; + + // The column number for this one column join specialization. + private transient int singleJoinColumn; + + // Pass-thru constructors. + /** Kryo ctor. */ + protected VectorMapJoinAntiJoinLongOperator() { + super(); + } + + public VectorMapJoinAntiJoinLongOperator(CompilationOpContext ctx) { + super(ctx); + } + + public VectorMapJoinAntiJoinLongOperator(CompilationOpContext ctx, OperatorDesc conf, + VectorizationContext vContext, VectorDesc vectorDesc) throws HiveException { + super(ctx, conf, vContext, vectorDesc); + } + + // Process Single-Column Long Anti Join on a vectorized row batch. + @Override + protected void commonSetup() throws HiveException { + super.commonSetup(); + + // Initialize Single-Column Long members for this specialized class. + singleJoinColumn = bigTableKeyColumnMap[0]; + } + + @Override + public void hashTableSetup() throws HiveException { + super.hashTableSetup(); + + // Get our Single-Column Long hash set information for this specialized class. + hashSet = (VectorMapJoinLongHashSet) vectorMapJoinHashTable; + useMinMax = hashSet.useMinMax(); + if (useMinMax) { + min = hashSet.min(); + max = hashSet.max(); + } + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws HiveException { + + try { + // (Currently none) + // antiPerBatchSetup(batch); + + // For anti joins, we may apply the filter(s) now. + for(VectorExpression ve : bigTableFilterExpressions) { + ve.evaluate(batch); + } + + final int inputLogicalSize = batch.size; + if (inputLogicalSize == 0) { + return; + } + + // Perform any key expressions. Results will go into scratch columns. 
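+      // When present, key expressions compute the final join key values into the scratch
+      // columns that bigTableKeyColumnMap points at, which the lookup below reads.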
+ if (bigTableKeyExpressions != null) { + for (VectorExpression ve : bigTableKeyExpressions) { + ve.evaluate(batch); + } + } + + // The one join column for this specialized class. + LongColumnVector joinColVector = (LongColumnVector) batch.cols[singleJoinColumn]; + long[] vector = joinColVector.vector; + + // Check single column for repeating. + boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + + if (allKeyInputColumnsRepeating) { + // All key input columns are repeating. Generate key once. Lookup once. + // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + JoinUtil.JoinResult joinResult; + if (!joinColVector.noNulls && joinColVector.isNull[0]) { + // For anti join, if the right side is null then its a match. + joinResult = JoinUtil.JoinResult.MATCH; + } else { + long key = vector[0]; + if (useMinMax && (key < min || key > max)) { + // Out of range for whole batch. Its a match for anti join. We can emit the row. + joinResult = JoinUtil.JoinResult.MATCH; + } else { + joinResult = hashSet.contains(key, hashSetResults[0]); + // reverse the join result for anti join. + if (joinResult == JoinUtil.JoinResult.NOMATCH) { + joinResult = JoinUtil.JoinResult.MATCH; + } else if (joinResult == JoinUtil.JoinResult.MATCH) { + joinResult = JoinUtil.JoinResult.NOMATCH; + } + } + } + + // Common repeated join result processing. + if (LOG.isDebugEnabled()) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); + } + finishAntiRepeated(batch, joinResult, hashSetResults[0]); + } else { + // NOT Repeating. + + if (LOG.isDebugEnabled()) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); + } + + // We remember any matching rows in matches / matchSize. At the end of the loop, + // selected / batch.size will represent both matching and non-matching rows for outer join. + // Only deferred rows will have been removed from selected. + int selected[] = batch.selected; + boolean selectedInUse = batch.selectedInUse; + + int hashSetResultCount = 0; + int allMatchCount = 0; + int spillCount = 0; + long saveKey = 0; + + // We optimize performance by only looking up the first key in a series of equal keys. + boolean haveSaveKey = false; + JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + + // Logical loop over the rows in the batch since the batch may have selected in use. + for (int logical = 0; logical < inputLogicalSize; logical++) { + int batchIndex = (selectedInUse ? selected[logical] : logical); + + // Single-Column Long get key. + long currentKey; + boolean isNull; + if (!joinColVector.noNulls && joinColVector.isNull[batchIndex]) { + currentKey = 0; + isNull = true; + } else { + currentKey = vector[batchIndex]; + isNull = false; + } + + // Equal key series checking. + if (isNull || !haveSaveKey || currentKey != saveKey) { + // New key. + if (haveSaveKey) { + // Move on with our counts. + switch (saveJoinResult) { + case MATCH: + // We have extracted the existence from the hash set result, so we don't keep it. + break; + case SPILL: + // We keep the hash set result for its spill information. + hashSetResultCount++; + break; + case NOMATCH: + break; + } + } + + if (isNull) { + saveJoinResult = JoinUtil.JoinResult.MATCH; + haveSaveKey = false; + } else { + // Regardless of our matching result, we keep that information to make multiple use + // of it for a possible series of equal keys. 
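+            // The hash set lookup below only answers whether the key exists in the small table;
+            // the result is flipped afterwards because anti join emits rows that have no match.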
+ haveSaveKey = true; + saveKey = currentKey; + if (useMinMax && (currentKey < min || currentKey > max)) { + // Key out of range for whole hash table, is a valid match for anti join. + saveJoinResult = JoinUtil.JoinResult.NOMATCH; + } else { + saveJoinResult = hashSet.contains(currentKey, hashSetResults[hashSetResultCount]); + } + + // Reverse the match result for anti join. + if (saveJoinResult == JoinUtil.JoinResult.NOMATCH) { + saveJoinResult = JoinUtil.JoinResult.MATCH; + } else if (saveJoinResult == JoinUtil.JoinResult.MATCH) { + saveJoinResult = JoinUtil.JoinResult.NOMATCH; + } + } + + // Common anti join result processing. + switch (saveJoinResult) { + case MATCH: + allMatchs[allMatchCount++] = batchIndex; + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); + break; + + case SPILL: + spills[spillCount] = batchIndex; + spillHashMapResultIndices[spillCount] = hashSetResultCount; + spillCount++; + break; + + case NOMATCH: + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); + break; + } + } else { + // Series of equal keys. + switch (saveJoinResult) { + case MATCH: + allMatchs[allMatchCount++] = batchIndex; + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); + break; + + case SPILL: + spills[spillCount] = batchIndex; + spillHashMapResultIndices[spillCount] = hashSetResultCount; + spillCount++; + break; + + case NOMATCH: + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); + break; + } + } + } + + if (haveSaveKey) { + // Update our counts for the last key. + switch (saveJoinResult) { + case MATCH: + // We have extracted the existence from the hash set result, so we don't keep it. + break; + case SPILL: + // We keep the hash set result for its spill information. + hashSetResultCount++; + break; + case NOMATCH: + break; + } + } + + if (LOG.isDebugEnabled()) { + LOG.debug(CLASS_NAME + + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + + " spills " + intArrayToRangesString(spills, spillCount) + + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount))); + } + + finishAnti(batch, allMatchCount, spillCount, hashSetResults); + } + + if (batch.size > 0) { + // Forward any remaining selected rows. + forwardBigTableBatch(batch); + } + + } catch (Exception e) { + throw new HiveException(e); + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinAntiJoinMultiKeyOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinAntiJoinMultiKeyOperator.java new file mode 100644 index 0000000000..cec5c29bb3 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinAntiJoinMultiKeyOperator.java @@ -0,0 +1,400 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin; + +import org.apache.hadoop.hive.ql.CompilationOpContext; +import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; +import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashSet; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.VectorDesc; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; + +// Multi-Key hash table import. +// Multi-Key specific imports. + +// TODO : Duplicate codes need to merge with semi join. +/* + * Specialized class for doing a vectorized map join that is an anti join on Multi-Key + * using hash set. + */ +public class VectorMapJoinAntiJoinMultiKeyOperator extends VectorMapJoinAntiJoinGenerateResultOperator { + + private static final long serialVersionUID = 1L; + + //------------------------------------------------------------------------------------------------ + + private static final String CLASS_NAME = VectorMapJoinAntiJoinMultiKeyOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ + + // (none) + + // The above members are initialized by the constructor and must not be + // transient. + //--------------------------------------------------------------------------- + + // The hash map for this specialized class. + private transient VectorMapJoinBytesHashSet hashSet; + + //--------------------------------------------------------------------------- + // Multi-Key specific members. + // + + // Object that can take a set of columns in row in a vectorized row batch and serialized it. + // Known to not have any nulls. + private transient VectorSerializeRow keyVectorSerializeWrite; + + // The BinarySortable serialization of the current key. + private transient Output currentKeyOutput; + + // The BinarySortable serialization of the saved key for a possible series of equal keys. + private transient Output saveKeyOutput; + + //--------------------------------------------------------------------------- + // Pass-thru constructors. + // + + /** Kryo ctor. 
*/ + protected VectorMapJoinAntiJoinMultiKeyOperator() { + super(); + } + + public VectorMapJoinAntiJoinMultiKeyOperator(CompilationOpContext ctx) { + super(ctx); + } + + public VectorMapJoinAntiJoinMultiKeyOperator(CompilationOpContext ctx, OperatorDesc conf, + VectorizationContext vContext, VectorDesc vectorDesc) throws HiveException { + super(ctx, conf, vContext, vectorDesc); + } + + //--------------------------------------------------------------------------- + // Process Multi-Key Anti Join on a vectorized row batch. + // + + @Override + protected void commonSetup() throws HiveException { + super.commonSetup(); + + /* + * Initialize Multi-Key members for this specialized class. + */ + + keyVectorSerializeWrite = new VectorSerializeRow(BinarySortableSerializeWrite.with( + this.getConf().getKeyTblDesc().getProperties(), bigTableKeyColumnMap.length)); + keyVectorSerializeWrite.init(bigTableKeyTypeInfos, bigTableKeyColumnMap); + + currentKeyOutput = new Output(); + saveKeyOutput = new Output(); + } + + @Override + public void hashTableSetup() throws HiveException { + super.hashTableSetup(); + + /* + * Get our Multi-Key hash set information for this specialized class. + */ + + hashSet = (VectorMapJoinBytesHashSet) vectorMapJoinHashTable; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws HiveException { + + try { + + // Do the per-batch setup for an anti join. + + // (Currently none) + // antiPerBatchSetup(batch); + + // For anti joins, we may apply the filter(s) now. + for(VectorExpression ve : bigTableFilterExpressions) { + ve.evaluate(batch); + } + + final int inputLogicalSize = batch.size; + if (inputLogicalSize == 0) { + return; + } + + // Perform any key expressions. Results will go into scratch columns. + if (bigTableKeyExpressions != null) { + for (VectorExpression ve : bigTableKeyExpressions) { + ve.evaluate(batch); + } + } + + /* + * Multi-Key specific declarations. + */ + + // None. + + /* + * Multi-Key Long check for repeating. + */ + + // If all BigTable input columns to key expressions are isRepeating, then + // calculate key once; lookup once. + boolean allKeyInputColumnsRepeating; + if (bigTableKeyColumnMap.length == 0) { + allKeyInputColumnsRepeating = false; + } else { + allKeyInputColumnsRepeating = true; + for (int i = 0; i < bigTableKeyColumnMap.length; i++) { + if (!batch.cols[bigTableKeyColumnMap[i]].isRepeating) { + allKeyInputColumnsRepeating = false; + break; + } + } + } + + if (allKeyInputColumnsRepeating) { + + /* + * Repeating. + */ + + // All key input columns are repeating. Generate key once. Lookup once. + // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + + /* + * Multi-Key specific repeated lookup. + */ + + keyVectorSerializeWrite.setOutput(currentKeyOutput); + keyVectorSerializeWrite.serializeWrite(batch, 0); + JoinUtil.JoinResult joinResult; + if (keyVectorSerializeWrite.getHasAnyNulls()) { + // If right side is null, its a match for anti join. + joinResult = JoinUtil.JoinResult.MATCH; + } else { + byte[] keyBytes = currentKeyOutput.getData(); + int keyLength = currentKeyOutput.getLength(); + // LOG.debug(CLASS_NAME + " processOp all " + displayBytes(keyBytes, 0, keyLength)); + joinResult = hashSet.contains(keyBytes, 0, keyLength, hashSetResults[0]); + // reverse the join result from hash table for anti join. 
+ if (joinResult == JoinUtil.JoinResult.NOMATCH) { + joinResult = JoinUtil.JoinResult.MATCH; + } else if (joinResult == JoinUtil.JoinResult.MATCH) { + joinResult = JoinUtil.JoinResult.NOMATCH; + } + } + + /* + * Common repeated join result processing. + */ + + if (LOG.isDebugEnabled()) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); + } + finishAntiRepeated(batch, joinResult, hashSetResults[0]); + } else { + + /* + * NOT Repeating. + */ + + if (LOG.isDebugEnabled()) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); + } + + // We remember any matching rows in matches / matchSize. At the end of the loop, + // selected / batch.size will represent both matching and non-matching rows for outer join. + // Only deferred rows will have been removed from selected. + int selected[] = batch.selected; + boolean selectedInUse = batch.selectedInUse; + + int hashSetResultCount = 0; + int allMatchCount = 0; + int spillCount = 0; + + /* + * Multi-Key specific variables. + */ + + Output temp; + + // We optimize performance by only looking up the first key in a series of equal keys. + boolean haveSaveKey = false; + JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + + // Logical loop over the rows in the batch since the batch may have selected in use. + for (int logical = 0; logical < inputLogicalSize; logical++) { + int batchIndex = (selectedInUse ? selected[logical] : logical); + + /* + * Multi-Key get key. + */ + + // Generate binary sortable key for current row in vectorized row batch. + keyVectorSerializeWrite.setOutput(currentKeyOutput); + keyVectorSerializeWrite.serializeWrite(batch, batchIndex); + boolean isAnyNull = keyVectorSerializeWrite.getHasAnyNulls(); + + // LOG.debug(CLASS_NAME + " currentKey " + + // VectorizedBatchUtil.displayBytes(currentKeyOutput.getData(), 0, currentKeyOutput.getLength())); + + /* + * Equal key series checking. + */ + + if (isAnyNull || !haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) { + + // New key. + + if (haveSaveKey) { + // Move on with our counts. + switch (saveJoinResult) { + case MATCH: + // We have extracted the existence from the hash set result, so we don't keep it. + break; + case SPILL: + // We keep the hash set result for its spill information. + hashSetResultCount++; + break; + case NOMATCH: + break; + } + } + + if (isAnyNull) { + saveJoinResult = JoinUtil.JoinResult.MATCH; + haveSaveKey = false; + } else { + // Regardless of our matching result, we keep that information to make multiple use + // of it for a possible series of equal keys. + haveSaveKey = true; + + /* + * Multi-Key specific save key and lookup. + */ + + temp = saveKeyOutput; + saveKeyOutput = currentKeyOutput; + currentKeyOutput = temp; + + /* + * Multi-key specific lookup key. + */ + + byte[] keyBytes = saveKeyOutput.getData(); + int keyLength = saveKeyOutput.getLength(); + saveJoinResult = hashSet.contains(keyBytes, 0, keyLength, hashSetResults[hashSetResultCount]); + if (saveJoinResult == JoinUtil.JoinResult.NOMATCH) { + saveJoinResult = JoinUtil.JoinResult.MATCH; + } else if (saveJoinResult == JoinUtil.JoinResult.MATCH) { + saveJoinResult = JoinUtil.JoinResult.NOMATCH; + } + } + + /* + * Common anti join result processing. 
+ */ + + switch (saveJoinResult) { + case MATCH: + allMatchs[allMatchCount++] = batchIndex; + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); + break; + + case SPILL: + spills[spillCount] = batchIndex; + spillHashMapResultIndices[spillCount] = hashSetResultCount; + spillCount++; + break; + + case NOMATCH: + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); + break; + } + } else { + // Series of equal keys. + + switch (saveJoinResult) { + case MATCH: + allMatchs[allMatchCount++] = batchIndex; + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); + break; + + case SPILL: + spills[spillCount] = batchIndex; + spillHashMapResultIndices[spillCount] = hashSetResultCount; + spillCount++; + break; + + case NOMATCH: + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); + break; + } + } + } + + if (haveSaveKey) { + // Update our counts for the last key. + switch (saveJoinResult) { + case MATCH: + // We have extracted the existence from the hash set result, so we don't keep it. + break; + case SPILL: + // We keep the hash set result for its spill information. + hashSetResultCount++; + break; + case NOMATCH: + break; + } + } + + if (LOG.isDebugEnabled()) { + LOG.debug(CLASS_NAME + + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + + " spills " + intArrayToRangesString(spills, spillCount) + + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount))); + } + + finishAnti(batch, allMatchCount, spillCount, hashSetResults); + } + + if (batch.size > 0) { + // Forward any remaining selected rows. + forwardBigTableBatch(batch); + } + } catch (Exception e) { + throw new HiveException(e); + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinAntiJoinStringOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinAntiJoinStringOperator.java new file mode 100644 index 0000000000..e66f67b4fe --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinAntiJoinStringOperator.java @@ -0,0 +1,371 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin; + +import org.apache.hadoop.hive.ql.CompilationOpContext; +import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashSet; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.VectorDesc; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; + +// Single-Column String hash table import. +// Single-Column String specific imports. + +// TODO : Duplicate codes need to merge with semi join. +/* + * Specialized class for doing a vectorized map join that is an anti join on a Single-Column String + * using a hash set. + */ +public class VectorMapJoinAntiJoinStringOperator extends VectorMapJoinAntiJoinGenerateResultOperator { + + private static final long serialVersionUID = 1L; + + //------------------------------------------------------------------------------------------------ + + private static final String CLASS_NAME = VectorMapJoinAntiJoinStringOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ + + // (none) + + // The above members are initialized by the constructor and must not be + // transient. + //--------------------------------------------------------------------------- + + // The hash map for this specialized class. + private transient VectorMapJoinBytesHashSet hashSet; + + //--------------------------------------------------------------------------- + // Single-Column String specific members. + // + + // The column number for this one column join specialization. + private transient int singleJoinColumn; + + //--------------------------------------------------------------------------- + // Pass-thru constructors. + // + + /** Kryo ctor. */ + protected VectorMapJoinAntiJoinStringOperator() { + super(); + } + + public VectorMapJoinAntiJoinStringOperator(CompilationOpContext ctx) { + super(ctx); + } + + public VectorMapJoinAntiJoinStringOperator(CompilationOpContext ctx, OperatorDesc conf, + VectorizationContext vContext, VectorDesc vectorDesc) throws HiveException { + super(ctx, conf, vContext, vectorDesc); + } + + //--------------------------------------------------------------------------- + // Process Single-Column String anti Join on a vectorized row batch. + // + + @Override + protected void commonSetup() throws HiveException { + super.commonSetup(); + + /* + * Initialize Single-Column String members for this specialized class. + */ + + singleJoinColumn = bigTableKeyColumnMap[0]; + } + + @Override + public void hashTableSetup() throws HiveException { + super.hashTableSetup(); + + /* + * Get our Single-Column String hash set information for this specialized class. 
+ */ + + hashSet = (VectorMapJoinBytesHashSet) vectorMapJoinHashTable; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws HiveException { + + try { + + // Do the per-batch setup for an anti join. + + // (Currently none) + // antiPerBatchSetup(batch); + + // For anti joins, we may apply the filter(s) now. + for(VectorExpression ve : bigTableFilterExpressions) { + ve.evaluate(batch); + } + + final int inputLogicalSize = batch.size; + if (inputLogicalSize == 0) { + return; + } + + // Perform any key expressions. Results will go into scratch columns. + if (bigTableKeyExpressions != null) { + for (VectorExpression ve : bigTableKeyExpressions) { + ve.evaluate(batch); + } + } + + /* + * Single-Column String specific declarations. + */ + + // The one join column for this specialized class. + BytesColumnVector joinColVector = (BytesColumnVector) batch.cols[singleJoinColumn]; + byte[][] vector = joinColVector.vector; + int[] start = joinColVector.start; + int[] length = joinColVector.length; + + /* + * Single-Column Long check for repeating. + */ + + // Check single column for repeating. + boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + + if (allKeyInputColumnsRepeating) { + + /* + * Repeating. + */ + + // All key input columns are repeating. Generate key once. Lookup once. + // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + + /* + * Single-Column String specific repeated lookup. + */ + + JoinUtil.JoinResult joinResult; + if (!joinColVector.noNulls && joinColVector.isNull[0]) { + joinResult = JoinUtil.JoinResult.MATCH; + } else { + byte[] keyBytes = vector[0]; + int keyStart = start[0]; + int keyLength = length[0]; + joinResult = hashSet.contains(keyBytes, keyStart, keyLength, hashSetResults[0]); + if (joinResult == JoinUtil.JoinResult.NOMATCH) { + joinResult = JoinUtil.JoinResult.MATCH; + } else if (joinResult == JoinUtil.JoinResult.MATCH) { + joinResult = JoinUtil.JoinResult.NOMATCH; + } + } + + /* + * Common repeated join result processing. + */ + + if (LOG.isDebugEnabled()) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); + } + finishAntiRepeated(batch, joinResult, hashSetResults[0]); + } else { + + /* + * NOT Repeating. + */ + + if (LOG.isDebugEnabled()) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); + } + + // We remember any matching rows in matchs / matchSize. At the end of the loop, + // selected / batch.size will represent both matching and non-matching rows for outer join. + // Only deferred rows will have been removed from selected. + int selected[] = batch.selected; + boolean selectedInUse = batch.selectedInUse; + + int hashSetResultCount = 0; + int allMatchCount = 0; + int spillCount = 0; + + /* + * Single-Column String specific variables. + */ + + int saveKeyBatchIndex = -1; + + // We optimize performance by only looking up the first key in a series of equal keys. + boolean haveSaveKey = false; + JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + + // Logical loop over the rows in the batch since the batch may have selected in use. + for (int logical = 0; logical < inputLogicalSize; logical++) { + int batchIndex = (selectedInUse ? selected[logical] : logical); + + /* + * Single-Column String get key. + */ + + // Implicit -- use batchIndex. + boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex]; + + /* + * Equal key series checking. 
+ */ + + if (isNull || !haveSaveKey || + !StringExpr.equal(vector[saveKeyBatchIndex], start[saveKeyBatchIndex], length[saveKeyBatchIndex], + vector[batchIndex], start[batchIndex], length[batchIndex])) { + + // New key. + + if (haveSaveKey) { + // Move on with our counts. + switch (saveJoinResult) { + case MATCH: + // We have extracted the existence from the hash set result, so we don't keep it. + break; + case SPILL: + // We keep the hash set result for its spill information. + hashSetResultCount++; + break; + case NOMATCH: + break; + } + } + + if (isNull) { + saveJoinResult = JoinUtil.JoinResult.NOMATCH; + haveSaveKey = false; + } else { + // Regardless of our matching result, we keep that information to make multiple use + // of it for a possible series of equal keys. + haveSaveKey = true; + + /* + * Single-Column String specific save key and lookup. + */ + + saveKeyBatchIndex = batchIndex; + + /* + * Single-Column String specific lookup key. + */ + + byte[] keyBytes = vector[batchIndex]; + int keyStart = start[batchIndex]; + int keyLength = length[batchIndex]; + saveJoinResult = hashSet.contains(keyBytes, keyStart, keyLength, hashSetResults[hashSetResultCount]); + if (saveJoinResult == JoinUtil.JoinResult.NOMATCH) { + saveJoinResult = JoinUtil.JoinResult.MATCH; + } else if (saveJoinResult == JoinUtil.JoinResult.MATCH) { + saveJoinResult = JoinUtil.JoinResult.NOMATCH; + } + } + + /* + * Common anti join result processing. + */ + + switch (saveJoinResult) { + case MATCH: + allMatchs[allMatchCount++] = batchIndex; + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); + break; + + case SPILL: + spills[spillCount] = batchIndex; + spillHashMapResultIndices[spillCount] = hashSetResultCount; + spillCount++; + break; + + case NOMATCH: + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); + break; + } + } else { + // Series of equal keys. + + switch (saveJoinResult) { + case MATCH: + allMatchs[allMatchCount++] = batchIndex; + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); + break; + + case SPILL: + spills[spillCount] = batchIndex; + spillHashMapResultIndices[spillCount] = hashSetResultCount; + spillCount++; + break; + + case NOMATCH: + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); + break; + } + } + } + + if (haveSaveKey) { + // Update our counts for the last key. + switch (saveJoinResult) { + case MATCH: + // We have extracted the existence from the hash set result, so we don't keep it. + break; + case SPILL: + // We keep the hash set result for its spill information. + hashSetResultCount++; + break; + case NOMATCH: + break; + } + } + + if (LOG.isDebugEnabled()) { + LOG.debug(CLASS_NAME + + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + + " spills " + intArrayToRangesString(spills, spillCount) + + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount))); + } + + finishAnti(batch, allMatchCount, spillCount, hashSetResults); + } + + if (batch.size > 0) { + // Forward any remaining selected rows. 
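+        // At this point batch.selected holds the big table rows left after the anti join
+        // reversal, i.e. rows without a small table match, so they can be forwarded as-is.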
+ forwardBigTableBatch(batch); + } + } catch (Exception e) { + throw new HiveException(e); + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java index 2d3daeb771..f43ba922ab 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java @@ -1418,7 +1418,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, Object.. private boolean skipFolding(JoinDesc joinDesc) { for (JoinCondDesc cond : joinDesc.getConds()) { if (cond.getType() == JoinDesc.INNER_JOIN || cond.getType() == JoinDesc.UNIQUE_JOIN - || cond.getType() == JoinDesc.LEFT_SEMI_JOIN) { + || cond.getType() == JoinDesc.LEFT_SEMI_JOIN || cond.getType() == JoinDesc.ANTI_JOIN) { continue; } return true; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java index 655b5f154d..90b23e9a73 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java @@ -954,6 +954,7 @@ private boolean hasOuterJoin(JoinOperator joinOp) throws SemanticException { switch (joinCondDesc.getType()) { case JoinDesc.INNER_JOIN: case JoinDesc.LEFT_SEMI_JOIN: + case JoinDesc.ANTI_JOIN: case JoinDesc.UNIQUE_JOIN: hasOuter = false; break; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java index 4f1c9b2640..c4cb839e24 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java @@ -752,7 +752,8 @@ public MapJoinOperator generateMapJoinOperator(ParseContext pctx, JoinOperator o seenPostitions.add(condn.getRight()); if (joinType == JoinDesc.LEFT_OUTER_JOIN - || joinType == JoinDesc.LEFT_SEMI_JOIN) { + || joinType == JoinDesc.LEFT_SEMI_JOIN + || joinType == JoinDesc.ANTI_JOIN) { seenOuterJoin = true; if(bigTableCandidates.size() == 0) { bigTableCandidates.add(condn.getLeft()); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java index 6be826e57a..585f361f83 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java @@ -46,6 +46,7 @@ import org.apache.calcite.tools.RelBuilderFactory; import org.apache.calcite.util.ImmutableBitSet; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAntiJoin; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveExcept; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveIntersect; @@ -170,6 +171,10 @@ public RelNode createJoin(RelNode left, RelNode right, RexNode condition, final RelOptCluster cluster = left.getCluster(); return HiveSemiJoin.getSemiJoin(cluster, left.getTraitSet(), left, right, condition); } + if (joinType == JoinRelType.ANTI) { + final RelOptCluster cluster = left.getCluster(); + return HiveAntiJoin.getAntiJoin(cluster, 
left.getTraitSet(), left, right, condition); + } return HiveJoin.getJoin(left.getCluster(), left, right, condition, joinType); } } @@ -188,6 +193,20 @@ public RelNode createSemiJoin(RelNode left, RelNode right, } } + /** + * Implementation of {@link AntiJoinFactory} that returns + * {@link org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAntiJoin} + * . + */ + private static class HiveAntiJoinFactoryImpl implements SemiJoinFactory { + @Override + public RelNode createSemiJoin(RelNode left, RelNode right, + RexNode condition) { + final RelOptCluster cluster = left.getCluster(); + return HiveAntiJoin.getAntiJoin(cluster, left.getTraitSet(), left, right, condition); + } + } + private static class HiveSortFactoryImpl implements SortFactory { @Override public RelNode createSort(RelTraitSet traits, RelNode input, RelCollation collation, diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelOptUtil.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelOptUtil.java index 1e2c1e2f87..8c7171fcce 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelOptUtil.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelOptUtil.java @@ -747,7 +747,7 @@ public static RewritablePKFKJoinInfo isRewritablePKFKJoin(Join join, final RelNode nonFkInput = leftInputPotentialFK ? join.getRight() : join.getLeft(); final RewritablePKFKJoinInfo nonRewritable = RewritablePKFKJoinInfo.of(false, null); - if (joinType != JoinRelType.INNER && !join.isSemiJoin()) { + if (joinType != JoinRelType.INNER && !join.isSemiJoin() && joinType != JoinRelType.ANTI) { // If it is not an inner, we transform it as the metadata // providers for expressions do not pull information through // outer join (as it would not be correct) @@ -854,7 +854,7 @@ public static RewritablePKFKJoinInfo isRewritablePKFKJoin(Join join, if (ecT.getEquivalenceClassesMap().containsKey(uniqueKeyColumnRef) && ecT.getEquivalenceClassesMap().get(uniqueKeyColumnRef).contains(foreignKeyColumnRef)) { if (foreignKeyColumnType.isNullable()) { - if (joinType == JoinRelType.INNER || join.isSemiJoin()) { + if (joinType == JoinRelType.INNER || join.isSemiJoin() || joinType == JoinRelType.ANTI) { // If it is nullable and it is an INNER, we just need a IS NOT NULL filter RexNode originalCondOp = refToRex.get(foreignKeyColumnRef); assert originalCondOp != null; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveSubQRemoveRelBuilder.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveSubQRemoveRelBuilder.java index a1d617be59..6f2e2dd2be 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveSubQRemoveRelBuilder.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveSubQRemoveRelBuilder.java @@ -1112,7 +1112,7 @@ public RexNode field(RexNode e, String name) { } public HiveSubQRemoveRelBuilder join(JoinRelType joinType, RexNode condition, - Set variablesSet, boolean createSemiJoin) { + Set variablesSet) { Frame right = stack.pop(); final Frame left = stack.pop(); final RelNode join; @@ -1138,13 +1138,8 @@ public HiveSubQRemoveRelBuilder join(JoinRelType joinType, RexNode condition, default: postCondition = condition; } - if(createSemiJoin) { - join = correlateFactory.createCorrelate(left.rel, right.rel, id, - requiredColumns, JoinRelType.SEMI); - } else { - join = correlateFactory.createCorrelate(left.rel, right.rel, id, - requiredColumns, joinType); - } + join = correlateFactory.createCorrelate(left.rel, right.rel, id, + 
requiredColumns, joinType); } else { join = joinFactory.createJoin(left.rel, right.rel, condition, variablesSet, joinType, false); @@ -1156,14 +1151,7 @@ public HiveSubQRemoveRelBuilder join(JoinRelType joinType, RexNode condition, filter(postCondition); return this; } - - /** Creates a {@link org.apache.calcite.rel.core.Join} with correlating - * variables. */ - public HiveSubQRemoveRelBuilder join(JoinRelType joinType, RexNode condition, - Set variablesSet) { - return join(joinType, condition, variablesSet, false); - } - + /** Creates a {@link org.apache.calcite.rel.core.Join} using USING syntax. * *
For each of the field names, both left and right inputs must have a diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveAntiJoin.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveAntiJoin.java new file mode 100644 index 0000000000..d689323d77 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveAntiJoin.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.reloperators; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; +import org.apache.calcite.plan.RelOptCluster; +import org.apache.calcite.plan.RelTraitSet; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Join; +import org.apache.calcite.rel.core.JoinRelType; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexNode; +import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelOptUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRulesRegistry; + +import java.util.ArrayList; +import java.util.List; + +public class HiveAntiJoin extends Join implements HiveRelNode { + + private final RexNode joinFilter; + + public static HiveAntiJoin getAntiJoin( + RelOptCluster cluster, + RelTraitSet traitSet, + RelNode left, + RelNode right, + RexNode condition) { + try { + return new HiveAntiJoin(cluster, traitSet, left, right, condition); + } catch (CalciteSemanticException e) { + throw new RuntimeException(e); + } + } + + protected HiveAntiJoin(RelOptCluster cluster, + RelTraitSet traitSet, + RelNode left, + RelNode right, + RexNode condition) throws CalciteSemanticException { + super(cluster, traitSet, left, right, condition, JoinRelType.ANTI, Sets.newHashSet()); + final List systemFieldList = ImmutableList.of(); + List> joinKeyExprs = new ArrayList>(); + List filterNulls = new ArrayList(); + for (int i=0; i()); + } + this.joinFilter = HiveRelOptUtil.splitHiveJoinCondition(systemFieldList, this.getInputs(), + this.getCondition(), joinKeyExprs, filterNulls, null); + } + + public RexNode getJoinFilter() { + return joinFilter; + } + + @Override + public HiveAntiJoin copy(RelTraitSet traitSet, RexNode condition, + RelNode left, RelNode right, JoinRelType joinType, boolean semiJoinDone) { + try { + HiveAntiJoin antiJoin = new HiveAntiJoin(getCluster(), traitSet, left, right, condition); + // If available, copy state to registry for optimization rules + HiveRulesRegistry registry = antiJoin.getCluster().getPlanner().getContext().unwrap(HiveRulesRegistry.class); + if (registry != null) { + registry.copyPushedPredicates(this, antiJoin); + } + 
return antiJoin; + } catch (CalciteSemanticException e) { + // Semantic error not possible. Must be a bug. Convert to + // internal error. + throw new AssertionError(e); + } + } + + @Override + public void implement(Implementor implementor) { + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinAddNotNullRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinAddNotNullRule.java index b2ff255d7c..0026b1a4d6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinAddNotNullRule.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinAddNotNullRule.java @@ -91,8 +91,8 @@ public void onMatch(RelOptRuleCall call) { Set leftPushedPredicates = Sets.newHashSet(registry.getPushedPredicates(join, 0)); Set rightPushedPredicates = Sets.newHashSet(registry.getPushedPredicates(join, 1)); - boolean genPredOnLeft = join.getJoinType() == JoinRelType.RIGHT || join.getJoinType() == JoinRelType.INNER || join.isSemiJoin(); - boolean genPredOnRight = join.getJoinType() == JoinRelType.LEFT || join.getJoinType() == JoinRelType.INNER || join.isSemiJoin(); + boolean genPredOnLeft = join.getJoinType() == JoinRelType.RIGHT || join.getJoinType() == JoinRelType.INNER || join.isSemiJoin() || join.getJoinType() == JoinRelType.ANTI; + boolean genPredOnRight = join.getJoinType() == JoinRelType.LEFT || join.getJoinType() == JoinRelType.INNER || join.isSemiJoin()|| join.getJoinType() == JoinRelType.ANTI; RexNode newLeftPredicate = getNewPredicate(join, registry, joinPredInfo, leftPushedPredicates, genPredOnLeft, 0); RexNode newRightPredicate = getNewPredicate(join, registry, joinPredInfo, rightPushedPredicates, genPredOnRight, 1); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinConstraintsRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinConstraintsRule.java index 8acecfccc2..7d1e26a5da 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinConstraintsRule.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinConstraintsRule.java @@ -100,7 +100,8 @@ public void onMatch(RelOptRuleCall call) { // These boolean values represent corresponding left, right input which is potential FK boolean leftInputPotentialFK = topRefs.intersects(leftBits); boolean rightInputPotentialFK = topRefs.intersects(rightBits); - if (leftInputPotentialFK && rightInputPotentialFK && (joinType == JoinRelType.INNER || joinType == JoinRelType.SEMI)) { + if (leftInputPotentialFK && rightInputPotentialFK && + (joinType == JoinRelType.INNER || joinType == JoinRelType.SEMI || joinType == JoinRelType.ANTI)) { // Both inputs are referenced. Before making a decision, try to swap // references in join condition if it is an inner join, i.e. 
if a join // condition column is referenced above the join, then we can just @@ -183,6 +184,7 @@ public void onMatch(RelOptRuleCall call) { switch (joinType) { case SEMI: case INNER: + case ANTI: if (leftInputPotentialFK && rightInputPotentialFK) { // Bails out as it references columns from both sides (or no columns) // and there is nothing to transform diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinWithFilterToAntiJoinRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinWithFilterToAntiJoinRule.java new file mode 100644 index 0000000000..f4a4364726 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinWithFilterToAntiJoinRule.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.plan.RelOptUtil; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Filter; +import org.apache.calcite.rel.core.Join; +import org.apache.calcite.rel.core.JoinRelType; +import org.apache.calcite.rel.core.Project; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexInputRef; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.util.ImmutableBitSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +/** + * Planner rule that converts a join plus filter to anti join. + */ +public class HiveJoinWithFilterToAntiJoinRule extends RelOptRule { + protected static final Logger LOG = LoggerFactory.getLogger(HiveJoinWithFilterToAntiJoinRule.class); + public static final HiveJoinWithFilterToAntiJoinRule INSTANCE = new HiveJoinWithFilterToAntiJoinRule(); + + // HiveProject(fld=[$0]) + // HiveFilter(condition=[IS NULL($1)]) + // HiveJoin(condition=[=($0, $1)], joinType=[left], algorithm=[none], cost=[not available]) + // + // TO + // + // HiveProject(fld_tbl=[$0]) + // HiveAntiJoin(condition=[=($0, $1)], joinType=[anti]) + // + public HiveJoinWithFilterToAntiJoinRule() { + super(operand(Project.class, operand(Filter.class, operand(Join.class, RelOptRule.any()))), + "HiveJoinWithFilterToAntiJoinRule:filter"); + } + + // is null filter over a left join. 
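+  // For illustration, assuming placeholder tables t1 and t2 joined on an integer key column,
+  // a query such as
+  //   SELECT a.key, a.value FROM t1 a LEFT JOIN t2 b ON a.key = b.key WHERE b.key IS NULL
+  // produces the Project over Filter(IS NULL) over left Join shape sketched above and can be
+  // rewritten to an anti join that keeps only the left rows with no matching key on the right.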
+ public void onMatch(final RelOptRuleCall call) { + final Project project = call.rel(0); + final Filter filter = call.rel(1); + final Join join = call.rel(2); + perform(call, project, filter, join); + } + + protected void perform(RelOptRuleCall call, Project project, Filter filter, Join join) { + LOG.debug("Matched HiveAntiJoinRule"); + + assert (filter != null); + + //We support conversion from left outer join only. + if (join.getJoinType() != JoinRelType.LEFT) { + return; + } + + List aboveFilters = RelOptUtil.conjunctions(filter.getCondition()); + boolean hasIsNull = false; + + // Get all filter condition and check if any of them is a "is null" kind. + for (RexNode filterNode : aboveFilters) { + if (filterNode.getKind() == SqlKind.IS_NULL && + isFilterFromRightSide(join, filterNode, join.getJoinType())) { + hasIsNull = true; + break; + } + } + + // Is null should be on a key from right side of the join. + if (!hasIsNull) { + return; + } + + // Build anti join with same left, right child and condition as original left outer join. + Join anti = join.copy(join.getTraitSet(), join.getCondition(), + join.getLeft(), join.getRight(), JoinRelType.ANTI, false); + + //TODO : Do we really need it + call.getPlanner().onCopy(join, anti); + + RelNode newProject = getNewProjectNode(project, anti); + if (newProject != null) { + call.getPlanner().onCopy(project, newProject); + call.transformTo(newProject); + } + } + + protected RelNode getNewProjectNode(Project oldProject, Join newJoin) { + List newJoinFiledList = newJoin.getRowType().getFieldList(); + List newProjectExpr = new ArrayList<>(); + for (RexNode field : oldProject.getProjects()) { + if (!(field instanceof RexInputRef)) { + return null; + } + int idx = ((RexInputRef)field).getIndex(); + if (idx > newJoinFiledList.size()) { + LOG.debug(" Project filed " + ((RexInputRef) field).getName() + + " is from right side of join. Can not convert to anti join."); + return null; + } + + final RexInputRef ref = newJoin.getCluster().getRexBuilder() + .makeInputRef(field.getType(), idx); + newProjectExpr.add(ref); + } + return oldProject.copy(oldProject.getTraitSet(), newJoin, newProjectExpr, oldProject.getRowType()); + } + + private boolean isFilterFromRightSide(RelNode joinRel, RexNode filter, JoinRelType joinType) { + List joinFields = joinRel.getRowType().getFieldList(); + int nTotalFields = joinFields.size(); + + List leftFields = (joinRel.getInputs().get(0)).getRowType().getFieldList(); + int nFieldsLeft = leftFields.size(); + List rightFields = (joinRel.getInputs().get(1)).getRowType().getFieldList(); + int nFieldsRight = rightFields.size(); + assert nTotalFields == (!joinType.projectsRight() ? 
nFieldsLeft : nFieldsLeft + nFieldsRight); + + ImmutableBitSet rightBitmap = ImmutableBitSet.range(nFieldsLeft, nTotalFields); + RelOptUtil.InputFinder inputFinder = RelOptUtil.InputFinder.analyze(filter); + ImmutableBitSet inputBits = inputFinder.inputBitSet.build(); + return rightBitmap.contains(inputBits); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveProjectJoinTransposeRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveProjectJoinTransposeRule.java index 545255cf7c..3fd76f5048 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveProjectJoinTransposeRule.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveProjectJoinTransposeRule.java @@ -23,6 +23,7 @@ import org.apache.calcite.plan.RelOptRuleCall; import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.core.Join; +import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rel.core.Project; import org.apache.calcite.rel.rules.PushProjector; import org.apache.calcite.rel.type.RelDataTypeField; @@ -76,7 +77,7 @@ public void onMatch(RelOptRuleCall call) { Project origProj = call.rel(0); final Join join = call.rel(1); - if (join.isSemiJoin()) { + if (join.getJoinType() == JoinRelType.SEMI || join.getJoinType() == JoinRelType.ANTI) { return; // TODO: support SemiJoin } // locate all fields referenced in the projection and join condition; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelDecorrelator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelDecorrelator.java index ab56ce8f8f..f2df8f9cb4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelDecorrelator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelDecorrelator.java @@ -98,6 +98,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelOptUtil; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelShuttleImpl; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAntiJoin; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveIntersect; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; @@ -1239,9 +1240,13 @@ public Frame decorrelateRel(HiveFilter rel) throws SemanticException { valueGenerator = false; } - if(oldInput instanceof LogicalCorrelate - && ((LogicalCorrelate) oldInput).getJoinType() == JoinRelType.SEMI - && !cm.mapRefRelToCorRef.containsKey(rel)) { + boolean isSemiJoin = false; + if (oldInput instanceof LogicalCorrelate) { + isSemiJoin = ((LogicalCorrelate) oldInput).getJoinType() == JoinRelType.SEMI || + ((LogicalCorrelate) oldInput).getJoinType() == JoinRelType.ANTI; + } + + if(isSemiJoin && !cm.mapRefRelToCorRef.containsKey(rel)) { // this conditions need to be pushed into semi-join since this condition // corresponds to IN HiveSemiJoin join = ((HiveSemiJoin)frame.r); @@ -1252,8 +1257,14 @@ public Frame decorrelateRel(HiveFilter rel) throws SemanticException { final RexNode condition = RexUtil.composeConjunction(rexBuilder, conditions, false); - RelNode newRel = HiveSemiJoin.getSemiJoin(frame.r.getCluster(), frame.r.getTraitSet(), - join.getLeft(), join.getRight(), condition); + RelNode newRel; + if (((LogicalCorrelate) oldInput).getJoinType() == JoinRelType.SEMI) { + newRel = 
HiveSemiJoin.getSemiJoin(frame.r.getCluster(), frame.r.getTraitSet(), + join.getLeft(), join.getRight(), condition); + } else { + newRel = HiveAntiJoin.getAntiJoin(frame.r.getCluster(), frame.r.getTraitSet(), + join.getLeft(), join.getRight(), condition); + } return register(rel, newRel, frame.oldToNewOutputs, frame.corDefOutputs); } @@ -1311,9 +1322,13 @@ public Frame decorrelateRel(Filter rel) { valueGenerator = false; } - if(oldInput instanceof LogicalCorrelate - && ((LogicalCorrelate) oldInput).getJoinType() == JoinRelType.SEMI - && !cm.mapRefRelToCorRef.containsKey(rel)) { + boolean isSemiJoin = false; + if (oldInput instanceof LogicalCorrelate) { + isSemiJoin = ((LogicalCorrelate) oldInput).getJoinType() == JoinRelType.SEMI || + ((LogicalCorrelate) oldInput).getJoinType() == JoinRelType.ANTI; + } + + if(isSemiJoin && !cm.mapRefRelToCorRef.containsKey(rel)) { // this conditions need to be pushed into semi-join since this condition // corresponds to IN HiveSemiJoin join = ((HiveSemiJoin)frame.r); @@ -1323,8 +1338,15 @@ public Frame decorrelateRel(Filter rel) { conditions.add(decorrelateExpr(rel.getCondition(), valueGenerator)); final RexNode condition = RexUtil.composeConjunction(rexBuilder, conditions, false); - RelNode newRel = HiveSemiJoin.getSemiJoin(frame.r.getCluster(), frame.r.getTraitSet(), - join.getLeft(), join.getRight(), condition); + + RelNode newRel; + if (((LogicalCorrelate) oldInput).getJoinType() == JoinRelType.SEMI) { + newRel = HiveSemiJoin.getSemiJoin(frame.r.getCluster(), frame.r.getTraitSet(), + join.getLeft(), join.getRight(), condition); + } else { + newRel = HiveAntiJoin.getAntiJoin(frame.r.getCluster(), frame.r.getTraitSet(), + join.getLeft(), join.getRight(), condition); + } return register(rel, newRel, frame.oldToNewOutputs, frame.corDefOutputs); } @@ -1447,14 +1469,18 @@ public Frame decorrelateRel(LogicalCorrelate rel) { RelNode newJoin = null; // this indicates original query was either correlated EXISTS or IN - if(rel.getJoinType() == JoinRelType.SEMI) { + if(rel.getJoinType() == JoinRelType.SEMI || rel.getJoinType() == JoinRelType.ANTI) { final List leftKeys = new ArrayList(); final List rightKeys = new ArrayList(); RelNode[] inputRels = new RelNode[] {leftFrame.r, rightFrame.r}; - newJoin = HiveSemiJoin.getSemiJoin(rel.getCluster(), - rel.getCluster().traitSetOf(HiveRelNode.CONVENTION), leftFrame.r, rightFrame.r, condition); - + if (rel.getJoinType() == JoinRelType.ANTI) { + newJoin = HiveAntiJoin.getAntiJoin(rel.getCluster(), + rel.getCluster().traitSetOf(HiveRelNode.CONVENTION), leftFrame.r, rightFrame.r, condition); + } else { + newJoin = HiveSemiJoin.getSemiJoin(rel.getCluster(), + rel.getCluster().traitSetOf(HiveRelNode.CONVENTION), leftFrame.r, rightFrame.r, condition); + } } else { // Right input positions are shifted by newLeftFieldCount. for (int i = 0; i < oldRightFieldCount; i++) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdPredicates.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdPredicates.java index 0527e2b7f9..80a9abc5b5 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdPredicates.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdPredicates.java @@ -416,6 +416,7 @@ public RelOptPredicateList inferPredicates( case INNER: case LEFT: case SEMI: + case ANTI: infer(leftPreds, allExprsDigests, inferredPredicates, nonFieldsPredicates, includeEqualityInference, joinType == JoinRelType.LEFT ? 
rightFieldsBitSet @@ -426,6 +427,7 @@ public RelOptPredicateList inferPredicates( case INNER: case RIGHT: case SEMI: + case ANTI: infer(rightPreds, allExprsDigests, inferredPredicates, nonFieldsPredicates, includeEqualityInference, joinType == JoinRelType.RIGHT ? leftFieldsBitSet @@ -454,7 +456,8 @@ public RelOptPredicateList inferPredicates( } } - if ((joinType == JoinRelType.INNER || joinType == JoinRelType.SEMI) && !nonFieldsPredicates.isEmpty()) { + if ((joinType == JoinRelType.INNER || joinType == JoinRelType.SEMI || joinType == JoinRelType.ANTI) && + !nonFieldsPredicates.isEmpty()) { // Predicates without field references can be pushed to both inputs final Set leftPredsSet = new HashSet( Lists.transform(leftPreds, HiveCalciteUtil.REX_STR_FN)); @@ -477,6 +480,7 @@ public RelOptPredicateList inferPredicates( return RelOptPredicateList.of(rexBuilder, pulledUpPredicates, leftInferredPredicates, rightInferredPredicates); case SEMI: + case ANTI: return RelOptPredicateList.of(rexBuilder, Iterables.concat(leftPreds, leftInferredPredicates), leftInferredPredicates, rightInferredPredicates); case LEFT: diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java index 82e4cc1036..ed70c03569 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java @@ -248,7 +248,7 @@ public static PKFKRelationInfo analyzeJoinForPKFK(Join joinRel, RelMetadataQuery // @todo: remove this. 8/28/14 hb // for now adding because RelOptUtil.classifyFilters has an assertion about // column counts that is not true for semiJoins. 
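+    // Presumably for the same reason, anti joins are skipped below as well: like a semi join,
+    // an anti join only projects the left-side columns, so the column-count assumption in
+    // RelOptUtil.classifyFilters does not hold for it either.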
- if (joinRel.isSemiJoin()) { + if (joinRel.isSemiJoin() || joinRel.getJoinType() == JoinRelType.ANTI) { return null; } @@ -355,7 +355,7 @@ public static PKFKRelationInfo analyzeJoinForPKFK(Join joinRel, RelMetadataQuery */ public static Pair constraintsBasedAnalyzeJoinForPKFK(Join join, RelMetadataQuery mq) { - if (join.isSemiJoin()) { + if (join.isSemiJoin() || join.getJoinType() == JoinRelType.ANTI) { // TODO: Support semijoin return null; } @@ -390,9 +390,9 @@ public static PKFKRelationInfo analyzeJoinForPKFK(Join joinRel, RelMetadataQuery return null; } - boolean leftIsKey = (join.getJoinType() == JoinRelType.INNER || join.isSemiJoin() || join.getJoinType() == JoinRelType.RIGHT) + boolean leftIsKey = (join.getJoinType() == JoinRelType.INNER || join.isSemiJoin() || join.getJoinType() == JoinRelType.ANTI || join.getJoinType() == JoinRelType.RIGHT) && leftInputResult.isPkFkJoin; - boolean rightIsKey = (join.getJoinType() == JoinRelType.INNER || join.isSemiJoin() || join.getJoinType() == JoinRelType.LEFT) + boolean rightIsKey = (join.getJoinType() == JoinRelType.INNER || join.isSemiJoin() || join.getJoinType() == JoinRelType.ANTI || join.getJoinType() == JoinRelType.LEFT) && rightInputResult.isPkFkJoin; if (!leftIsKey && !rightIsKey) { // Nothing to do here, bail out diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSelectivity.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSelectivity.java index 1724ab1d21..4887708afe 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSelectivity.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSelectivity.java @@ -64,7 +64,7 @@ public Double getSelectivity(HiveTableScan t, RelMetadataQuery mq, RexNode predi } public Double getSelectivity(Join j, RelMetadataQuery mq, RexNode predicate) { - if (j.getJoinType().equals(JoinRelType.INNER) || j.isSemiJoin()) { + if (j.getJoinType().equals(JoinRelType.INNER) || j.isSemiJoin() || j.getJoinType().equals(JoinRelType.ANTI)) { return computeInnerJoinSelectivity(j, mq, predicate); } else if (j.getJoinType().equals(JoinRelType.LEFT) || j.getJoinType().equals(JoinRelType.RIGHT)) { @@ -142,7 +142,7 @@ private Double computeInnerJoinSelectivity(Join j, RelMetadataQuery mq, RexNode ndvEstimate = exponentialBackoff(peLst, colStatMap); } - if (j.isSemiJoin()) { + if (j.isSemiJoin() || (j instanceof HiveJoin && j.getJoinType().equals(JoinRelType.ANTI))) { ndvEstimate = Math.min(mq.getRowCount(j.getLeft()), ndvEstimate); } else if (j instanceof HiveJoin) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTBuilder.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTBuilder.java index f714f0728a..20244fd089 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTBuilder.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTBuilder.java @@ -174,6 +174,9 @@ public static ASTNode join(ASTNode left, ASTNode right, JoinRelType joinType, AS case FULL: b = ASTBuilder.construct(HiveParser.TOK_FULLOUTERJOIN, "TOK_FULLOUTERJOIN"); break; + case ANTI: + b = ASTBuilder.construct(HiveParser.TOK_ANTIJOIN, "TOK_ANTIJOIN"); + break; } b.add(left).add(right).add(cond); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java index 14dbb5f810..60a40415f1 100644 --- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java @@ -402,7 +402,7 @@ private QueryBlockInfo convertSource(RelNode r) throws CalciteSemanticException QueryBlockInfo right = convertSource(join.getRight()); s = new Schema(left.schema, right.schema); ASTNode cond = join.getCondition().accept(new RexVisitor(s, false, r.getCluster().getRexBuilder())); - boolean semiJoin = join.isSemiJoin(); + boolean semiJoin = join.isSemiJoin() || join.getJoinType() == JoinRelType.ANTI; if (join.getRight() instanceof Join && !semiJoin) { // should not be done for semijoin since it will change the semantics // Invert join inputs; this is done because otherwise the SemanticAnalyzer diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveOpConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveOpConverter.java index bc2b74281d..b19dda9bbc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveOpConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveOpConverter.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAntiJoin; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveMultiJoin; @@ -99,6 +100,8 @@ OpAttr dispatch(RelNode rn) throws SemanticException { return new JoinVisitor(this).visit((HiveJoin) rn); } else if (rn instanceof HiveSemiJoin) { return new JoinVisitor(this).visit((HiveSemiJoin) rn); + } else if (rn instanceof HiveAntiJoin) { + return new JoinVisitor(this).visit((HiveAntiJoin) rn); } else if (rn instanceof HiveFilter) { return new HiveFilterVisitor(this).visit((HiveFilter) rn); } else if (rn instanceof HiveSortLimit) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/JoinVisitor.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/JoinVisitor.java index 8d9d5aedb5..d82d50114d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/JoinVisitor.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/JoinVisitor.java @@ -87,7 +87,8 @@ OpAttr visit(RelNode joinRel) throws SemanticException { // 3. 
Virtual columns Set newVcolsInCalcite = new HashSet(); newVcolsInCalcite.addAll(inputs[0].vcolsInCalcite); - if (joinRel instanceof HiveMultiJoin || !((joinRel instanceof Join) && ((Join) joinRel).isSemiJoin())) { + if (joinRel instanceof HiveMultiJoin || !((joinRel instanceof Join) && + ((((Join) joinRel).isSemiJoin()) || (((Join) joinRel).getJoinType() == JoinRelType.ANTI)))) { int shift = inputs[0].inputs.get(0).getSchema().getSignature().size(); for (int i = 1; i < inputs.length; i++) { newVcolsInCalcite.addAll(HiveCalciteUtil.shiftVColsSet(inputs[i].vcolsInCalcite, shift)); @@ -159,12 +160,24 @@ private JoinOperator genJoin(RelNode join, ExprNodeDesc[][] joinExpressions, noOuterJoin = !hmj.isOuterJoin(); } else { joinCondns = new JoinCondDesc[1]; - semiJoin = (join instanceof Join) && ((Join) join).isSemiJoin(); + JoinRelType joinRelType = JoinRelType.INNER; + if (join instanceof Join) { + joinRelType = ((Join) join).getJoinType(); + } JoinType joinType; - if (semiJoin) { - joinType = JoinType.LEFTSEMI; - } else { - joinType = transformJoinType(((Join)join).getJoinType()); + switch (joinRelType) { + case SEMI: + joinType = JoinType.LEFTSEMI; + semiJoin = true; + break; + case ANTI: + joinType = JoinType.ANTI; + semiJoin = true; + break; + default: + assert join instanceof Join; + joinType = transformJoinType(((Join)join).getJoinType()); + semiJoin = false; } joinCondns[0] = new JoinCondDesc(new JoinCond(0, 1, joinType)); noOuterJoin = joinType != JoinType.FULLOUTER && joinType != JoinType.LEFTOUTER diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java index c33f39d937..2c35b9dfbb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java @@ -266,7 +266,8 @@ private void analyzeReduceSinkOperatorsOfJoinOperator(JoinCondDesc[] joinConds, if (pos == joinCond.getLeft()) { if (type == JoinDesc.INNER_JOIN || type == JoinDesc.LEFT_OUTER_JOIN || - type == JoinDesc.LEFT_SEMI_JOIN) { + type == JoinDesc.LEFT_SEMI_JOIN || + type == JoinDesc.ANTI_JOIN) { Operator newCurrentRsOps = rsOps.get(joinCond.getRight()); analyzeReduceSinkOperatorsOfJoinOperator(joinConds, rsOps, newCurrentRsOps, correlatedRsOps); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index a690cd794b..9714314d49 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -45,6 +45,9 @@ import org.apache.hadoop.hive.ql.exec.vector.expressions.ConvertDecimal64ToDecimal; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorCoalesce; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.DecimalColDivideDecimalScalar; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinAntiJoinLongOperator; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinAntiJoinMultiKeyOperator; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinAntiJoinStringOperator; import org.apache.hadoop.hive.ql.exec.vector.reducesink.*; import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFArgDesc; import org.apache.hadoop.hive.ql.io.AcidUtils; @@ -3416,6 +3419,10 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { 
vectorMapJoinVariation = VectorMapJoinVariation.LEFT_SEMI; hashTableKind = HashTableKind.HASH_SET; break; + case JoinDesc.ANTI_JOIN: + vectorMapJoinVariation = VectorMapJoinVariation.ANTI; + hashTableKind = HashTableKind.HASH_SET; + break; default: throw new HiveException("Unknown join type " + joinType); } @@ -3438,6 +3445,9 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { case LEFT_SEMI: opClass = VectorMapJoinLeftSemiLongOperator.class; break; + case ANTI: + opClass = VectorMapJoinAntiJoinLongOperator.class; + break; case OUTER: opClass = VectorMapJoinOuterLongOperator.class; break; @@ -3459,6 +3469,9 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { case LEFT_SEMI: opClass = VectorMapJoinLeftSemiStringOperator.class; break; + case ANTI: + opClass = VectorMapJoinAntiJoinStringOperator.class; + break; case OUTER: opClass = VectorMapJoinOuterStringOperator.class; break; @@ -3480,6 +3493,9 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { case LEFT_SEMI: opClass = VectorMapJoinLeftSemiMultiKeyOperator.class; break; + case ANTI: + opClass = VectorMapJoinAntiJoinMultiKeyOperator.class; + break; case OUTER: opClass = VectorMapJoinOuterMultiKeyOperator.class; break; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index 4318c4b340..fc190066a2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -2514,6 +2514,7 @@ private void updateNumNulls(ColStatistics colStats, long leftUnmatchedRows, long case JoinDesc.INNER_JOIN: case JoinDesc.UNIQUE_JOIN: case JoinDesc.LEFT_SEMI_JOIN: + case JoinDesc.ANTI_JOIN: break; } colStats.setNumNulls(newNumNulls); @@ -2603,6 +2604,7 @@ private long computeFinalRowCount(List rowCountParents, long interimRowCou rowCountParents.get(joinCond.getLeft())), result); break; case JoinDesc.LEFT_SEMI_JOIN: + case JoinDesc.ANTI_JOIN: // max # of rows = rows from left side result = Math.min(rowCountParents.get(joinCond.getLeft()), result); break; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 335e25644a..f49247280a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -223,6 +223,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinProjectTransposeRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinPushTransitivePredicatesRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinToMultiJoinRule; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinWithFilterToAntiJoinRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePartitionPruneRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePointLookupOptimizerRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePreFilteringRule; @@ -1901,6 +1902,11 @@ public RelNode apply(RelOptCluster cluster, RelOptSchema relOptSchema, SchemaPlu calcitePreCboPlan = applyPreJoinOrderingTransforms(calciteGenPlan, mdProvider.getMetadataProvider(), executorProvider); + if (conf.getBoolVar(ConfVars.HIVE_CONVERT_ANTI_JOIN)) { + calcitePreCboPlan = hepPlan(calcitePreCboPlan, false, mdProvider.getMetadataProvider(), + null, 
HepMatchOrder.DEPTH_FIRST, HiveJoinWithFilterToAntiJoinRule.INSTANCE); + } + // 3. Materialized view based rewriting // We disable it for CTAS and MV creation queries (trying to avoid any problem // due to data freshness) @@ -1934,7 +1940,6 @@ public RelNode apply(RelOptCluster cluster, RelOptSchema relOptSchema, SchemaPlu LOG.debug("Plan After Join Reordering:\n" + RelOptUtil.toString(calciteOptimizedPlan, SqlExplainLevel.ALL_ATTRIBUTES)); } - return calciteOptimizedPlan; } @@ -2840,6 +2845,10 @@ private RelNode genJoinRelNode(RelNode leftRel, String leftTableAlias, RelNode r calciteJoinType = JoinRelType.SEMI; leftSemiJoin = true; break; + case ANTI: + calciteJoinType = JoinRelType.ANTI; + leftSemiJoin = true; + break; case INNER: default: calciteJoinType = JoinRelType.INNER; @@ -2978,6 +2987,9 @@ private RelNode genJoinLogicalPlan(ASTNode joinParseTree, Map a case HiveParser.TOK_LEFTSEMIJOIN: hiveJoinType = JoinType.LEFTSEMI; break; + case HiveParser.TOK_ANTIJOIN: + hiveJoinType = JoinType.ANTI; + break; default: hiveJoinType = JoinType.INNER; break; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/JoinType.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/JoinType.java index fd8bb53851..18e0de236e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/JoinType.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/JoinType.java @@ -23,5 +23,5 @@ * */ public enum JoinType { - INNER, LEFTOUTER, RIGHTOUTER, FULLOUTER, UNIQUE, LEFTSEMI + INNER, LEFTOUTER, RIGHTOUTER, FULLOUTER, UNIQUE, LEFTSEMI, ANTI }; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index de746a8d11..15f0cca3c6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -1469,6 +1469,7 @@ static boolean isJoinToken(ASTNode node) { || (node.getToken().getType() == HiveParser.TOK_CROSSJOIN) || isOuterJoinToken(node) || (node.getToken().getType() == HiveParser.TOK_LEFTSEMIJOIN) + || (node.getToken().getType() == HiveParser.TOK_ANTIJOIN) || (node.getToken().getType() == HiveParser.TOK_UNIQUEJOIN); } @@ -9760,6 +9761,10 @@ private QBJoinTree genSQJoinTree(QB qb, ISubQueryJoinInfo subQuery, joinTree.setNoSemiJoin(false); condn[0] = new JoinCond(0, 1, JoinType.LEFTSEMI); break; + case ANTI: + joinTree.setNoSemiJoin(false); + condn[0] = new JoinCond(0, 1, JoinType.ANTI); + break; default: condn[0] = new JoinCond(0, 1, JoinType.INNER); joinTree.setNoOuterJoin(true); @@ -9862,6 +9867,10 @@ private QBJoinTree genJoinTree(QB qb, ASTNode joinParseTree, joinTree.setNoSemiJoin(false); condn[0] = new JoinCond(0, 1, JoinType.LEFTSEMI); break; + case HiveParser.TOK_ANTIJOIN: + joinTree.setNoSemiJoin(false); + condn[0] = new JoinCond(0, 1, JoinType.ANTI); + break; default: condn[0] = new JoinCond(0, 1, JoinType.INNER); joinTree.setNoOuterJoin(true); @@ -10060,7 +10069,8 @@ private void parseStreamTables(QBJoinTree joinTree, QB qb) { for (ASTNode hintNode : hints) { for (Node node : hintNode.getChildren()) { ASTNode hint = (ASTNode) node; - if (hint.getChild(0).getType() != HintParser.TOK_LEFTSEMIJOIN) { + if (hint.getChild(0).getType() != HintParser.TOK_LEFTSEMIJOIN || + hint.getChild(0).getType() != HintParser.TOK_ANTIJOIN) { continue; } if (result == null) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinCondDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinCondDesc.java index 0eb03e98a3..75f9d81356 100644 --- 
a/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinCondDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinCondDesc.java @@ -73,6 +73,9 @@ public JoinCondDesc(org.apache.hadoop.hive.ql.parse.JoinCond condn) { case LEFTSEMI: type = JoinDesc.LEFT_SEMI_JOIN; break; + case ANTI: + type = JoinDesc.ANTI_JOIN; + break; default: assert false; } @@ -140,6 +143,9 @@ public String getJoinCondString() { case JoinDesc.LEFT_SEMI_JOIN: sb.append("Left Semi Join "); break; + case JoinDesc.ANTI_JOIN: + sb.append("Anti Join "); + break; default: sb.append("Unknown Join "); break; @@ -175,6 +181,9 @@ public String getUserLevelJoinCondString() { case JoinDesc.LEFT_SEMI_JOIN: join.put("type", "Left Semi"); break; + case JoinDesc.ANTI_JOIN: + join.put("type", "Anti"); + break; default: join.put("type", "Unknown Join"); break; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java index 523fe54325..357b1e0c28 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java @@ -48,6 +48,7 @@ public static final int FULL_OUTER_JOIN = 3; public static final int UNIQUE_JOIN = 4; public static final int LEFT_SEMI_JOIN = 5; + public static final int ANTI_JOIN = 6; // used to handle skew join private boolean handleSkewJoin = false; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java index a0ee3a936a..a2a4563636 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java @@ -89,7 +89,8 @@ public PrimitiveTypeInfo getPrimitiveTypeInfo() { INNER_BIG_ONLY, LEFT_SEMI, OUTER, - FULL_OUTER + FULL_OUTER, + ANTI } private HashTableImplementationType hashTableImplementationType; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ppd/PredicateTransitivePropagate.java b/ql/src/java/org/apache/hadoop/hive/ql/ppd/PredicateTransitivePropagate.java index 9666dd779b..e1dcb119f9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/ppd/PredicateTransitivePropagate.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/ppd/PredicateTransitivePropagate.java @@ -199,6 +199,7 @@ private boolean filterExists(ReduceSinkOperator target, ExprNodeDesc replaced) { switch (cond.getType()) { case JoinDesc.INNER_JOIN: case JoinDesc.LEFT_SEMI_JOIN: + case JoinDesc.ANTI_JOIN: vector.add(left, right); vector.add(right, left); break; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java b/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java index 7316df0020..6b2cfb4c31 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java @@ -335,6 +335,7 @@ String getFuncText(String funcText, final int srcPos) { switch (cond.getType()) { case JoinDesc.INNER_JOIN: case JoinDesc.LEFT_SEMI_JOIN: + case JoinDesc.ANTI_JOIN: vector.add(left, right); vector.add(right, left); break; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/MapJoinTestConfig.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/MapJoinTestConfig.java index a250533f55..92be2541e1 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/MapJoinTestConfig.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/MapJoinTestConfig.java @@ -308,6 +308,9 @@ public static MapJoinDesc 
createMapJoinDesc(MapJoinTestDescription testDesc, case LEFT_SEMI: joinDescType = JoinDesc.LEFT_SEMI_JOIN; break; + case ANTI: + joinDescType = JoinDesc.ANTI_JOIN; + break; case OUTER: joinDescType = JoinDesc.LEFT_OUTER_JOIN; break; diff --git a/ql/src/test/queries/clientpositive/antijoin.q b/ql/src/test/queries/clientpositive/antijoin.q new file mode 100644 index 0000000000..08884c3adb --- /dev/null +++ b/ql/src/test/queries/clientpositive/antijoin.q @@ -0,0 +1,44 @@ +--! qt:dataset:src +--! qt:dataset:part +SET hive.vectorized.execution.enabled=false; +set hive.mapred.mode=nonstrict; +SET hive.auto.convert.join=false; +-- SORT_QUERY_RESULTS + +create table t1_n55 as select cast(key as int) key, value from src where key <= 10; + +select * from t1_n55 sort by key; + +create table t2_n33 as select cast(2*key as int) key, value from t1_n55; + +select * from t2_n33 sort by key; + +create table t3_n12 as select * from (select * from t1_n55 union all select * from t2_n33) b; +select * from t3_n12 sort by key, value; + +explain select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value; +explain cbo select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value; +select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value; + +explain select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null; +explain cbo select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null; +select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null; + +explain select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value; +explain cbo select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value; +select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value; + +SET hive.vectorized.execution.enabled=true; +SET hive.auto.convert.join=true; +explain select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value; +explain cbo select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value; +select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value; + +explain select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null; +explain cbo select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null; +select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null; + +explain select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value; +explain cbo select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value; +select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value; + diff --git a/ql/src/test/results/clientpositive/antijoin.q.out b/ql/src/test/results/clientpositive/antijoin.q.out new file mode 100644 index 0000000000..d297f1800e --- 
/dev/null +++ b/ql/src/test/results/clientpositive/antijoin.q.out @@ -0,0 +1,1059 @@ +PREHOOK: query: create table t1_n55 as select cast(key as int) key, value from src where key <= 10 +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@src +PREHOOK: Output: database:default +PREHOOK: Output: default@t1_n55 +POSTHOOK: query: create table t1_n55 as select cast(key as int) key, value from src where key <= 10 +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@src +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t1_n55 +POSTHOOK: Lineage: t1_n55.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: t1_n55.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: select * from t1_n55 sort by key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +#### A masked pattern was here #### +POSTHOOK: query: select * from t1_n55 sort by key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +#### A masked pattern was here #### +0 val_0 +0 val_0 +0 val_0 +10 val_10 +2 val_2 +4 val_4 +5 val_5 +5 val_5 +5 val_5 +8 val_8 +9 val_9 +PREHOOK: query: create table t2_n33 as select cast(2*key as int) key, value from t1_n55 +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@t1_n55 +PREHOOK: Output: database:default +PREHOOK: Output: default@t2_n33 +POSTHOOK: query: create table t2_n33 as select cast(2*key as int) key, value from t1_n55 +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t2_n33 +POSTHOOK: Lineage: t2_n33.key EXPRESSION [(t1_n55)t1_n55.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: t2_n33.value SIMPLE [(t1_n55)t1_n55.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from t2_n33 sort by key +PREHOOK: type: QUERY +PREHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +POSTHOOK: query: select * from t2_n33 sort by key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +0 val_0 +0 val_0 +0 val_0 +10 val_5 +10 val_5 +10 val_5 +16 val_8 +18 val_9 +20 val_10 +4 val_2 +8 val_4 +PREHOOK: query: create table t3_n12 as select * from (select * from t1_n55 union all select * from t2_n33) b +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +PREHOOK: Output: database:default +PREHOOK: Output: default@t3_n12 +POSTHOOK: query: create table t3_n12 as select * from (select * from t1_n55 union all select * from t2_n33) b +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t3_n12 +POSTHOOK: Lineage: t3_n12.key EXPRESSION [(t1_n55)t1_n55.FieldSchema(name:key, type:int, comment:null), (t2_n33)t2_n33.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: t3_n12.value EXPRESSION [(t1_n55)t1_n55.FieldSchema(name:value, type:string, comment:null), (t2_n33)t2_n33.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from t3_n12 sort by key, value +PREHOOK: type: QUERY +PREHOOK: Input: default@t3_n12 +#### A masked pattern was here #### +POSTHOOK: query: select * from t3_n12 sort by key, value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t3_n12 +#### A masked pattern was here #### +0 val_0 +0 val_0 +0 val_0 +0 val_0 +0 val_0 +0 val_0 +10 val_10 +10 val_5 +10 val_5 +10 val_5 +16 val_8 +18 
val_9 +2 val_2 +20 val_10 +4 val_2 +4 val_4 +5 val_5 +5 val_5 +5 val_5 +8 val_4 +8 val_8 +9 val_9 +PREHOOK: query: explain select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +POSTHOOK: query: explain select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-3 + Map Reduce + Map Operator Tree: + TableScan + alias: b + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: key (type: int) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: int) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + TableScan + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Join Operator + condition map: + Anti Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3 Data size: 279 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: string) + null sort order: zz + sort order: ++ + Statistics: Num rows: 3 Data size: 279 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3 Data size: 279 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 279 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain cbo select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +CBO PLAN: +HiveSortExchange(distribution=[any], collation=[[0, 1]]) + HiveProject(key=[$0], value=[$1]) + HiveJoin(condition=[=($0, $3)], joinType=[anti], algorithm=[none], cost=[not available]) + HiveProject(key=[$0], value=[$1]) + HiveTableScan(table=[[default, t1_n55]], table:alias=[a]) + HiveProject(literalTrue=[true], key=[$0]) + HiveAggregate(group=[{0}]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, t2_n33]], table:alias=[b]) + +PREHOOK: query: select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +POSTHOOK: query: select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +2 val_2 +5 val_5 +5 val_5 +5 val_5 +9 val_9 +PREHOOK: query: explain select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +POSTHOOK: query: explain select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE 
Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + TableScan + alias: b + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: int) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Join Operator + condition map: + Anti Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 7 Data size: 651 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 7 Data size: 651 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain cbo select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +CBO PLAN: +HiveJoin(condition=[=($0, $2)], joinType=[anti], algorithm=[none], cost=[not available]) + HiveProject(key=[$0], value=[$1]) + HiveTableScan(table=[[default, t1_n55]], table:alias=[a]) + HiveProject(key=[$0]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, t2_n33]], table:alias=[b]) + +PREHOOK: query: select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +POSTHOOK: query: select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +2 val_2 +5 val_5 +5 val_5 +5 val_5 +9 val_9 +PREHOOK: query: explain select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +PREHOOK: Input: default@t3_n12 +#### A masked pattern was here #### +POSTHOOK: query: explain select a.key, 
a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +POSTHOOK: Input: default@t3_n12 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-3 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-3 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: a + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + TableScan + alias: b + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: int) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Join Operator + condition map: + Anti Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 7 Data size: 651 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 7 Data size: 651 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + TableScan + alias: c + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: 
_col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 15 Data size: 1395 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-3 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: string) + null sort order: zz + sort order: ++ + Statistics: Num rows: 15 Data size: 1395 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 15 Data size: 1395 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 15 Data size: 1395 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain cbo select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +PREHOOK: Input: default@t3_n12 +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +POSTHOOK: Input: default@t3_n12 +#### A masked pattern was here #### +CBO PLAN: +HiveSortExchange(distribution=[any], collation=[[0, 1]]) + HiveProject(key=[$1], value=[$2]) + HiveJoin(condition=[=($1, $0)], joinType=[inner], algorithm=[none], cost=[not available]) + HiveProject(key=[$0]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, t3_n12]], table:alias=[c]) + HiveJoin(condition=[=($0, $2)], joinType=[anti], algorithm=[none], cost=[not available]) + HiveProject(key=[$0], value=[$1]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, t1_n55]], table:alias=[a]) + HiveProject(key=[$0]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, t2_n33]], table:alias=[b]) + +PREHOOK: query: select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +PREHOOK: Input: default@t3_n12 +#### A masked pattern was here #### +POSTHOOK: query: select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +POSTHOOK: 
Input: default@t3_n12 +#### A masked pattern was here #### +2 val_2 +5 val_5 +5 val_5 +5 val_5 +5 val_5 +5 val_5 +5 val_5 +5 val_5 +5 val_5 +5 val_5 +9 val_9 +PREHOOK: query: explain select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +POSTHOOK: query: explain select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-3 is a root stage + Stage-5 depends on stages: Stage-3 , consists of Stage-6, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-6 + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-3 + Map Reduce + Map Operator Tree: + TableScan + alias: b + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: key (type: int) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: int) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + $INTNAME + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + $INTNAME + TableScan + HashTable Sink Operator + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + + Stage: Stage-4 + Map Reduce + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Anti Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3 Data size: 279 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + Execution mode: vectorized + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: string) + null sort order: zz + sort order: ++ + Statistics: Num rows: 3 Data size: 279 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3 Data size: 279 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 279 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + TableScan + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Join Operator + condition map: + Anti Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3 Data size: 279 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain cbo select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +CBO PLAN: +HiveSortExchange(distribution=[any], collation=[[0, 1]]) + HiveProject(key=[$0], value=[$1]) + HiveJoin(condition=[=($0, $3)], joinType=[anti], algorithm=[none], cost=[not available]) + HiveProject(key=[$0], value=[$1]) + HiveTableScan(table=[[default, t1_n55]], table:alias=[a]) + HiveProject(literalTrue=[true], key=[$0]) + HiveAggregate(group=[{0}]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, t2_n33]], table:alias=[b]) + +PREHOOK: query: select a.key, a.value from t1_n55 a where not 
exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +POSTHOOK: query: select a.key, a.value from t1_n55 a where not exists (select 1 from t2_n33 b where a.key=b.key) sort by a.key, a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +PREHOOK: query: explain select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +POSTHOOK: query: explain select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-4 is a root stage + Stage-3 depends on stages: Stage-4 + Stage-0 depends on stages: Stage-3 + +STAGE PLANS: + Stage: Stage-4 + Map Reduce Local Work + Alias -> Map Local Tables: + $hdt$_1:b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + $hdt$_1:b + TableScan + alias: b + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: int) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + HashTable Sink Operator + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + + Stage: Stage-3 + Map Reduce + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Anti Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 7 Data size: 651 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 7 Data size: 651 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain cbo select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +CBO PLAN: 
+HiveJoin(condition=[=($0, $2)], joinType=[anti], algorithm=[none], cost=[not available]) + HiveProject(key=[$0], value=[$1]) + HiveTableScan(table=[[default, t1_n55]], table:alias=[a]) + HiveProject(key=[$0]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, t2_n33]], table:alias=[b]) + +PREHOOK: query: select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +POSTHOOK: query: select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key where b.key is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +#### A masked pattern was here #### +PREHOOK: query: explain select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +PREHOOK: Input: default@t3_n12 +#### A masked pattern was here #### +POSTHOOK: query: explain select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +POSTHOOK: Input: default@t3_n12 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-3 depends on stages: Stage-8 + Stage-0 depends on stages: Stage-3 + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + $hdt$_0:c + Fetch Operator + limit: -1 + $hdt$_2:b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + $hdt$_0:c + TableScan + alias: c + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + HashTable Sink Operator + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + $hdt$_2:b + TableScan + alias: b + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: int) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + HashTable Sink Operator + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + + Stage: Stage-3 + Map Reduce + Map Operator Tree: + TableScan + alias: a + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 
11 Data size: 1023 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Anti Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 7 Data size: 651 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 15 Data size: 1395 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: string) + null sort order: zz + sort order: ++ + Statistics: Num rows: 15 Data size: 1395 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 15 Data size: 1395 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 15 Data size: 1395 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain cbo select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +PREHOOK: Input: default@t3_n12 +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +POSTHOOK: Input: default@t3_n12 +#### A masked pattern was here #### +CBO PLAN: +HiveSortExchange(distribution=[any], collation=[[0, 1]]) + HiveProject(key=[$1], value=[$2]) + HiveJoin(condition=[=($1, $0)], joinType=[inner], algorithm=[none], cost=[not available]) + HiveProject(key=[$0]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, t3_n12]], table:alias=[c]) + HiveJoin(condition=[=($0, $2)], joinType=[anti], algorithm=[none], cost=[not available]) + HiveProject(key=[$0], value=[$1]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, t1_n55]], table:alias=[a]) + HiveProject(key=[$0]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, t2_n33]], table:alias=[b]) + +PREHOOK: query: select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_n55 +PREHOOK: Input: default@t2_n33 +PREHOOK: Input: default@t3_n12 +#### A masked pattern was here #### +POSTHOOK: query: select a.key, a.value from t1_n55 a left join t2_n33 b on a.key=b.key join t3_n12 c on a.key=c.key where b.key is null sort by a.key, a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_n55 +POSTHOOK: Input: default@t2_n33 +POSTHOOK: Input: default@t3_n12 +#### A masked pattern was here ####
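
For reference, a minimal q-file-style sketch that exercises the same plans shown above; this is not part of the golden output, and it assumes the new hive.auto.convert.anti.join setting introduced by this patch plus the t1_n55/t2_n33 tables created earlier in the test:

set hive.auto.convert.anti.join=true;

-- NOT EXISTS form: appears as joinType=[anti] in the CBO plan
explain cbo
select a.key, a.value from t1_n55 a
where not exists (select 1 from t2_n33 b where a.key = b.key)
sort by a.key, a.value;

-- LEFT JOIN ... IS NULL form: planned as "Anti Join 0 to 1" in the Join/Map Join Operator
explain
select a.key, a.value from t1_n55 a left join t2_n33 b on a.key = b.key
where b.key is null;

Both shapes reduce to the same anti-join operator; the map-join variants above differ only in whether the build side (the grouped t2_n33 keys) is broadcast via the HashTable Sink / Map Join path or shuffled through a common join reducer.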