diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 9fa9525..3332e3a 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -515,6 +515,7 @@ public class HiveConf extends Configuration { // It creates sub-directories in the final output, so should not be turned on in systems // where MAPREDUCE-1501 is not present HIVE_OPTIMIZE_UNION_REMOVE("hive.optimize.union.remove", false), + HIVEOPTCORRELATION("hive.optimize.correlation", false), // exploit intra-query correlations // whether hadoop map-reduce supports sub-directories. It was added by MAPREDUCE-1501. // Some optimizations can only be performed if the version of hadoop being used supports diff --git conf/hive-default.xml.template conf/hive-default.xml.template index f332f3a..5d79389 100644 --- conf/hive-default.xml.template +++ conf/hive-default.xml.template @@ -982,6 +982,12 @@ + hive.optimize.correlation + false + exploit intra-query correlations. + + + hive.exec.dynamic.partition true Whether or not to allow dynamic partitions in DML/DDL. diff --git ql/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/ql/plan/api/OperatorType.java ql/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/ql/plan/api/OperatorType.java index 7c4c413..96f6479 100644 --- ql/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/ql/plan/api/OperatorType.java +++ ql/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/ql/plan/api/OperatorType.java @@ -29,7 +29,10 @@ public enum OperatorType implements org.apache.thrift.TEnum { LATERALVIEWJOIN(14), LATERALVIEWFORWARD(15), HASHTABLESINK(16), - HASHTABLEDUMMY(17); + HASHTABLEDUMMY(17), + CORRELATIONCOMPOSITE(18), + CORRELATIONLOCALSIMULATIVEREDUCESINK(19), + CORRELATIONREDUCERDISPATCH(20); private final int value; @@ -86,6 +89,12 @@ public enum OperatorType implements org.apache.thrift.TEnum { return HASHTABLESINK; case 17: return HASHTABLEDUMMY; + case 18: + return CORRELATIONCOMPOSITE; + case 19: + return CORRELATIONLOCALSIMULATIVEREDUCESINK; + case 20: + return CORRELATIONREDUCERDISPATCH; default: return null; } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/BaseReduceSinkOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/BaseReduceSinkOperator.java new file mode 100644 index 0000000..0443545 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/BaseReduceSinkOperator.java @@ -0,0 +1,200 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.io.HiveKey; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.BaseReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.apache.hadoop.hive.serde2.Serializer; +import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +/** + * BaseReduceSinkOperator + **/ +public abstract class BaseReduceSinkOperator extends + TerminalOperator implements Serializable { + + private static final long serialVersionUID = 1L; + protected static final Log LOG = LogFactory.getLog(BaseReduceSinkOperator.class + .getName()); + + /** + * The evaluators for the key columns. Key columns decide the sort order on + * the reducer side. Key columns are passed to the reducer in the "key". + */ + protected transient ExprNodeEvaluator[] keyEval; + /** + * The evaluators for the value columns. Value columns are passed to reducer + * in the "value". + */ + protected transient ExprNodeEvaluator[] valueEval; + /** + * The evaluators for the partition columns (CLUSTER BY or DISTRIBUTE BY in + * Hive language). Partition columns decide the reducer that the current row + * goes to. Partition columns are not passed to reducer. 
+ */ + protected transient ExprNodeEvaluator[] partitionEval; + + // TODO: we use MetadataTypedColumnsetSerDe for now, till DynamicSerDe is + // ready + protected transient Serializer keySerializer; + protected transient boolean keyIsText; + protected transient Serializer valueSerializer; + protected transient int tag; + protected transient byte[] tagByte = new byte[1]; + protected transient int numDistributionKeys; + protected transient int numDistinctExprs; + + @Override + protected void initializeOp(Configuration hconf) throws HiveException { + + try { + keyEval = new ExprNodeEvaluator[conf.getKeyCols().size()]; + int i = 0; + for (ExprNodeDesc e : conf.getKeyCols()) { + keyEval[i++] = ExprNodeEvaluatorFactory.get(e); + } + + numDistributionKeys = conf.getNumDistributionKeys(); + distinctColIndices = conf.getDistinctColumnIndices(); + numDistinctExprs = distinctColIndices.size(); + + valueEval = new ExprNodeEvaluator[conf.getValueCols().size()]; + i = 0; + for (ExprNodeDesc e : conf.getValueCols()) { + valueEval[i++] = ExprNodeEvaluatorFactory.get(e); + } + + partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()]; + i = 0; + for (ExprNodeDesc e : conf.getPartitionCols()) { + partitionEval[i++] = ExprNodeEvaluatorFactory.get(e); + } + + tag = conf.getTag(); + tagByte[0] = (byte) tag; + LOG.info("Using tag = " + tag); + + TableDesc keyTableDesc = conf.getKeySerializeInfo(); + keySerializer = (Serializer) keyTableDesc.getDeserializerClass() + .newInstance(); + keySerializer.initialize(null, keyTableDesc.getProperties()); + keyIsText = keySerializer.getSerializedClass().equals(Text.class); + + TableDesc valueTableDesc = conf.getValueSerializeInfo(); + valueSerializer = (Serializer) valueTableDesc.getDeserializerClass() + .newInstance(); + valueSerializer.initialize(null, valueTableDesc.getProperties()); + + isFirstRow = true; + initializeChildren(hconf); + } catch (Exception e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + } + + protected transient InspectableObject tempInspectableObject = new InspectableObject(); + protected transient HiveKey keyWritable = new HiveKey(); + protected transient Writable value; + + protected transient StructObjectInspector keyObjectInspector; + protected transient StructObjectInspector valueObjectInspector; + protected transient ObjectInspector[] partitionObjectInspectors; + + protected transient Object[][] cachedKeys; + protected transient Object[] cachedValues; + protected transient List> distinctColIndices; + + protected boolean isFirstRow; + + protected transient Random random; + + /** + * Initializes array of ExprNodeEvaluator. Adds Union field for distinct + * column indices for group by. + * Puts the return values into a StructObjectInspector with output column + * names. + * + * If distinctColIndices is empty, the object inspector is same as + * {@link Operator#initEvaluatorsAndReturnStruct(ExprNodeEvaluator[], List, ObjectInspector)} + */ + protected static StructObjectInspector initEvaluatorsAndReturnStruct( + ExprNodeEvaluator[] evals, List> distinctColIndices, + List outputColNames, + int length, ObjectInspector rowInspector) + throws HiveException { + int inspectorLen = evals.length > length ? 
length + 1 : evals.length; + List sois = new ArrayList(inspectorLen); + + // keys + ObjectInspector[] fieldObjectInspectors = initEvaluators(evals, 0, length, rowInspector); + sois.addAll(Arrays.asList(fieldObjectInspectors)); + + if (evals.length > length) { + // union keys + List uois = new ArrayList(); + for (List distinctCols : distinctColIndices) { + List names = new ArrayList(); + List eois = new ArrayList(); + int numExprs = 0; + for (int i : distinctCols) { + names.add(HiveConf.getColumnInternalName(numExprs)); + eois.add(evals[i].initialize(rowInspector)); + numExprs++; + } + uois.add(ObjectInspectorFactory.getStandardStructObjectInspector(names, eois)); + } + UnionObjectInspector uoi = + ObjectInspectorFactory.getStandardUnionObjectInspector(uois); + sois.add(uoi); + } + return ObjectInspectorFactory.getStandardStructObjectInspector(outputColNames, sois); + } + + @Override + public abstract void processOp(Object row, int tag) throws HiveException; + + /** + * @return the name of the operator + */ + @Override + public String getName() { + return "BaseReduceSink"; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/CorrelationCompositeOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/CorrelationCompositeOperator.java new file mode 100644 index 0000000..dfe3119 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/CorrelationCompositeOperator.java @@ -0,0 +1,163 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.CorrelationCompositeDesc; +import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; +import org.apache.hadoop.io.LongWritable; + +/** + * Correlation composite operator implementation. This operator is used only in map phase for + * sharing table scan. Suppose that there are multiple operation paths (e.g. two different + * predicates on a table ) that share a common table. A row will be processed by these operation + * paths. To tag which operation paths actually forward this row, CorrelationCompositeOperator is + * used. For a row, this operator will buffer forwarded rows from its parents and then tag this row + * with a operation path tag indicating which paths forwarded this row. 
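+ * For example, if only the parents at positions 0 and 2 forward a row, the operation path tag byte will have bits 0 and 2 set (binary 00000101).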
Right now, since operation + * path tag used in ReduceSinkOperator has 1 byte, this operator can have at most 8 parents + * (operation paths). For example, suppose that the common table is T and predicates P1 and P2 will + * be used in sub-queries SQ1 and SQ2, respectively. The CorrelationCompositeOperator + * will apply P1 and P2 on the row and tag the record based on if P1 or P2 is true. + **/ +public class CorrelationCompositeOperator extends Operator implements + Serializable { + + public static enum Counter { + FORWARDED + } + + private static final long serialVersionUID = 1L; + + private ReduceSinkOperator correspondingReduceSinkOperators; + + private transient final LongWritable forwarded_count; + + private transient boolean isFirstRow; + + private int[] allOperationPathTags; + + private Object[] rowBuffer; // buffer the output from multiple parents + + public CorrelationCompositeOperator() { + super(); + forwarded_count = new LongWritable(); + } + + @Override + protected void initializeOp(Configuration hconf) throws HiveException { + isFirstRow = true; + rowBuffer = new Object[parentOperators.size()]; + correspondingReduceSinkOperators = conf.getCorrespondingReduceSinkOperator(); + allOperationPathTags = conf.getAllOperationPathTags(); + statsMap.put(Counter.FORWARDED, forwarded_count); + outputObjInspector = + ObjectInspectorUtils.getStandardObjectInspector(outputObjInspector, + ObjectInspectorCopyOption.JAVA); + + // initialize its children + initializeChildren(hconf); + } + + @Override + public void processOp(Object row, int tag) throws HiveException { + rowBuffer[tag] = + ObjectInspectorUtils.copyToStandardObject(row, inputObjInspectors[tag], + ObjectInspectorCopyOption.JAVA); + } + + private void evaluateBuffer() throws HiveException { + List operationPathTags = new ArrayList(); + boolean isForward = false; + Object forwardedRow = null; + for (int i = 0; i < rowBuffer.length; i++) { + if (rowBuffer[i] != null) { + isForward = true; + operationPathTags.add(allOperationPathTags[i]); + if (forwardedRow == null) { + forwardedRow = rowBuffer[i]; + } + } + } + if (isForward) { + assert correspondingReduceSinkOperators != null; + correspondingReduceSinkOperators.setOperationPathTags(operationPathTags); + forwarded_count.set(forwarded_count.get() + 1); + forward(forwardedRow, null); + } + for (int i = 0; i < rowBuffer.length; i++) { + rowBuffer[i] = null; + } + } + + @Override + public void setRowNumber(long rowNumber) throws HiveException { + this.rowNumber = rowNumber; + if (childOperators == null) { + return; + } + for (int i = 0; i < childOperatorsArray.length; i++) { + assert rowNumber >= childOperatorsArray[i].getRowNumber(); + if (rowNumber != childOperatorsArray[i].getRowNumber()) { + childOperatorsArray[i].setRowNumber(rowNumber); + } + } + if (isFirstRow) { + for (int i = 0; i < rowBuffer.length; i++) { + rowBuffer[i] = null; + } + isFirstRow = false; + } else { + evaluateBuffer(); + } + } + + @Override + public void closeOp(boolean abort) throws HiveException { + if (!abort) { + evaluateBuffer(); + } + } + + /** + * @return the name of the operator + */ + @Override + public String getName() { + return getOperatorName(); + } + + static public String getOperatorName() { + return "CCO"; + } + + @Override + public OperatorType getType() { + return OperatorType.CORRELATIONCOMPOSITE; + } + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/CorrelationLocalSimulativeReduceSinkOperator.java 
ql/src/java/org/apache/hadoop/hive/ql/exec/CorrelationLocalSimulativeReduceSinkOperator.java new file mode 100644 index 0000000..da86cd9 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/CorrelationLocalSimulativeReduceSinkOperator.java @@ -0,0 +1,315 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.io.HiveKey; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.CorrelationLocalSimulativeReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.apache.hadoop.hive.serde2.Deserializer; +import org.apache.hadoop.hive.serde2.SerDe; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.Serializer; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.util.ReflectionUtils; + +/** + * CorrelationLocalSimulativeReduceSinkOperator simulates a ReduceSinkOperator and sends output to + * another operator (JOIN or GBY). CorrelationLocalSimulativeReduceSinkOperator is used only in the + * reduce phase. Basically, it is a bridge from one JOIN or GBY operator to another JOIN or GBY + * operator. A CorrelationLocalSimulativeReduceSinkOperator takes care of the startGroup and + * endGroup actions of its succeeding JOIN or GBY operator. + * Example: A query involves a JOIN operator and a GBY operator, and the GBY operator consumes the + * output of the JOIN operator. In this case, if the join keys and group by keys are the same, we do not + * need to shuffle the data again, since the data has already been partitioned by the JOIN operator.
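+ * For instance (an illustrative query; tables t1 and t2 are hypothetical), in + * SELECT t1.key, COUNT(*) FROM t1 JOIN t2 ON (t1.key = t2.key) GROUP BY t1.key + * the join key and the group by key are both t1.key, so the rows reaching the JOIN are already grouped correctly for the GBY.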
+ * Thus, in CorrelationOptimizer, the ReduceSinkOperator between JOIN and GBY operator will be + * replaced by a CorrelationLocalSimulativeReduceSinkOperator and the JOIN operator and GBY operator + * will be executed in a single reduce phase. + **/ +public class CorrelationLocalSimulativeReduceSinkOperator + extends BaseReduceSinkOperator { + + private static final long serialVersionUID = 1L; + protected static final Log LOG = LogFactory.getLog( + CorrelationLocalSimulativeReduceSinkOperator.class.getName()); + + private transient TableDesc keyTableDesc; + + private transient Deserializer inputKeyDeserializer; + + private transient SerDe inputValueDeserializer; + + private transient ByteWritable tagWritable; + + private transient ObjectInspector outputKeyObjectInspector; + private transient ObjectInspector outputValueObjectInspector; + + private List forwardedRow; + private Object keyObject; + private Object valueObject; + + private BytesWritable groupKey; + + private static String[] fieldNames; + + static { + List fieldNameArray = new ArrayList(); + for (Utilities.ReduceField r : Utilities.ReduceField.values()) { + fieldNameArray.add(r.toString()); + } + fieldNames = fieldNameArray.toArray(new String[0]); + } + + public CorrelationLocalSimulativeReduceSinkOperator() { + } + + @Override + protected void initializeOp(Configuration hconf) throws HiveException { + forwardedRow = new ArrayList(3); + tagByte = new byte[1]; + tagWritable = new ByteWritable(); + tempInspectableObject = new InspectableObject(); + keyWritable = new HiveKey(); + assert childOperatorsArray.length == 1; + try { + keyEval = new ExprNodeEvaluator[conf.getKeyCols().size()]; + int i = 0; + for (ExprNodeDesc e : conf.getKeyCols()) { + keyEval[i++] = ExprNodeEvaluatorFactory.get(e); + } + + numDistributionKeys = conf.getNumDistributionKeys(); + distinctColIndices = conf.getDistinctColumnIndices(); + numDistinctExprs = distinctColIndices.size(); + + valueEval = new ExprNodeEvaluator[conf.getValueCols().size()]; + i = 0; + for (ExprNodeDesc e : conf.getValueCols()) { + valueEval[i++] = ExprNodeEvaluatorFactory.get(e); + } + + tag = conf.getTag(); + tagByte[0] = (byte) tag; + tagWritable.set(tagByte[0]); + LOG.info("Using tag = " + tag); + + TableDesc keyTableDesc = conf.getKeySerializeInfo(); + keySerializer = (Serializer) keyTableDesc.getDeserializerClass() + .newInstance(); + keySerializer.initialize(null, keyTableDesc.getProperties()); + keyIsText = keySerializer.getSerializedClass().equals(Text.class); + + inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc + .getDeserializerClass(), null); + inputKeyDeserializer.initialize(null, keyTableDesc.getProperties()); + outputKeyObjectInspector = inputKeyDeserializer.getObjectInspector(); + + TableDesc valueTableDesc = conf.getValueSerializeInfo(); + valueSerializer = (Serializer) valueTableDesc.getDeserializerClass() + .newInstance(); + valueSerializer.initialize(null, valueTableDesc.getProperties()); + + inputValueDeserializer = (SerDe) ReflectionUtils.newInstance( + valueTableDesc.getDeserializerClass(), null); + inputValueDeserializer.initialize(null, valueTableDesc + .getProperties()); + outputValueObjectInspector = inputValueDeserializer.getObjectInspector(); + + ObjectInspector rowInspector = inputObjInspectors[0]; + + keyObjectInspector = initEvaluatorsAndReturnStruct(keyEval, + distinctColIndices, + conf.getOutputKeyColumnNames(), numDistributionKeys, rowInspector); + valueObjectInspector = initEvaluatorsAndReturnStruct(valueEval, conf + 
.getOutputValueColumnNames(), rowInspector); + int numKeys = numDistinctExprs > 0 ? numDistinctExprs : 1; + int keyLen = numDistinctExprs > 0 ? numDistributionKeys + 1 : + numDistributionKeys; + cachedKeys = new Object[numKeys][keyLen]; + cachedValues = new Object[valueEval.length]; + assert cachedKeys.length == 1; + + List ois = new ArrayList(); + ois.add(outputKeyObjectInspector); + ois.add(outputValueObjectInspector); + ois.add(PrimitiveObjectInspectorFactory.writableByteObjectInspector); + + outputObjInspector = ObjectInspectorFactory + .getStandardStructObjectInspector(Arrays.asList(fieldNames), ois); + + LOG.info("Simulative ReduceSink inputObjInspectors" + + ((StructObjectInspector) inputObjInspectors[0]).getTypeName()); + + LOG.info("Simulative ReduceSink outputObjInspectors " + + this.getChildOperators().get(0).getParentOperators().indexOf(this) + + " " + ((StructObjectInspector) outputObjInspector).getTypeName()); + + initializeChildren(hconf); + } catch (Exception e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + } + + @Override + public void processOp(Object row, int tag) throws HiveException { + try { + // Evaluate the value + for (int i = 0; i < valueEval.length; i++) { + cachedValues[i] = valueEval[i].evaluate(row); + } + // Serialize the value + value = valueSerializer.serialize(cachedValues, valueObjectInspector); + valueObject = inputValueDeserializer.deserialize(value); + + // Evaluate the keys + Object[] distributionKeys = new Object[numDistributionKeys]; + for (int i = 0; i < numDistributionKeys; i++) { + distributionKeys[i] = keyEval[i].evaluate(row); + } + + if (numDistinctExprs > 0) { + // with distinct key(s) + for (int i = 0; i < numDistinctExprs; i++) { + System.arraycopy(distributionKeys, 0, cachedKeys[i], 0, numDistributionKeys); + Object[] distinctParameters = + new Object[distinctColIndices.get(i).size()]; + for (int j = 0; j < distinctParameters.length; j++) { + distinctParameters[j] = + keyEval[distinctColIndices.get(i).get(j)].evaluate(row); + } + cachedKeys[i][numDistributionKeys] = + new StandardUnion((byte) i, distinctParameters); + } + } else { + // no distinct key + System.arraycopy(distributionKeys, 0, cachedKeys[0], 0, numDistributionKeys); + } + + for (int i = 0; i < cachedKeys.length; i++) { + if (keyIsText) { + Text key = (Text) keySerializer.serialize(cachedKeys[i], + keyObjectInspector); + keyWritable.set(key.getBytes(), 0, key.getLength()); + } else { + // Must be BytesWritable + BytesWritable key = (BytesWritable) keySerializer.serialize( + cachedKeys[i], keyObjectInspector); + keyWritable.set(key.getBytes(), 0, key.getLength()); + } + + if (!keyWritable.equals(groupKey)) { + try { + keyObject = inputKeyDeserializer.deserialize(keyWritable); + } catch (Exception e) { + throw new HiveException( + "Hive Runtime Error: Unable to deserialize reduce input key from " + + Utilities.formatBinaryString(keyWritable.get(), 0, + keyWritable.getSize()) + " with properties " + + keyTableDesc.getProperties(), e); + } + if (groupKey == null) { // the first group + groupKey = new BytesWritable(); + } else { + // if its child has not been ended, end it + if (!keyWritable.equals(childOperatorsArray[0].getBytesWritableGroupKey())) { + childOperatorsArray[0].endGroup(); + } + } + groupKey.set(keyWritable.get(), 0, keyWritable.getSize()); + if (!groupKey.equals(childOperatorsArray[0].getBytesWritableGroupKey())) { + childOperatorsArray[0].startGroup(); + childOperatorsArray[0].setGroupKeyObject(keyObject); + 
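// record the serialized group key on the child as well, so that the endGroup/startGroup checks above compare against the key of the group the child is currently processing +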
childOperatorsArray[0].setBytesWritableGroupKey(groupKey); + } + } + forwardedRow.clear(); + forwardedRow.add(keyObject); + forwardedRow.add(valueObject); + forwardedRow.add(tagWritable); + forward(forwardedRow, outputObjInspector); + } + } catch (SerDeException e) { + throw new HiveException(e); + } + } + + @Override + public void closeOp(boolean abort) throws HiveException { + if (!abort) { + Operator child = childOperatorsArray[0]; + if (child.allInitializedParentsAreClosed()) { + LOG.info("All parents of " + child.getName() + " (id: " + child.getIdentifier() + + ") has been closed. Invoke its endGroup"); + childOperatorsArray[0].endGroup(); + } + } + } + + @Override + public void startGroup() throws HiveException { + // do nothing + } + + @Override + public void endGroup() throws HiveException { + // do nothing + } + + @Override + public void setGroupKeyObject(Object keyObject) { + // do nothing + } + + /** + * @return the name of the operator + */ + @Override + public String getName() { + return getOperatorName(); + } + + static public String getOperatorName() { + return "CLSReduceSink"; + } + + @Override + public OperatorType getType() { + return OperatorType.CORRELATIONLOCALSIMULATIVEREDUCESINK; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/CorrelationReducerDispatchOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/CorrelationReducerDispatchOperator.java new file mode 100644 index 0000000..83db208 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/CorrelationReducerDispatchOperator.java @@ -0,0 +1,454 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.CorrelationReducerDispatchDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +/** + * CorrelationReducerDispatchOperator is an operator used by MapReduce jobs optimized by + * CorrelationOptimizer. If used, CorrelationReducerDispatchOperator is the first operator in the reduce + * phase. In the case that multiple operation paths are merged into a single one, it will dispatch + * the record to the corresponding JOIN or GBY operators. Every child of this operator is associated + * with a DispatchHandler, which evaluates the input row of this operator and then selects the + * corresponding fields for its associated child. + */ +public class CorrelationReducerDispatchOperator extends Operator + implements Serializable { + + private static final long serialVersionUID = 1L; + private static String[] fieldNames; + static { + List fieldNameArray = new ArrayList(); + for (Utilities.ReduceField r : Utilities.ReduceField.values()) { + fieldNameArray.add(r.toString()); + } + fieldNames = fieldNameArray.toArray(new String[0]); + } + + protected static class DispatchHandler { + + protected Log l4j = LogFactory.getLog(this.getClass().getName()); + + private final ObjectInspector[] inputObjInspector; + private ObjectInspector outputObjInspector; + private ObjectInspector keyObjInspector; + private ObjectInspector valueObjInspector; + private final byte inputTag; + private final byte outputTag; + private final byte childIndx; + private final ByteWritable outputTagByteWritable; + private final SelectDesc keySelectDesc; + private final SelectDesc valueSelectDesc; + private ExprNodeEvaluator[] keyEval; + private ExprNodeEvaluator[] eval; + + // counters for debugging + private transient long cntr = 0; + private transient long nextCntr = 1; + + private long getNextCntr(long cntr) { + // A very simple counter to keep track of number of rows processed by an + // operator.
It dumps + // every 1 million times, and quickly before that + if (cntr >= 1000000) { + return cntr + 1000000; + } + return 10 * cntr; + } + + public long getCntr() { + return this.cntr; + } + + private final Log LOG; + private final boolean isLogInfoEnabled; + private final String id; + + public DispatchHandler(ObjectInspector[] inputObjInspector, byte inputTag, byte childIndx, + byte outputTag, + SelectDesc valueSelectDesc, SelectDesc keySelectDesc, Log LOG, String id) + throws HiveException { + this.inputObjInspector = inputObjInspector; + assert this.inputObjInspector.length == 1; + this.inputTag = inputTag; + this.childIndx = childIndx; + this.outputTag = outputTag; + this.valueSelectDesc = valueSelectDesc; + this.keySelectDesc = keySelectDesc; + this.outputTagByteWritable = new ByteWritable(outputTag); + this.LOG = LOG; + this.isLogInfoEnabled = LOG.isInfoEnabled(); + this.id = id; + init(); + } + + private void init() throws HiveException { + List ois = new ArrayList(); + if (keySelectDesc.isSelStarNoCompute()) { + ois.add((ObjectInspector) ((List) inputObjInspector[0]).get(0)); + } else { + List colList = this.keySelectDesc.getColList(); + keyEval = new ExprNodeEvaluator[colList.size()]; + for (int k = 0; k < colList.size(); k++) { + assert (colList.get(k) != null); + keyEval[k] = ExprNodeEvaluatorFactory.get(colList.get(k)); + } + keyObjInspector = + initEvaluatorsAndReturnStruct(keyEval, keySelectDesc + .getOutputColumnNames(), ((StandardStructObjectInspector) inputObjInspector[0]) + .getAllStructFieldRefs().get(0).getFieldObjectInspector()); + + ois.add(keyObjInspector); + l4j.info("Key: input tag " + (int) inputTag + ", output tag " + (int) outputTag + + ", SELECT inputOIForThisTag" + + ((StructObjectInspector) inputObjInspector[0]).getTypeName()); + } + if (valueSelectDesc.isSelStarNoCompute()) { + ois.add((ObjectInspector) ((List) inputObjInspector[0]).get(1)); + } else { + List colList = this.valueSelectDesc.getColList(); + eval = new ExprNodeEvaluator[colList.size()]; + for (int k = 0; k < colList.size(); k++) { + assert (colList.get(k) != null); + eval[k] = ExprNodeEvaluatorFactory.get(colList.get(k)); + } + valueObjInspector = + initEvaluatorsAndReturnStruct(eval, valueSelectDesc + .getOutputColumnNames(), ((StandardStructObjectInspector) inputObjInspector[0]) + .getAllStructFieldRefs().get(1).getFieldObjectInspector()); + + ois.add(valueObjInspector); + l4j.info("input tag " + (int) inputTag + ", output tag " + (int) outputTag + + ", SELECT inputOIForThisTag" + + ((StructObjectInspector) inputObjInspector[0]).getTypeName()); + } + ois.add(PrimitiveObjectInspectorFactory.writableByteObjectInspector); + outputObjInspector = ObjectInspectorFactory + .getStandardStructObjectInspector(Arrays.asList(fieldNames), ois); + l4j.info("input tag " + (int) inputTag + ", output tag " + (int) outputTag + + ", SELECT outputObjInspector" + + ((StructObjectInspector) outputObjInspector).getTypeName()); + } + + public ObjectInspector getOutputObjInspector() { + return outputObjInspector; + } + + public Object process(Object row) throws HiveException { + List keyOutput = new ArrayList(keyEval.length); + Object[] valueOutput = new Object[eval.length]; + List outputRow = new ArrayList(3); + List thisRow = (List) row; + if (keySelectDesc.isSelStarNoCompute()) { + outputRow.add(thisRow.get(0)); + } else { + Object key = thisRow.get(0); + for (int j = 0; j < keyEval.length; j++) { + try { + keyOutput.add(keyEval[j].evaluate(key)); + } catch (HiveException e) { + throw e; + } catch 
(RuntimeException e) { + throw new HiveException("Error evaluating " + + keySelectDesc.getColList().get(j).getExprString(), e); + } + } + outputRow.add(keyOutput); + } + + if (valueSelectDesc.isSelStarNoCompute()) { + outputRow.add(thisRow.get(1)); + } else { + Object value = thisRow.get(1); + for (int j = 0; j < eval.length; j++) { + try { + valueOutput[j] = eval[j].evaluate(value); + } catch (HiveException e) { + throw e; + } catch (RuntimeException e) { + throw new HiveException("Error evaluating " + + valueSelectDesc.getColList().get(j).getExprString(), e); + } + } + outputRow.add(valueOutput); + } + outputRow.add(outputTagByteWritable); + + if (isLogInfoEnabled) { + cntr++; + if (cntr == nextCntr) { + LOG.info(id + "(inputTag, childIndx, outputTag)=(" + inputTag + ", " + childIndx + ", " + + outputTag + "), forwarding " + cntr + " rows"); + nextCntr = getNextCntr(cntr); + } + } + + return outputRow; + } + + public void printCloseOpLog() { + LOG.info(id + "(inputTag, childIndx, outputTag)=(" + inputTag + ", " + childIndx + ", " + + outputTag + "), forwarded " + cntr + " rows"); + } + } + + // inputTag->(Child->List) + private Map>> dispatchConf; + // inputTag->(Child->List) + private Map>> dispatchValueSelectDescConf; + // inputTag->(Child->List) + private Map>> dispatchKeySelectDescConf; + // inputTag->(Child->List) + private Map>> dispatchHandlers; + // Child->(outputTag->DispatchHandler) + private Map> child2OutputTag2DispatchHandlers; + // Child->Child's inputObjInspectors + private Map childInputObjInspectors; + + private int operationPathTag; + private int inputTag; + + private Object[] lastDispatchedRows; + private int[] lastDispatchedTags; + + @Override + protected void initializeOp(Configuration hconf) throws HiveException { + dispatchConf = conf.getDispatchConf(); + dispatchValueSelectDescConf = conf.getDispatchValueSelectDescConf(); + dispatchKeySelectDescConf = conf.getDispatchKeySelectDescConf(); + dispatchHandlers = new HashMap>>(); + for (Entry>> entry : dispatchConf.entrySet()) { + Map> tmp = + new HashMap>(); + for (Entry> child2outputTag : entry.getValue().entrySet()) { + tmp.put(child2outputTag.getKey(), new ArrayList()); + int indx = 0; + for (Integer outputTag : child2outputTag.getValue()) { + ObjectInspector[] thisInputObjectInspector = + new ObjectInspector[] {inputObjInspectors[entry.getKey()]}; + Integer thisInputTag = entry.getKey(); + Integer thisChildIndx = child2outputTag.getKey(); + SelectDesc thisValueSelectDesc = dispatchValueSelectDescConf.get(thisInputTag) + .get(thisChildIndx).get(indx); + SelectDesc thisKeySelectDesc = dispatchKeySelectDescConf.get(thisInputTag) + .get(thisChildIndx).get(indx); + tmp.get(child2outputTag.getKey()).add( + new DispatchHandler(thisInputObjectInspector, + thisInputTag.byteValue(), thisChildIndx.byteValue(), outputTag.byteValue(), + thisValueSelectDesc, thisKeySelectDesc, LOG, id)); + indx++; + } + } + dispatchHandlers.put(entry.getKey(), tmp); + } + + child2OutputTag2DispatchHandlers = new HashMap>(); + for (Entry>> entry : dispatchConf.entrySet()) { + for (Entry> child2outputTag : entry.getValue().entrySet()) { + if (!child2OutputTag2DispatchHandlers.containsKey(child2outputTag.getKey())) { + child2OutputTag2DispatchHandlers.put(child2outputTag.getKey(), + new HashMap()); + } + int indx = 0; + for (Integer outputTag : child2outputTag.getValue()) { + child2OutputTag2DispatchHandlers.get(child2outputTag.getKey()). 
+ put(outputTag, + dispatchHandlers.get(entry.getKey()).get(child2outputTag.getKey()).get(indx)); + indx++; + } + } + } + + childInputObjInspectors = new HashMap(); + for (Entry> entry : child2OutputTag2DispatchHandlers + .entrySet()) { + Integer l = Collections.max(entry.getValue().keySet()); + ObjectInspector[] childObjInspectors = new ObjectInspector[l.intValue() + 1]; + for (Entry e : entry.getValue().entrySet()) { + if (e.getKey().intValue() == -1) { + assert childObjInspectors.length == 1; + childObjInspectors[0] = e.getValue().getOutputObjInspector(); + } else { + childObjInspectors[e.getKey().intValue()] = e.getValue().getOutputObjInspector(); + } + } + childInputObjInspectors.put(entry.getKey(), childObjInspectors); + } + + lastDispatchedRows = new Object[childOperatorsArray.length]; + lastDispatchedTags = new int[childOperatorsArray.length]; + for (int i = 0; i < childOperatorsArray.length; i++) { + lastDispatchedRows[i] = null; + lastDispatchedTags[i] = -1; + } + + initializeChildren(hconf); + } + + // Each child should has its own outputObjInspector + @Override + protected void initializeChildren(Configuration hconf) throws HiveException { + state = State.INIT; + LOG.info("Operator " + id + " " + getName() + " initialized"); + if (childOperators == null) { + return; + } + LOG.info("Initializing children of " + id + " " + getName()); + for (int i = 0; i < childOperatorsArray.length; i++) { + LOG.info("Initializing child " + i + " " + childOperatorsArray[i].getIdentifier() + " " + + childOperatorsArray[i].getName() + + " " + childInputObjInspectors.get(i).length); + childOperatorsArray[i].initialize(hconf, childInputObjInspectors.get(i)); + if (reporter != null) { + childOperatorsArray[i].setReporter(reporter); + } + } + } + + @Override + public void processOp(Object row, int tag) throws HiveException { + List thisRow = (List) row; + assert thisRow.size() == 4; + operationPathTag = ((ByteWritable) thisRow.get(3)).get(); + inputTag = ((ByteWritable) thisRow.get(2)).get(); + forward(thisRow.subList(0, 3), inputObjInspectors[inputTag]); + } + + @Override + public void forward(Object row, ObjectInspector rowInspector) + throws HiveException { + if ((++outputRows % 1000) == 0) { + if (counterNameToEnum != null) { + incrCounter(numOutputRowsCntr, outputRows); + outputRows = 0; + } + } + + if (childOperatorsArray == null && childOperators != null) { + throw new HiveException("Internal Hive error during operator initialization."); + } + + if ((childOperatorsArray == null) || (getDone())) { + return; + } + + int childrenDone = 0; + int forwardFlag = 1; + assert childOperatorsArray.length <= 8; + for (int i = 0; i < childOperatorsArray.length; i++) { + Operator o = childOperatorsArray[i]; + if (o.getDone()) { + childrenDone++; + } else { + int isProcess = (operationPathTag & (forwardFlag << i)); + if (isProcess != 0) { + if (o.getName().equals(GroupByOperator.getOperatorName())) { + GroupByOperator gbyop = (GroupByOperator) o; + gbyop.setForcedForward(false); + if (!this.bytesWritableGroupKey.equals(o.getBytesWritableGroupKey())) { + o.setBytesWritableGroupKey(this.bytesWritableGroupKey); + } + } + for (int j = 0; j < dispatchHandlers.get(inputTag).get(i).size(); j++) { + Object dispatchedRow = dispatchHandlers.get(inputTag).get(i).get(j).process(row); + int dispatchedTag = dispatchConf.get(inputTag).get(i).get(j); + o.process(dispatchedRow, dispatchedTag); + lastDispatchedRows[i] = dispatchedRow; + lastDispatchedTags[i] = dispatchedTag; + } + } + if (isProcess == 0 && 
o.getName().equals(GroupByOperator.getOperatorName())) { + if (lastDispatchedRows[i] != null && + !this.bytesWritableGroupKey.equals(o.getBytesWritableGroupKey())) { + GroupByOperator gbyop = (GroupByOperator) o; + gbyop.setForcedForward(true); + o.setBytesWritableGroupKey(this.bytesWritableGroupKey); + o.process(lastDispatchedRows[i], lastDispatchedTags[i]); + } + } + } + } + + // if all children are done, this operator is also done + if (childrenDone == childOperatorsArray.length) { + setDone(true); + } + } + + @Override + protected void closeOp(boolean abort) throws HiveException { + // log the number of rows forwarded from each dispatcherHandler + for (Map> childIndx2DispatchHandlers : dispatchHandlers + .values()) { + for (List dispatchHandlers : childIndx2DispatchHandlers.values()) { + for (DispatchHandler dispatchHandler : dispatchHandlers) { + dispatchHandler.printCloseOpLog(); + } + } + } + } + + @Override + public void setGroupKeyObject(Object keyObject) { + this.groupKeyObject = keyObject; + for (Operator op : childOperators) { + op.setGroupKeyObject(keyObject); + } + } + + /** + * @return the name of the operator + */ + @Override + public String getName() { + return getOperatorName(); + } + + static public String getOperatorName() { + return "CDP"; + } + + @Override + public OperatorType getType() { + return OperatorType.CORRELATIONREDUCERDISPATCH; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/ExecReducer.java ql/src/java/org/apache/hadoop/hive/ql/exec/ExecReducer.java index 18a9bd2..b37f554 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/ExecReducer.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/ExecReducer.java @@ -25,6 +25,7 @@ import java.net.URLClassLoader; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; +import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -61,6 +62,7 @@ public class ExecReducer extends MapReduceBase implements Reducer { private Reporter rp; private boolean abort = false; private boolean isTagged = false; + private boolean isOperationPathTagged = false; private long cntr = 0; private long nextCntr = 1; @@ -116,6 +118,7 @@ public class ExecReducer extends MapReduceBase implements Reducer { reducer.setParentOperators(null); // clear out any parents as reducer is the // root isTagged = gWork.getNeedsTagging(); + isOperationPathTagged = gWork.getNeedsOperationPathTagging(); try { keyTableDesc = gWork.getKeyDesc(); inputKeyDeserializer = (SerDe) ReflectionUtils.newInstance(keyTableDesc @@ -164,8 +167,9 @@ public class ExecReducer extends MapReduceBase implements Reducer { private BytesWritable groupKey; - ArrayList row = new ArrayList(3); + List row = new ArrayList(4); ByteWritable tag = new ByteWritable(); + ByteWritable operationPathTags = new ByteWritable(); public void reduce(Object key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { @@ -190,6 +194,14 @@ public class ExecReducer extends MapReduceBase implements Reducer { keyWritable.setSize(size); } + operationPathTags.set((byte)0); + if (isOperationPathTagged) { + // remove the operation plan tag + int size = keyWritable.getSize() - 1; + operationPathTags.set(keyWritable.get()[size]); + keyWritable.setSize(size); + } + if (!keyWritable.equals(groupKey)) { // If a operator wants to do some work at the beginning of a group if (groupKey == null) { // the first group @@ -214,6 +226,7 @@ public class ExecReducer extends MapReduceBase implements Reducer { 
l4j.trace("Start Group"); reducer.startGroup(); reducer.setGroupKeyObject(keyObject); + reducer.setBytesWritableGroupKey(groupKey); } // System.err.print(keyObject.toString()); while (values.hasNext()) { @@ -236,6 +249,7 @@ public class ExecReducer extends MapReduceBase implements Reducer { row.add(valueObject[tag.get()]); // The tag is not used any more, we should remove it. row.add(tag); + row.add(operationPathTags); if (isLogInfoEnabled) { cntr++; if (cntr == nextCntr) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java index 46daeb2..8eeb88a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java @@ -153,6 +153,13 @@ public class GroupByOperator extends Operator implements private List groupingSetsBitSet; transient private List newKeysGroupingSets; + private boolean forcedForward; // only used by CorrelationReducerDispatchOperator to make + // GroupByOperator has the same pace with other + // GroupByOperators and JoinOperators. + // If true and newKeys is different from currentKeys, + // data associated with currentKeys will be + // forwarded, otherwise, nothing happens. + /** * This is used to store the position and field names for variable length * fields. @@ -430,6 +437,7 @@ public class GroupByOperator extends Operator implements memoryMXBean = ManagementFactory.getMemoryMXBean(); maxMemory = memoryMXBean.getHeapMemoryUsage().getMax(); memoryThreshold = this.getConf().getMemoryThreshold(); + forcedForward = false; initializeChildren(hconf); } @@ -867,6 +875,10 @@ public class GroupByOperator extends Operator implements } } + public void setForcedForward(boolean forcedForward) { + this.forcedForward = forcedForward; + } + // Non-hash aggregation private void processAggr(Object row, ObjectInspector rowInspector, @@ -881,11 +893,16 @@ public class GroupByOperator extends Operator implements newKeys.equals(currentKeys) : false; // Forward the current keys if needed for sort-based aggregation - if (currentKeys != null && !keysAreEqual) { + if (currentKeys != null && (!keysAreEqual || forcedForward)) { forward(currentKeys.getKeyArray(), aggregations); countAfterReport = 0; } + if (forcedForward) { + currentKeys = null; + return; + } + // Need to update the keys? if (currentKeys == null || !keysAreEqual) { if (currentKeys == null) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java index 68302f8..c36641f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java @@ -39,6 +39,7 @@ import org.apache.hadoop.hive.ql.plan.api.OperatorType; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.mapred.Counters; import org.apache.hadoop.mapred.OutputCollector; @@ -1429,4 +1430,52 @@ public abstract class Operator implements Serializable,C public boolean supportUnionRemoveOptimization() { return false; } + + // bytesWritableGroupKey is only used when a query plan is optimized by CorrelationOptimizer. 
+ // CorrelationLocalSimulativeReduceSinkOperator will use this variable to determine when it needs to start or end the group + // for its child operator. + protected BytesWritable bytesWritableGroupKey; + + public void setBytesWritableGroupKey(BytesWritable groupKey) { + if (bytesWritableGroupKey == null) { + bytesWritableGroupKey = new BytesWritable(); + } + bytesWritableGroupKey.set(groupKey.get(), 0, groupKey.getSize()); + } + + public BytesWritable getBytesWritableGroupKey() { + return bytesWritableGroupKey; + } + + // The number of current row + protected long rowNumber; + + public void initializeRowNumber() { + this.rowNumber = 0L; + LOG.info("Operator " + id + " " + getName() + " row number initialized to 0"); + if (childOperators == null) { + return; + } + LOG.info("Initializing row numbers of children of " + id + " " + getName()); + for (int i = 0; i < childOperatorsArray.length; i++) { + childOperatorsArray[i].initializeRowNumber(); + } + } + + public void setRowNumber(long rowNumber) throws HiveException { + this.rowNumber = rowNumber; + if (childOperators == null) { + return; + } + for (int i = 0; i < childOperatorsArray.length; i++) { + assert rowNumber >= childOperatorsArray[i].getRowNumber(); + if (rowNumber != childOperatorsArray[i].getRowNumber()) { + childOperatorsArray[i].setRowNumber(rowNumber); + } + } + } + + public long getRowNumber() { + return rowNumber; + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java index 0c22141..064afc9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java @@ -22,6 +22,9 @@ import java.util.ArrayList; import java.util.List; import org.apache.hadoop.hive.ql.plan.CollectDesc; +import org.apache.hadoop.hive.ql.plan.CorrelationCompositeDesc; +import org.apache.hadoop.hive.ql.plan.CorrelationLocalSimulativeReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.CorrelationReducerDispatchDesc; import org.apache.hadoop.hive.ql.plan.ExtractDesc; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.FilterDesc; @@ -91,6 +94,12 @@ public final class OperatorFactory { HashTableDummyOperator.class)); opvec.add(new OpTuple(HashTableSinkDesc.class, HashTableSinkOperator.class)); + opvec.add(new OpTuple(CorrelationCompositeDesc.class, + CorrelationCompositeOperator.class)); + opvec.add(new OpTuple(CorrelationReducerDispatchDesc.class, + CorrelationReducerDispatchOperator.class)); + opvec.add(new OpTuple(CorrelationLocalSimulativeReduceSinkDesc.class, + CorrelationLocalSimulativeReduceSinkOperator.class)); } public static Operator get(Class opClass) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java index 919a140..899dd9c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java @@ -21,179 +21,50 @@ package org.apache.hadoop.hive.ql.exec; import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import java.util.Random; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.io.HiveKey; import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import 
org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; -import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.api.OperatorType; import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.Serializer; -import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; /** * Reduce Sink Operator sends output to the reduce stage. **/ -public class ReduceSinkOperator extends TerminalOperator +public class ReduceSinkOperator extends BaseReduceSinkOperator implements Serializable { private static final long serialVersionUID = 1L; - /** - * The evaluators for the key columns. Key columns decide the sort order on - * the reducer side. Key columns are passed to the reducer in the "key". - */ - protected transient ExprNodeEvaluator[] keyEval; - /** - * The evaluators for the value columns. Value columns are passed to reducer - * in the "value". - */ - protected transient ExprNodeEvaluator[] valueEval; - /** - * The evaluators for the partition columns (CLUSTER BY or DISTRIBUTE BY in - * Hive language). Partition columns decide the reducer that the current row - * goes to. Partition columns are not passed to reducer. - */ - protected transient ExprNodeEvaluator[] partitionEval; - - // TODO: we use MetadataTypedColumnsetSerDe for now, till DynamicSerDe is - // ready - transient Serializer keySerializer; - transient boolean keyIsText; - transient Serializer valueSerializer; - transient int tag; - transient byte[] tagByte = new byte[1]; - transient protected int numDistributionKeys; - transient protected int numDistinctExprs; - - @Override - protected void initializeOp(Configuration hconf) throws HiveException { - - try { - keyEval = new ExprNodeEvaluator[conf.getKeyCols().size()]; - int i = 0; - for (ExprNodeDesc e : conf.getKeyCols()) { - keyEval[i++] = ExprNodeEvaluatorFactory.get(e); - } - - numDistributionKeys = conf.getNumDistributionKeys(); - distinctColIndices = conf.getDistinctColumnIndices(); - numDistinctExprs = distinctColIndices.size(); - - valueEval = new ExprNodeEvaluator[conf.getValueCols().size()]; - i = 0; - for (ExprNodeDesc e : conf.getValueCols()) { - valueEval[i++] = ExprNodeEvaluatorFactory.get(e); - } - - partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()]; - i = 0; - for (ExprNodeDesc e : conf.getPartitionCols()) { - partitionEval[i++] = ExprNodeEvaluatorFactory.get(e); - } - - tag = conf.getTag(); - tagByte[0] = (byte) tag; - LOG.info("Using tag = " + tag); + private final List operationPathTags = new ArrayList(); // operation path tags + private final byte[] operationPathTagsByte = new byte[1]; - TableDesc keyTableDesc = conf.getKeySerializeInfo(); - keySerializer = (Serializer) keyTableDesc.getDeserializerClass() - .newInstance(); - keySerializer.initialize(null, keyTableDesc.getProperties()); - keyIsText = keySerializer.getSerializedClass().equals(Text.class); - - TableDesc valueTableDesc = conf.getValueSerializeInfo(); 
- valueSerializer = (Serializer) valueTableDesc.getDeserializerClass() - .newInstance(); - valueSerializer.initialize(null, valueTableDesc.getProperties()); - - firstRow = true; - initializeChildren(hconf); - } catch (Exception e) { - e.printStackTrace(); - throw new RuntimeException(e); + public void setOperationPathTags(List operationPathTags) { + this.operationPathTags.addAll(operationPathTags); + int operationPathTagsInt = 0; + int tmp = 1; + for (Integer operationPathTag: operationPathTags) { + operationPathTagsInt += tmp << operationPathTag.intValue(); } + operationPathTagsByte[0] = (byte) operationPathTagsInt; } - transient InspectableObject tempInspectableObject = new InspectableObject(); - transient HiveKey keyWritable = new HiveKey(); - transient Writable value; - - transient StructObjectInspector keyObjectInspector; - transient StructObjectInspector valueObjectInspector; - transient ObjectInspector[] partitionObjectInspectors; - - transient Object[][] cachedKeys; - transient Object[] cachedValues; - transient List> distinctColIndices; - - boolean firstRow; - - transient Random random; - - /** - * Initializes array of ExprNodeEvaluator. Adds Union field for distinct - * column indices for group by. - * Puts the return values into a StructObjectInspector with output column - * names. - * - * If distinctColIndices is empty, the object inspector is same as - * {@link Operator#initEvaluatorsAndReturnStruct(ExprNodeEvaluator[], List, ObjectInspector)} - */ - protected static StructObjectInspector initEvaluatorsAndReturnStruct( - ExprNodeEvaluator[] evals, List> distinctColIndices, - List outputColNames, - int length, ObjectInspector rowInspector) - throws HiveException { - int inspectorLen = evals.length > length ? length + 1 : evals.length; - List sois = new ArrayList(inspectorLen); - - // keys - ObjectInspector[] fieldObjectInspectors = initEvaluators(evals, 0, length, rowInspector); - sois.addAll(Arrays.asList(fieldObjectInspectors)); - - if (evals.length > length) { - // union keys - List uois = new ArrayList(); - for (List distinctCols : distinctColIndices) { - List names = new ArrayList(); - List eois = new ArrayList(); - int numExprs = 0; - for (int i : distinctCols) { - names.add(HiveConf.getColumnInternalName(numExprs)); - eois.add(evals[i].initialize(rowInspector)); - numExprs++; - } - uois.add(ObjectInspectorFactory.getStandardStructObjectInspector(names, eois)); - } - UnionObjectInspector uoi = - ObjectInspectorFactory.getStandardUnionObjectInspector(uois); - sois.add(uoi); - } - return ObjectInspectorFactory.getStandardStructObjectInspector(outputColNames, sois ); + public List getOperationPathTags() { + return this.operationPathTags; } @Override public void processOp(Object row, int tag) throws HiveException { try { ObjectInspector rowInspector = inputObjInspectors[tag]; - if (firstRow) { - firstRow = false; + if (isFirstRow) { + isFirstRow = false; keyObjectInspector = initEvaluatorsAndReturnStruct(keyEval, distinctColIndices, conf.getOutputKeyColumnNames(), numDistributionKeys, rowInspector); @@ -267,9 +138,18 @@ public class ReduceSinkOperator extends TerminalOperator keyWritable.set(key.getBytes(), 0, key.getLength()); } else { int keyLength = key.getLength(); - keyWritable.setSize(keyLength + 1); + if (!this.getConf().getNeedsOperationPathTagging()) { + keyWritable.setSize(keyLength + 1); + } else { + keyWritable.setSize(keyLength + 2); + } System.arraycopy(key.getBytes(), 0, keyWritable.get(), 0, keyLength); - keyWritable.get()[keyLength] = tagByte[0]; + if 
(!this.getConf().getNeedsOperationPathTagging()) { + keyWritable.get()[keyLength] = tagByte[0]; + } else { + keyWritable.get()[keyLength] = operationPathTagsByte[0]; + keyWritable.get()[keyLength + 1] = tagByte[0]; + } } } else { // Must be BytesWritable @@ -279,9 +159,18 @@ public class ReduceSinkOperator extends TerminalOperator keyWritable.set(key.getBytes(), 0, key.getLength()); } else { int keyLength = key.getLength(); - keyWritable.setSize(keyLength + 1); + if (!this.getConf().getNeedsOperationPathTagging()) { + keyWritable.setSize(keyLength + 1); + } else { + keyWritable.setSize(keyLength + 2); + } System.arraycopy(key.getBytes(), 0, keyWritable.get(), 0, keyLength); - keyWritable.get()[keyLength] = tagByte[0]; + if (!this.getConf().getNeedsOperationPathTagging()) { + keyWritable.get()[keyLength] = tagByte[0]; + } else { + keyWritable.get()[keyLength] = operationPathTagsByte[0]; + keyWritable.get()[keyLength + 1] = tagByte[0]; + } } } keyWritable.setHashCode(keyHashCode); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java index 1469325..6f9b62c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java @@ -80,6 +80,9 @@ public class TableScanOperator extends Operator implements if (conf != null && conf.isGatherStats()) { gatherStats(row); } + if (conf != null && conf.isForwardRowNumber()) { + setRowNumber(rowNumber+1); + } forward(row, inputObjInspectors[tag]); } @@ -169,6 +172,12 @@ public class TableScanOperator extends Operator implements if (conf == null) { return; } + + LOG.info(this.getName() + " forward row number " + conf.isForwardRowNumber()); + if(conf.isForwardRowNumber()){ + initializeRowNumber(); + } + if (!conf.isGatherStats()) { return; } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/CorrelationOptimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/CorrelationOptimizer.java new file mode 100644 index 0000000..ffa5b30 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/CorrelationOptimizer.java @@ -0,0 +1,1048 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
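When needsOperationPathTagging is set, the hunks above grow the serialized reduce key by two trailing bytes instead of one: the packed operation-path byte first, then the usual input tag. A schematic sketch of the resulting layout (plain byte arrays, not Hive's HiveKey):

    // Hypothetical sketch of the reduce-key layout produced above.
    public class ReduceKeyLayout {
        static byte[] appendTags(byte[] key, int keyLength, byte opPathTags, byte inputTag,
                                 boolean needsOperationPathTagging) {
            byte[] out = new byte[keyLength + (needsOperationPathTagging ? 2 : 1)];
            System.arraycopy(key, 0, out, 0, keyLength);
            if (needsOperationPathTagging) {
                out[keyLength] = opPathTags;       // which operation paths the row feeds
                out[keyLength + 1] = inputTag;     // original join/union input tag
            } else {
                out[keyLength] = inputTag;         // unchanged single-tag layout
            }
            return out;
        }

        public static void main(String[] args) {
            byte[] key = {10, 20, 30};
            // Prints [10, 20, 30, 5, 1]
            System.out.println(java.util.Arrays.toString(
                appendTags(key, key.length, (byte) 5, (byte) 1, true)));
        }
    }
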
+ */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.CommonJoinOperator; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.parse.OpParseContext; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.QB; +import org.apache.hadoop.hive.ql.parse.QBExpr; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.JoinCondDesc; +import org.apache.hadoop.hive.ql.plan.JoinDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.SelectDesc; + +/** + * Implementation of correlation optimizer. The optimization is based on + * the paper "YSmart: Yet Another SQL-to-MapReduce Translator" + * (Rubao Lee, Tian Luo, Yin Huai, Fusheng Wang, Yongqiang He, and Xiaodong Zhang) + * (http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/papers/TR-11-7.pdf). + * This optimizer first detects three kinds of + * correlations, Input Correlation (IC), Transit Correlation (TC) and Job Flow Correlation (JFC), + * and then merge correlated MapReduce-jobs (MR-jobs) into one MR-job. + * + * For the definitions of correlations, see the original paper of YSmart. + * + * Rules for merging correlated MR-jobs implemented in this correlation + * optimizer are: + * 1. If an MR-job for a Join operation has the same partitioning keys with its all + * preceding MR-jobs, correlation optimizer merges these MR-jobs into one MR-job. + * 2. If an MR-job for a GroupBy and Aggregation operation has the same partitioning keys + * with its preceding MR-job, correlation optimizer merges these two MR-jobs into one MR-job. + * + * Note: In the current implementation, if correlation optimizer detects MR-jobs of a sub-plan tree + * are correlated, it transforms this sub-plan tree to a single MR-job when the input of this + * sub-plan tree is not a temporary table. 
Otherwise, the current implementation will ignore this + * sub-plan tree. + * + * There are several future work that will enhance the correlation optimizer. + * Here are four examples: + * 1. Add a new rule that is if two MR-jobs share the same + * partitioning keys and they have common input tables, merge these two MR-jobs into a single + * MR-job. + * 2. The current implementation detects MR-jobs which have the same partitioning keys + * as correlated MR-jobs. However, the condition of same partitioning keys can be relaxed to use + * common partitioning keys. + * 3. The current implementation cannot optimize MR-jobs for the + * aggregation functions with a distinct keyword, which should be supported in the future + * implementation of the correlation optimizer. + * 4. Optimize queries involving self-join. + */ + +public class CorrelationOptimizer implements Transform { + + static final private Log LOG = LogFactory.getLog(CorrelationOptimizer.class.getName()); + private final Map aliastoTabName; + private final Map aliastoTab; + + protected ParseContext pGraphContext; + private LinkedHashMap, OpParseContext> opParseCtx; + + private boolean abort; + + private Map groupbyNonMapSide2MapSide; + private Map groupbyMapSide2NonMapSide; + + //Join operators which may be converted by CommonJoinOperator; + private Set> skipedJoinOperators; + + public CorrelationOptimizer() { + super(); + aliastoTabName = new HashMap(); + aliastoTab = new HashMap(); + pGraphContext = null; + skipedJoinOperators = new HashSet>(); + abort = false; + } + + private boolean initializeAliastoTabNameMapping(QB qb) { + // If any sub-query's qb is null, CorrelationOptimizer will not optimize this query. + // e.g. auto_join27.q + if (qb == null) { + return false; + } + boolean ret = true; + for (String alias : qb.getAliases()) { + aliastoTabName.put(alias, qb.getTabNameForAlias(alias)); + aliastoTab.put(alias, qb.getMetaData().getSrcForAlias(alias)); + } + for (String subqalias : qb.getSubqAliases()) { + QBExpr qbexpr = qb.getSubqForAlias(subqalias); + ret = ret && initializeAliastoTabNameMapping(qbexpr.getQB()); + } + return ret; + } + + /** + * Transform the query tree. + * + * @param pactx + * current parse context + * @throws SemanticException + */ + public ParseContext transform(ParseContext pctx) throws SemanticException { + + pGraphContext = pctx; + opParseCtx = pctx.getOpParseCtx(); + + groupbyNonMapSide2MapSide = pctx.getGroupbyNonMapSide2MapSide(); + groupbyMapSide2NonMapSide = pctx.getGroupbyMapSide2NonMapSide(); + + QB qb = pGraphContext.getQB(); + abort = !initializeAliastoTabNameMapping(qb); + if (abort) { + LOG.info("Abort. Reasons are ..."); + LOG.info("-- This query or its sub-queries has a null qb."); + return pGraphContext; + } + + if (HiveConf.getBoolVar(pGraphContext.getConf(),HiveConf.ConfVars.HIVECONVERTJOIN)) { + // 0: Guess if CommonJoinResolver will work. If CommonJoinResolver may + // convert a join operation, correlation optimizer will not merge that join. 
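The step that follows estimates, per join, whether CommonJoinResolver would later turn it into a map-join: it sums the known input sizes and, for each big-table candidate, checks whether the remaining (small-table) bytes stay under hive.smalltable.filesize. A simplified standalone sketch of that decision (toy types keyed by alias; the real code works on TableScanOperators, tag positions, and ContentSummary):

    import java.util.Map;
    import java.util.Set;

    // Hypothetical simplification of the "may be converted to map-join" guess.
    public class MapJoinGuess {
        static boolean mayConvert(Map<String, Long> aliasToSize,      // known input size per alias
                                  Set<String> bigTableCandidates,     // aliases allowed to stream
                                  long smallTableSizeThreshold) {     // hive.smalltable.filesize
            long total = aliasToSize.values().stream().mapToLong(Long::longValue).sum();
            for (String alias : bigTableCandidates) {
                Long big = aliasToSize.get(alias);
                if (big == null || big <= 0) {
                    continue;
                }
                // Everything except the candidate big table must fit under the threshold.
                if (total - big <= smallTableSizeThreshold) {
                    return true;  // CommonJoinResolver could pick this alias as the big table
                }
            }
            return false;
        }
    }
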
+ for (JoinOperator joinOp: pGraphContext.getJoinContext().keySet()) { + boolean isAbleToGuess = true; + boolean mayConvert = false; + // Get total size and individual alias's size + long aliasTotalKnownInputSize = 0; + Map aliasToSize = new HashMap(); + Map posToAlias = new HashMap(); + for (Operator op: joinOp.getParentOperators()) { + TableScanOperator tsop = CorrelationOptimizerUtils.findTableScanOperator(op); + if (tsop == null) { + isAbleToGuess = false; + break; + } + + Table table = pGraphContext.getTopToTable().get(tsop); + String alias = tsop.getConf().getAlias(); + posToAlias.put(joinOp.getParentOperators().indexOf(op), alias); + if (table == null) { + isAbleToGuess = false; + break; + } + + Path p = table.getPath(); + FileSystem fs = null; + ContentSummary resultCs = null; + try { + fs = table.getPath().getFileSystem(pGraphContext.getConf()); + resultCs = fs.getContentSummary(p); + } catch (IOException e) { + LOG.warn("Encounter a error while querying content summary of table " + + table.getCompleteName() + " from FileSystem. " + + "Cannot guess if CommonJoinOperator will optimize " + + joinOp.getName() + " " + joinOp.getIdentifier()); + } + if (resultCs == null) { + isAbleToGuess = false; + break; + } + + long size = resultCs.getLength(); + aliasTotalKnownInputSize += size; + Long es = aliasToSize.get(alias); + if(es == null) { + es = new Long(0); + } + es += size; + aliasToSize.put(alias, es); + } + + if (!isAbleToGuess) { + LOG.info("Cannot guess if CommonJoinOperator will optimize " + + joinOp.getName() + " " + joinOp.getIdentifier()); + continue; + } + + JoinDesc joinDesc = joinOp.getConf(); + Byte[] order = joinDesc.getTagOrder(); + int numAliases = order.length; + HashSet bigTableCandidates = + MapJoinProcessor.getBigTableCandidates(joinDesc.getConds()); + if (bigTableCandidates == null) { + continue; + } + + String bigTableAlias = null; + long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(pGraphContext.getConf(), + HiveConf.ConfVars.HIVESMALLTABLESFILESIZE); + for (int i = 0; i < numAliases; i++) { + // this table cannot be big table + if (!bigTableCandidates.contains(i)) { + continue; + } + bigTableAlias = posToAlias.get(i); + Long aliasKnownSize = aliasToSize.get(bigTableAlias); + if (aliasKnownSize != null && aliasKnownSize.longValue() > 0) { + long smallTblTotalKnownSize = aliasTotalKnownInputSize + - aliasKnownSize.longValue(); + if(smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) { + //this table is not good to be a big table. + continue; + } else { + mayConvert = true; + } + } + } + + if (mayConvert) { + LOG.info(joinOp.getName() + " " + joinOp.getIdentifier() + " may be converted to MapJoin by CommonJoinResolver"); + skipedJoinOperators.add(joinOp); + } + } + } + + // 1: Replace all map-side group by pattern (GBY-RS-GBY) to + // non-map-side group by pattern (RS-GBY) if necessary + if (pGraphContext.getConf().getBoolVar(HiveConf.ConfVars.HIVEMAPSIDEAGGREGATE)) { + for (Entry entry : groupbyMapSide2NonMapSide.entrySet()) { + GroupByOperator mapSidePatternStart = entry.getKey(); + Operator op1 = mapSidePatternStart.getChildOperators().get(0); + Operator op2 = op1.getChildOperators().get(0); + if (!(op1 instanceof ReduceSinkOperator && op2 instanceof GroupByOperator)) { + LOG.info("Abort. Reasons are ..."); + LOG.info("-- This plan has been converted to a plan involving map-only groupby"); + // e.g. test query groupby_sort_1.q, which is introduced in HIVE-3432. 
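The loop that follows swaps the map-side aggregation pattern GBY-RS-GBY for its pre-recorded RS-GBY counterpart by splicing the replacement sub-graph into the parents' child lists and the children's parent lists. A toy sketch of that splice on a minimal doubly linked node (hypothetical, not Hive's Operator class):

    import java.util.ArrayList;
    import java.util.List;

    // Hypothetical toy DAG node used only to illustrate the splice.
    public class OperatorSplice {
        static class Node {
            final String name;
            final List<Node> parents = new ArrayList<>();
            final List<Node> children = new ArrayList<>();
            Node(String name) { this.name = name; }
        }

        // Replace the chain oldStart..oldEnd with newStart..newEnd in place,
        // mirroring setParentOperators/setChildOperators plus replaceChild/replaceParent.
        static void splice(Node oldStart, Node oldEnd, Node newStart, Node newEnd) {
            newStart.parents.addAll(oldStart.parents);
            newEnd.children.addAll(oldEnd.children);
            for (Node parent : oldStart.parents) {
                parent.children.set(parent.children.indexOf(oldStart), newStart);
            }
            for (Node child : oldEnd.children) {
                child.parents.set(child.parents.indexOf(oldEnd), newEnd);
            }
        }
    }
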
+ return pGraphContext; + } + } + + for (Entry entry : groupbyMapSide2NonMapSide.entrySet()) { + GroupByOperator mapSidePatternStart = entry.getKey(); + GroupByOperator mapSidePatternEnd = (GroupByOperator) mapSidePatternStart + .getChildOperators().get(0).getChildOperators().get(0); + ReduceSinkOperator nonMapSidePatternStart = entry.getValue(); + GroupByOperator nonMapSidePatternEnd = (GroupByOperator) nonMapSidePatternStart + .getChildOperators().get(0); + + List> parents = mapSidePatternStart.getParentOperators(); + List> children = mapSidePatternEnd.getChildOperators(); + + nonMapSidePatternStart.setParentOperators(parents); + nonMapSidePatternEnd.setChildOperators(children); + + for (Operator parent : parents) { + parent.replaceChild(mapSidePatternStart, nonMapSidePatternStart); + } + for (Operator child : children) { + child.replaceParent(mapSidePatternEnd, nonMapSidePatternEnd); + } + } + } + + // 2: detect correlations + CorrelationNodeProcCtx correlationCtx = new CorrelationNodeProcCtx(); + + Map opRules = new LinkedHashMap(); + opRules.put(new RuleRegExp("R1", ReduceSinkOperator.getOperatorName() + "%"), + new CorrelationNodeProc()); + + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, correlationCtx); + GraphWalker ogw = new DefaultGraphWalker(disp); + + // Create a list of topOp nodes + List topNodes = new ArrayList(); + topNodes.addAll(pGraphContext.getTopOps().values()); + ogw.startWalking(topNodes, null); + abort = correlationCtx.isAbort(); + int correlationsAppliedCount = 0; + if (abort) { + LOG.info("Abort. Reasons are ..."); + for (String reason : correlationCtx.getAbortReasons()) { + LOG.info("-- " + reason); + } + } else { + // 3: transform the query plan tree + LOG.info("Begain query plan transformation based on intra-query correlations. " + + correlationCtx.getCorrelations().size() + " correlation(s) to be applied"); + for (IntraQueryCorrelation correlation : correlationCtx.getCorrelations()) { + boolean ret = CorrelationOptimizerUtils.applyCorrelation( + correlation, pGraphContext, groupbyNonMapSide2MapSide, opParseCtx); + if (ret) { + correlationsAppliedCount++; + } + } + } + + // 4: if no correlation applied, replace all non-map-side group by pattern (GBY-RS-GBY) to + // map-side group by pattern (RS-GBY) if necessary + if (correlationsAppliedCount == 0 && + pGraphContext.getConf().getBoolVar(HiveConf.ConfVars.HIVEMAPSIDEAGGREGATE)) { + for (Entry entry : groupbyNonMapSide2MapSide.entrySet()) { + GroupByOperator mapSidePatternStart = entry.getValue(); + GroupByOperator mapSidePatternEnd = (GroupByOperator) mapSidePatternStart + .getChildOperators().get(0).getChildOperators().get(0); + ReduceSinkOperator nonMapSidePatternStart = entry.getKey(); + GroupByOperator nonMapSidePatternEnd = (GroupByOperator) nonMapSidePatternStart + .getChildOperators().get(0); + + List> parents = nonMapSidePatternStart + .getParentOperators(); + List> children = nonMapSidePatternEnd.getChildOperators(); + + mapSidePatternStart.setParentOperators(parents); + mapSidePatternEnd.setChildOperators(children); + + for (Operator parent : parents) { + parent.replaceChild(nonMapSidePatternStart, mapSidePatternStart); + } + for (Operator child : children) { + child.replaceParent(nonMapSidePatternEnd, mapSidePatternEnd); + } + } + } + LOG.info("Finish query plan transformation based on intra-query correlations. 
" + + correlationsAppliedCount + " correlation(s) actually be applied"); + return pGraphContext; + } + + private class CorrelationNodeProc implements NodeProcessor { + + + /** + * Find all upstream (close to FileSinkOperator) ReduceSinkOperators starting from + * input Operators + * + * @param ops + * Operators starting the search + * @return + */ + public List> findUpstreamReduceSinkOperators( + List> ops) { + List> downstreamReduceSinkOperatos = + new ArrayList>(); + for (Operator op : ops) { + if (op.getName().equals(ReduceSinkOperator.getOperatorName())) { + downstreamReduceSinkOperatos.add(op); + } else if (op.getName().equals(FileSinkOperator.getOperatorName())) { + continue; + } else { + downstreamReduceSinkOperatos.addAll(findUpstreamReduceSinkOperators( + op.getChildOperators())); + } + } + + return downstreamReduceSinkOperatos; + } + + private void analyzeReduceSinkOperatorsOfJoinOperator(JoinCondDesc[] joinConds, + List> rsOps, Operator curentRsOps, + Set correlatedRsOps) { + if (correlatedRsOps.contains((ReduceSinkOperator) curentRsOps)) { + return; + } + + correlatedRsOps.add((ReduceSinkOperator) curentRsOps); + + int pos = rsOps.indexOf(curentRsOps); + for (int i = 0; i < joinConds.length; i++) { + JoinCondDesc joinCond = joinConds[i]; + int type = joinCond.getType(); + if (pos == joinCond.getLeft()) { + if (type == JoinDesc.INNER_JOIN || type == JoinDesc.LEFT_OUTER_JOIN) { + Operator newCurrentRsOps = rsOps.get(joinCond.getRight()); + analyzeReduceSinkOperatorsOfJoinOperator(joinConds, rsOps, newCurrentRsOps, + correlatedRsOps); + } + } else if (pos == joinCond.getRight()) { + if (type == JoinDesc.INNER_JOIN || type == JoinDesc.RIGHT_OUTER_JOIN) { + Operator newCurrentRsOps = rsOps.get(joinCond.getLeft()); + analyzeReduceSinkOperatorsOfJoinOperator(joinConds, rsOps, newCurrentRsOps, + correlatedRsOps); + } + } + } + } + + private Set findCorrelatedReduceSinkOperators( + Operator op, Set keyColumns, + IntraQueryCorrelation correlation) throws SemanticException { + + LOG.info("now detecting operator " + op.getIdentifier() + " " + op.getName()); + + Set correlatedReduceSinkOperators = new HashSet(); + if (skipedJoinOperators.contains(op)) { + LOG.info(op.getName() + " " + op.getIdentifier() + " may be converted to MapJoin by " + + "CommonJoinResolver. 
Correlation optimizer will not detect correlations" + + "involved in this operator"); + return correlatedReduceSinkOperators; + } + if (op.getParentOperators() == null) { + return correlatedReduceSinkOperators; + } + if (op.getColumnExprMap() == null && !(op instanceof ReduceSinkOperator)) { + for (Operator parent : op.getParentOperators()) { + correlatedReduceSinkOperators.addAll(findCorrelatedReduceSinkOperators( + parent, keyColumns, correlation)); + } + } else if (op.getColumnExprMap() != null && !(op instanceof ReduceSinkOperator)) { + Set newKeyColumns = new HashSet(); + for (String keyColumn : keyColumns) { + ExprNodeDesc col = op.getColumnExprMap().get(keyColumn); + if (col instanceof ExprNodeColumnDesc) { + newKeyColumns.add(((ExprNodeColumnDesc) col).getColumn()); + } + } + + if (op.getName().equals(CommonJoinOperator.getOperatorName())) { + Set tableNeedToCheck = new HashSet(); + for (String keyColumn : keyColumns) { + for (ColumnInfo cinfo : opParseCtx.get(op).getRowResolver().getColumnInfos()) { + if (keyColumn.equals(cinfo.getInternalName())) { + tableNeedToCheck.add(cinfo.getTabAlias()); + } + } + } + Set correlatedRsOps = new HashSet(); + for (Operator parent : op.getParentOperators()) { + Set tableNames = + opParseCtx.get(parent).getRowResolver().getTableNames(); + for (String tbl : tableNames) { + if (tableNeedToCheck.contains(tbl)) { + correlatedRsOps.addAll(findCorrelatedReduceSinkOperators(parent, + newKeyColumns, correlation)); + } + } + } + + // Right now, if any ReduceSinkOperator of this JoinOperator is not correlated, we will + // not optimize this query + if (correlatedRsOps.size() == op.getParentOperators().size()) { + correlatedReduceSinkOperators.addAll(correlatedRsOps); + } else { + correlatedReduceSinkOperators.clear(); + } + } else { + for (Operator parent : op.getParentOperators()) { + correlatedReduceSinkOperators.addAll(findCorrelatedReduceSinkOperators( + parent, newKeyColumns, correlation)); + } + } + } else if (op.getColumnExprMap() != null && op instanceof ReduceSinkOperator) { + Set newKeyColumns = new HashSet(); + for (String keyColumn : keyColumns) { + ExprNodeDesc col = op.getColumnExprMap().get(keyColumn); + if (col instanceof ExprNodeColumnDesc) { + newKeyColumns.add(((ExprNodeColumnDesc) col).getColumn()); + } + } + + ReduceSinkOperator rsop = (ReduceSinkOperator) op; + Set thisKeyColumns = new HashSet(); + for (ExprNodeDesc key : rsop.getConf().getKeyCols()) { + if (key instanceof ExprNodeColumnDesc) { + thisKeyColumns.add(((ExprNodeColumnDesc) key).getColumn()); + } + } + + boolean isCorrelated = false; + Set intersection = new HashSet(newKeyColumns); + intersection.retainAll(thisKeyColumns); + // TODO: relax the condition to handle more cases + isCorrelated = (!intersection.isEmpty() && + intersection.size() == thisKeyColumns.size() && + intersection.size() == newKeyColumns.size()); + + + if (isCorrelated) { + List> upstreamReduceSinkOperators = + findUpstreamReduceSinkOperators(rsop.getChildOperators()); + // downstreamReduceSinkOperators will not be empty because rsop is not a + // ReduceSinkOperator which is nearest to FileSinkOperator + assert upstreamReduceSinkOperators.size() != 0; + for (Operator dsRSop : upstreamReduceSinkOperators) { + assert dsRSop instanceof ReduceSinkOperator; + if (intersection.size() != ((ReduceSinkOperator) dsRSop).getConf().getKeyCols().size()) { + isCorrelated = false; + } + } + } + + if (isCorrelated) { + LOG.info("Operator " + op.getIdentifier() + " " + op.getName() + " is correlated"); + LOG.info("--keys 
of this operator: " + thisKeyColumns.toString()); + LOG.info("--keys of child operator: " + keyColumns.toString()); + LOG.info("--keys of child operator mapped to this operator:" + newKeyColumns.toString()); + if (((Operator) (op.getChildOperators().get(0))).getName() + .equals(CommonJoinOperator.getOperatorName())) { + JoinOperator joinOp = (JoinOperator) op.getChildOperators().get(0); + JoinCondDesc[] joinConds = joinOp.getConf().getConds(); + List> rsOps = joinOp.getParentOperators(); + Set correlatedRsOps = new HashSet(); + analyzeReduceSinkOperatorsOfJoinOperator(joinConds, rsOps, op, correlatedRsOps); + correlatedReduceSinkOperators.addAll(correlatedRsOps); + } else { + correlatedReduceSinkOperators.add(rsop); + } + } else { + LOG.info("Operator " + op.getIdentifier() + " " + op.getName() + " is not correlated"); + LOG.info("--keys of this operator: " + thisKeyColumns.toString()); + LOG.info("--keys of child operator: " + keyColumns.toString()); + LOG.info("--keys of child operator mapped to this operator:" + newKeyColumns.toString()); + correlatedReduceSinkOperators.clear(); + } + } else { + LOG.error("ReduceSinkOperator " + op.getIdentifier() + " does not have ColumnExprMap"); + throw new SemanticException("CorrelationOptimizer cannot optimize this plan. " + + "ReduceSinkOperator " + op.getIdentifier() + + " does not have ColumnExprMap"); + } + return correlatedReduceSinkOperators; + } + + private Set exploitJFC(ReduceSinkOperator op, + CorrelationNodeProcCtx correlationCtx, IntraQueryCorrelation correlation) + throws SemanticException { + + correlationCtx.addWalked(op); + correlation.addToAllReduceSinkOperators(op); + + Set reduceSinkOperators = new HashSet(); + + boolean shouldDetect = true; + + List keys = op.getConf().getKeyCols(); + Set keyColumns = new HashSet(); + for (ExprNodeDesc key : keys) { + if (!(key instanceof ExprNodeColumnDesc)) { + shouldDetect = false; + } else { + keyColumns.add(((ExprNodeColumnDesc) key).getColumn()); + } + } + + if (shouldDetect) { + Set newReduceSinkOperators = new HashSet(); + for (Operator parent : op.getParentOperators()) { + LOG.info("Operator " + op.getIdentifier() + + ": start detecting correlation from this operator"); + LOG.info("--keys of this operator: " + keyColumns.toString()); + Set correlatedReduceSinkOperators = + findCorrelatedReduceSinkOperators(parent, keyColumns, correlation); + if (correlatedReduceSinkOperators.size() == 0) { + newReduceSinkOperators.add(op); + } else { + for (ReduceSinkOperator rsop : correlatedReduceSinkOperators) { + + // For two ReduceSinkOperators, we say the one closer to FileSinkOperators is up and + // another one is down + + if (!correlation.getUpstreamToDownstreamRSops().containsKey(op)) { + correlation.getUpstreamToDownstreamRSops().put(op, + new ArrayList()); + } + correlation.getUpstreamToDownstreamRSops().get(op).add(rsop); + + if (!correlation.getDownstreamToUpStreamRSops().containsKey(rsop)) { + correlation.getDownstreamToUpStreamRSops().put(rsop, + new ArrayList()); + } + correlation.getDownstreamToUpStreamRSops().get(rsop).add(op); + Set exploited = exploitJFC(rsop, correlationCtx, + correlation); + if (exploited.size() == 0) { + newReduceSinkOperators.add(rsop); + } else { + newReduceSinkOperators.addAll(exploited); + } + } + } + } + reduceSinkOperators.addAll(newReduceSinkOperators); + } + return reduceSinkOperators; + } + + private void annotateOpPlan(IntraQueryCorrelation correlation) { + Map bottomReduceSink2OperationPath = + new HashMap(); + int indx = 0; + for (ReduceSinkOperator 
rsop : correlation.getBottomReduceSinkOperators()) { + if (!bottomReduceSink2OperationPath.containsKey(rsop)) { + bottomReduceSink2OperationPath.put(rsop, indx); + for (ReduceSinkOperator peerRSop : CorrelationOptimizerUtils + .findPeerReduceSinkOperators(rsop)) { + if (correlation.getBottomReduceSinkOperators().contains(peerRSop)) { + bottomReduceSink2OperationPath.put(peerRSop, indx); + } + } + indx++; + } + } + correlation.setBottomReduceSink2OperationPathMap(bottomReduceSink2OperationPath); + } + + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + CorrelationNodeProcCtx correlationCtx = (CorrelationNodeProcCtx) ctx; + ReduceSinkOperator op = (ReduceSinkOperator) nd; + + if (correlationCtx.isWalked(op)) { + return null; + } + + LOG.info("Walk to operator " + ((Operator) nd).getIdentifier() + " " + + ((Operator) nd).getName()); + + if (op.getConf().getKeyCols().size() == 0 || + (!op.getChildOperators().get(0).getName().equals(CommonJoinOperator.getOperatorName()) && + !op.getChildOperators().get(0).getName().equals(GroupByOperator.getOperatorName()))) { + correlationCtx.addWalked(op); + return null; + } + + // 1: find out correlation + IntraQueryCorrelation correlation = new IntraQueryCorrelation(); + List peerReduceSinkOperators = + CorrelationOptimizerUtils.findPeerReduceSinkOperators(op); + List bottomReduceSinkOperators = new ArrayList(); + for (ReduceSinkOperator rsop : peerReduceSinkOperators) { + Set thisBottomReduceSinkOperators = exploitJFC(rsop, + correlationCtx, correlation); + if (thisBottomReduceSinkOperators.size() == 0) { + thisBottomReduceSinkOperators.add(rsop); + } else { + boolean isClear = false; + // bottom ReduceSinkOperators are those ReduceSinkOperators which are close to + // TableScanOperators + for (ReduceSinkOperator bottomRSop : thisBottomReduceSinkOperators) { + TableScanOperator tsop = CorrelationOptimizerUtils.findTableScanOperator(bottomRSop); + if (tsop == null) { + isClear = true; // currently the optimizer can only optimize correlations involving + // source tables (input tables) + } else { + // Top ReduceSinkOperators are those ReduceSinkOperators which are close to + // FileSinkOperators + if (!correlation.getTopRSopToTSops().containsKey(rsop)) { + correlation.getTopRSopToTSops().put(rsop, new ArrayList()); + } + correlation.getTopRSopToTSops().get(rsop).add(tsop); + + if (!correlation.getBottomRSopToTSops().containsKey(bottomRSop)) { + correlation.getBottomRSopToTSops().put(bottomRSop, + new ArrayList()); + } + correlation.getBottomRSopToTSops().get(bottomRSop).add(tsop); + } + } + if (isClear) { + thisBottomReduceSinkOperators.clear(); + thisBottomReduceSinkOperators.add(rsop); + } + } + bottomReduceSinkOperators.addAll(thisBottomReduceSinkOperators); + } + + if (!peerReduceSinkOperators.containsAll(bottomReduceSinkOperators)) { + LOG.info("has job flow correlation"); + correlation.setJobFlowCorrelation(true); + correlation.setJFCCorrelation(peerReduceSinkOperators, bottomReduceSinkOperators); + annotateOpPlan(correlation); + } + + if (correlation.hasJobFlowCorrelation()) { + boolean hasICandTC = findICandTC(correlation); + LOG.info("has input correlation and transit correlation? " + hasICandTC); + correlation.setInputCorrelation(hasICandTC); + correlation.setTransitCorrelation(hasICandTC); + boolean hasSelfJoin = hasSelfJoin(correlation); + LOG.info("has self-join? " + hasSelfJoin); + correlation.setInvolveSelfJoin(hasSelfJoin); + // TODO: support cases involving self-join. 
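The correlation test used earlier in findCorrelatedReduceSinkOperators reduces to an exact match between two key-column sets: the child operator's keys, mapped back through this operator's column expression map, must intersect this operator's own keys in a set equal to both. A standalone restatement of that check (plain string sets, hypothetical names):

    import java.util.HashSet;
    import java.util.Set;

    // Hypothetical restatement of the "same partitioning keys" test.
    public class CorrelationCheck {
        static boolean sameKeys(Set<String> mappedChildKeys, Set<String> thisKeys) {
            Set<String> intersection = new HashSet<>(mappedChildKeys);
            intersection.retainAll(thisKeys);
            return !intersection.isEmpty()
                && intersection.size() == thisKeys.size()
                && intersection.size() == mappedChildKeys.size();
        }
    }
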
For self-join related operation paths, after the + // correlation dispatch operator, each path should be filtered by a filter operator + if (!hasSelfJoin) { + LOG.info("correlation detected"); + correlationCtx.addCorrelation(correlation); + } else { + LOG.info("correlation discarded. The current optimizer cannot optimize self-join"); + } + } + correlationCtx.addWalkedAll(peerReduceSinkOperators); + return null; + } + + private boolean hasSelfJoin(IntraQueryCorrelation correlation) { + boolean hasSelfJoin = false; + for (Entry> entry : correlation + .getTableToCorrelatedRSops().entrySet()) { + for (ReduceSinkOperator rsop : entry.getValue()) { + Set intersection = new HashSet( + CorrelationOptimizerUtils.findPeerReduceSinkOperators(rsop)); + intersection.retainAll(entry.getValue()); + // if self-join is involved + if (intersection.size() > 1) { + hasSelfJoin = true; + return hasSelfJoin; + } + } + } + return hasSelfJoin; + } + + private boolean findICandTC(IntraQueryCorrelation correlation) { + + boolean hasICandTC = false; + Map> table2RSops = + new HashMap>(); + Map> table2TSops = + new HashMap>(); + + for (Entry> entry : correlation + .getBottomRSopToTSops().entrySet()) { + String tbl = aliastoTabName.get(entry.getValue().get(0).getConf().getAlias()); + if (!table2RSops.containsKey(tbl) && !table2TSops.containsKey(tbl)) { + table2RSops.put(tbl, new ArrayList()); + table2TSops.put(tbl, new ArrayList()); + } + assert entry.getValue().size() == 1; + table2RSops.get(tbl).add(entry.getKey()); + table2TSops.get(tbl).add(entry.getValue().get(0)); + } + + for (Entry> entry : table2RSops.entrySet()) { + if (entry.getValue().size() > 1) { + hasICandTC = true; + break; + } + } + correlation.setICandTCCorrelation(table2RSops, table2TSops); + return hasICandTC; + } + } + + private NodeProcessor getDefaultProc() { + return new NodeProcessor() { + @Override + public Object process(Node nd, Stack stack, + NodeProcessorCtx ctx, Object... nodeOutputs) throws SemanticException { + LOG.info("Walk to operator " + ((Operator) nd).getIdentifier() + " " + + ((Operator) nd).getName() + ". 
No actual work to do"); + CorrelationNodeProcCtx correlationCtx = (CorrelationNodeProcCtx) ctx; + Operator op = (Operator) nd; + if (op.getName().equals(MapJoinOperator.getOperatorName())) { + correlationCtx.setAbort(true); + correlationCtx.getAbortReasons().add("Found MAPJOIN"); + } + if (op.getName().equals(FileSinkOperator.getOperatorName())) { + correlationCtx.incrementFileSinkOperatorCount(); + } + return null; + } + }; + } + + public class IntraQueryCorrelation { + + private final Map> downstreamRSopToUpstreamRSops = + new HashMap>(); + private final Map> upstreamToDownstreamRSops = + new HashMap>(); + + private final Map> topRSopToTSops = + new HashMap>(); + private final Map> bottomRSopToTSops = + new HashMap>(); + + private List topReduceSinkOperators; + private List bottomReduceSinkOperators; + + private Map> tableToCorrelatedRSops; + + private Map> tableToCorrelatedTSops; + + private Map bottomReduceSink2OperationPathMap; + + private final Map>> dispatchConf = + new HashMap>>(); // inputTag->(Child->outputTag) + private final Map>> dispatchValueSelectDescConf = + new HashMap>>(); // inputTag->(Child->SelectDesc) + private final Map>> dispatchKeySelectDescConf = + new HashMap>>(); // inputTag->(Child->SelectDesc) + + private final Set allReduceSinkOperators = + new HashSet(); + + public void addToAllReduceSinkOperators(ReduceSinkOperator rsop) { + allReduceSinkOperators.add(rsop); + } + + public Set getAllReduceSinkOperators() { + return allReduceSinkOperators; + } + + public Map>> getDispatchConf() { + return dispatchConf; + } + + public Map>> getDispatchValueSelectDescConf() { + return dispatchValueSelectDescConf; + } + + public Map>> getDispatchKeySelectDescConf() { + return dispatchKeySelectDescConf; + } + + public void addOperationPathToDispatchConf(Integer opPlan) { + if (!dispatchConf.containsKey(opPlan)) { + dispatchConf.put(opPlan, new HashMap>()); + } + } + + public Map> getDispatchConfForOperationPath(Integer opPlan) { + return dispatchConf.get(opPlan); + } + + public void addOperationPathToDispatchValueSelectDescConf(Integer opPlan) { + if (!dispatchValueSelectDescConf.containsKey(opPlan)) { + dispatchValueSelectDescConf.put(opPlan, new HashMap>()); + } + } + + public Map> getDispatchValueSelectDescConfForOperationPath( + Integer opPlan) { + return dispatchValueSelectDescConf.get(opPlan); + } + + public void addOperationPathToDispatchKeySelectDescConf(Integer opPlan) { + if (!dispatchKeySelectDescConf.containsKey(opPlan)) { + dispatchKeySelectDescConf.put(opPlan, new HashMap>()); + } + } + + public Map> getDispatchKeySelectDescConfForOperationPath( + Integer opPlan) { + return dispatchKeySelectDescConf.get(opPlan); + } + + private boolean inputCorrelation = false; + private boolean transitCorrelation = false; + private boolean jobFlowCorrelation = false; + + public void setBottomReduceSink2OperationPathMap( + Map bottomReduceSink2OperationPathMap) { + this.bottomReduceSink2OperationPathMap = bottomReduceSink2OperationPathMap; + } + + public Map getBottomReduceSink2OperationPathMap() { + return bottomReduceSink2OperationPathMap; + } + + public void setInputCorrelation(boolean inputCorrelation) { + this.inputCorrelation = inputCorrelation; + } + + public boolean hasInputCorrelation() { + return inputCorrelation; + } + + public void setTransitCorrelation(boolean transitCorrelation) { + this.transitCorrelation = transitCorrelation; + } + + public boolean hasTransitCorrelation() { + return transitCorrelation; + } + + public void setJobFlowCorrelation(boolean 
jobFlowCorrelation) { + this.jobFlowCorrelation = jobFlowCorrelation; + } + + public boolean hasJobFlowCorrelation() { + return jobFlowCorrelation; + } + + public Map> getTopRSopToTSops() { + return topRSopToTSops; + } + + public Map> getBottomRSopToTSops() { + return bottomRSopToTSops; + } + + public Map> getDownstreamToUpStreamRSops() { + return downstreamRSopToUpstreamRSops; + } + + public Map> getUpstreamToDownstreamRSops() { + return upstreamToDownstreamRSops; + } + + public void setJFCCorrelation(List peerReduceSinkOperators, + List bottomReduceSinkOperators) { + this.topReduceSinkOperators = peerReduceSinkOperators; + this.bottomReduceSinkOperators = bottomReduceSinkOperators; + } + + public List getTopReduceSinkOperators() { + return topReduceSinkOperators; + } + + public List getBottomReduceSinkOperators() { + return bottomReduceSinkOperators; + } + + public void setICandTCCorrelation(Map> tableToRSops, + Map> tableToTSops) { + this.tableToCorrelatedRSops = tableToRSops; + this.tableToCorrelatedTSops = tableToTSops; + } + + public Map> getTableToCorrelatedRSops() { + return tableToCorrelatedRSops; + } + + public Map> getTableToCorrelatedTSops() { + return tableToCorrelatedTSops; + } + + private boolean isInvolveSelfJoin = false; + + public boolean isInvolveSelfJoin() { + return isInvolveSelfJoin; + } + + public void setInvolveSelfJoin(boolean isInvolveSelfJoin) { + this.isInvolveSelfJoin = isInvolveSelfJoin; + } + + } + + private class CorrelationNodeProcCtx implements NodeProcessorCtx { + + private boolean abort; + + private final List abortReasons; + + private final Set walked; + + private final List correlations; + + private int fileSinkOperatorCount; + + public CorrelationNodeProcCtx() { + walked = new HashSet(); + correlations = new ArrayList(); + abort = false; + abortReasons = new ArrayList(); + fileSinkOperatorCount = 0; + } + + public void setAbort(boolean abort) { + this.abort = abort; + } + + public boolean isAbort() { + return abort; + } + + public List getAbortReasons() { + return abortReasons; + } + + public void addCorrelation(IntraQueryCorrelation correlation) { + correlations.add(correlation); + } + + public List getCorrelations() { + return correlations; + } + + public boolean isWalked(ReduceSinkOperator op) { + return walked.contains(op); + } + + public void addWalked(ReduceSinkOperator op) { + walked.add(op); + } + + public void addWalkedAll(Collection c) { + walked.addAll(c); + } + + public void incrementFileSinkOperatorCount() { + fileSinkOperatorCount++; + if (fileSinkOperatorCount == 2) { + abort = true; + abortReasons + .add("-- Currently, a query with multiple FileSinkOperators are not supported."); + } + } + + } + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/CorrelationOptimizerUtils.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/CorrelationOptimizerUtils.java new file mode 100644 index 0000000..2189d8a --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/CorrelationOptimizerUtils.java @@ -0,0 +1,797 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Queue; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.CorrelationCompositeOperator; +import org.apache.hadoop.hive.ql.exec.FilterOperator; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.OperatorFactory; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.CorrelationOptimizer.IntraQueryCorrelation; +import org.apache.hadoop.hive.ql.parse.OpParseContext; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.RowResolver; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory; +import org.apache.hadoop.hive.ql.plan.CorrelationCompositeDesc; +import org.apache.hadoop.hive.ql.plan.CorrelationLocalSimulativeReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.CorrelationReducerDispatchDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.FilterDesc; +import org.apache.hadoop.hive.ql.plan.ForwardDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.PlanUtils; +import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.plan.TableScanDesc; + + +public final class CorrelationOptimizerUtils { + + static final private Log LOG = LogFactory.getLog(CorrelationOptimizerUtils.class.getName()); + + public static boolean isExisted(ExprNodeDesc expr, List col_list) { + for (ExprNodeDesc thisExpr : col_list) { + if (expr.getExprString().equals(thisExpr.getExprString())) { + return true; + } + } + return false; + } + + public static String getColumnName(Map opColumnExprMap, ExprNodeDesc expr) { + for (Entry entry : opColumnExprMap.entrySet()) { + if (expr.getExprString().equals(entry.getValue().getExprString())) { + return entry.getKey(); + } + } + return null; + } + + + public static Operator unionUsedColumnsAndMakeNewSelect( + List rsops, IntraQueryCorrelation correlation, + TableScanOperator input, + LinkedHashMap, OpParseContext> originalOpParseCtx) + throws SemanticException { + + ArrayList columnNames = new 
ArrayList(); + Map colExprMap = new HashMap(); + ArrayList col_list = new ArrayList(); + RowResolver out_rwsch = new RowResolver(); + boolean isSelectAll = false; + + int pos = 0; + for (ReduceSinkOperator rsop : rsops) { + List tsops = correlation.getBottomRSopToTSops().get(rsop); + // bottom ReduceSinkOperaotr should only have 1 corresponding TableScanOperator. + assert tsops.size() == 1; + Queue> opToBeVisited = + new LinkedList>(tsops.get(0).getChildOperators()); + + Operator curr = opToBeVisited.poll(); + assert curr != null; + while (curr != null) { + if (curr.getName().equals(SelectOperator.getOperatorName())) { + SelectOperator selOp = (SelectOperator) curr; + if (selOp.getColumnExprMap() != null) { + for (Entry entry : selOp.getColumnExprMap().entrySet()) { + ExprNodeDesc expr = entry.getValue(); + if (!isExisted(expr, col_list)) { + String outputName = entry.getKey(); + String[] colRef = originalOpParseCtx.get(selOp).getRowResolver(). + reverseLookup(outputName); + if (colRef == null) { + continue; + } + col_list.add(expr); + String tabAlias = colRef[0]; + String colAlias = colRef[1]; + out_rwsch.put(tabAlias, colAlias, new ColumnInfo( + outputName, expr.getTypeInfo(), tabAlias, false)); + pos++; + columnNames.add(outputName); + colExprMap.put(outputName, expr); + } + } + } else { + for (ExprNodeDesc expr : selOp.getConf().getColList()) { + if (!isExisted(expr, col_list)) { + String outputName = expr.getCols().get(0); + String[] colRef = originalOpParseCtx.get(selOp).getRowResolver() + .reverseLookup(outputName); + if (colRef == null) { + continue; + } + col_list.add(expr); + String tabAlias = colRef[0]; + String colAlias = colRef[1]; + out_rwsch.put(tabAlias, colAlias, new ColumnInfo( + outputName, expr.getTypeInfo(), tabAlias, false)); + columnNames.add(outputName); + colExprMap.put(outputName, expr); + pos++; + } + } + } + break; + } else if (curr.getName().equals(FilterOperator.getOperatorName())) { + // reach FilterOperator before reaching SelectOperator or ReduceSinkOperaotr + isSelectAll = true; + break; + } else if (curr.getName().equals(ReduceSinkOperator.getOperatorName())) { + ReduceSinkOperator thisRSop = (ReduceSinkOperator) curr; + for (ExprNodeDesc expr : thisRSop.getConf().getKeyCols()) { + if (!isExisted(expr, col_list)) { + assert expr.getCols().size() == 1; + String columnName = getColumnName(thisRSop.getColumnExprMap(), expr); + String[] colRef = originalOpParseCtx.get(thisRSop).getRowResolver() + .reverseLookup(columnName); + if (colRef == null) { + continue; + } + col_list.add(expr); + String tabAlias = colRef[0]; + String colAlias = colRef[1]; + String outputName = expr.getCols().get(0); + out_rwsch.put(tabAlias, colAlias, new ColumnInfo( + outputName, expr.getTypeInfo(), tabAlias, false)); + columnNames.add(outputName); + colExprMap.put(outputName, expr); + pos++; + } + } + for (ExprNodeDesc expr : thisRSop.getConf().getValueCols()) { + if (!isExisted(expr, col_list)) { + assert expr.getCols().size() == 1; + String columnName = getColumnName(thisRSop.getColumnExprMap(), expr); + String[] colRef = originalOpParseCtx.get(thisRSop).getRowResolver() + .reverseLookup(columnName); + if (colRef == null) { + continue; + } + col_list.add(expr); + String tabAlias = colRef[0]; + String colAlias = colRef[1]; + String outputName = expr.getCols().get(0); + out_rwsch.put(tabAlias, colAlias, new ColumnInfo( + outputName, expr.getTypeInfo(), tabAlias, false)); + columnNames.add(outputName); + colExprMap.put(outputName, expr); + pos++; + } + } + break; + } else { + 
opToBeVisited.addAll(curr.getChildOperators()); + } + curr = opToBeVisited.poll(); + } + } + + Operator output; + if (isSelectAll) { + output = input; + } else { + output = putOpInsertMap(OperatorFactory.getAndMakeChild( + new SelectDesc(col_list, columnNames, false), new RowSchema( + out_rwsch.getColumnInfos()), input), out_rwsch, originalOpParseCtx); + output.setColumnExprMap(colExprMap); + output.setChildOperators(Utilities.makeList()); + } + + return output; + } + + + public static Operator putOpInsertMap( + Operator op, + RowResolver rr, LinkedHashMap, OpParseContext> opParseCtx) { + OpParseContext ctx = new OpParseContext(rr); + opParseCtx.put(op, ctx); + op.augmentPlan(); + return op; + } + + public static Map, String> getAliasIDtTopOps( + Map> topOps) { + Map, String> aliasIDtTopOps = + new HashMap, String>(); + for (Entry> entry : topOps.entrySet()) { + assert !aliasIDtTopOps.containsKey(entry.getValue()); + aliasIDtTopOps.put(entry.getValue(), entry.getKey()); + } + return aliasIDtTopOps; + } + + /** + * Find all peer ReduceSinkOperators (which have the same child operator of op) of op (op + * included). + */ + public static List findPeerReduceSinkOperators(ReduceSinkOperator op) { + List peerReduceSinkOperators = new ArrayList(); + List> children = op.getChildOperators(); + assert children.size() == 1; // A ReduceSinkOperator should have only one child + for (Operator parent : children.get(0).getParentOperators()) { + assert (parent instanceof ReduceSinkOperator); + peerReduceSinkOperators.add((ReduceSinkOperator) parent); + } + return peerReduceSinkOperators; + } + + /** + * Search the query plan tree from startPoint to the bottom. If there is no ReduceSinkOperator + * between startPoint and the corresponding TableScanOperator, return the corresponding + * TableScanOperator. Otherwise, return null. + * @param startPoint the operator which the search will start at + * @return the TableScanOperator traced from startPoint. Null, if the search encounters any + * ReduceSinkOperator. 
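findPeerReduceSinkOperators above relies on every ReduceSinkOperator having exactly one child (a join or group-by); the peers of an operator are simply all parents of that child, the operator itself included. A self-contained toy illustration (hypothetical node type, not Hive's operator API):

    import java.util.ArrayList;
    import java.util.List;

    // Hypothetical toy illustration: peers are all parents of the single child.
    public class PeerReduceSinks {
        static class Node {
            final List<Node> parents = new ArrayList<>();
            final List<Node> children = new ArrayList<>();
        }

        static List<Node> findPeers(Node reduceSink) {
            // A reduce sink is assumed to feed exactly one child operator;
            // every parent of that child, including reduceSink itself, is a peer.
            return reduceSink.children.get(0).parents;
        }
    }
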
+ */ + public static TableScanOperator findTableScanOperator(Operator startPoint) { + Operator thisOp = startPoint.getParentOperators().get(0); + while (true) { + if (thisOp.getName().equals(ReduceSinkOperator.getOperatorName())) { + return null; + } else if (thisOp.getName().equals(TableScanOperator.getOperatorName())) { + return (TableScanOperator) thisOp; + } else { + if (thisOp.getParentOperators() != null) { + thisOp = thisOp.getParentOperators().get(0); + } else { + break; + } + } + } + return null; + } + + + public static boolean applyCorrelation( + IntraQueryCorrelation correlation, + ParseContext inputpGraphContext, + Map groupbyRegular2MapSide, + LinkedHashMap, OpParseContext> originalOpParseCtx) + throws SemanticException { + + ParseContext pGraphContext = inputpGraphContext; + + Operator curr; + + // 1: Create table scan operator + LOG.info("apply correlation step 1: create table scan operator"); + Map oldTSOP2newTSOP = + new HashMap(); + Map> oldTopOps = pGraphContext.getTopOps(); + Map, String> oldAliasIDtTopOps = + getAliasIDtTopOps(oldTopOps); + Map oldTopToTable = pGraphContext.getTopToTable(); + Map> addedTopOps = + new HashMap>(); + Map addedTopToTable = new HashMap(); + for (Entry> entry : correlation.getTableToCorrelatedTSops() + .entrySet()) { + TableScanOperator oldTSop = entry.getValue().get(0); + TableScanDesc tsDesc = new TableScanDesc(oldTSop.getConf().getAlias(), oldTSop.getConf() + .getVirtualCols()); + tsDesc.setForwardRowNumber(true); + OpParseContext opParseCtx = pGraphContext.getOpParseCtx().get(oldTSop); + Operator top = putOpInsertMap(OperatorFactory.get(tsDesc, + new RowSchema(opParseCtx.getRowResolver().getColumnInfos())), + opParseCtx.getRowResolver(), pGraphContext.getOpParseCtx()); + top.setParentOperators(null); + top.setChildOperators(Utilities.makeList()); + for (TableScanOperator tsop : entry.getValue()) { + addedTopOps.put(oldAliasIDtTopOps.get(tsop), top); + addedTopToTable.put((TableScanOperator) top, oldTopToTable.get(tsop)); + oldTSOP2newTSOP.put(tsop, (TableScanOperator) top); + } + } + + List> childrenOfDispatch = + new ArrayList>(); + for (ReduceSinkOperator rsop : correlation.getBottomReduceSinkOperators()) { + // TODO: currently, correlation optimizer can not handle the case that + // a table is directly connected to a post computation operator. e.g. + // Join + // / \ + // GBY T2 + // | + // T1 + if (!correlation.getBottomReduceSinkOperators() + .containsAll(findPeerReduceSinkOperators(rsop))) { + LOG.info("Can not handle the case that " + + "a table is directly connected to a post computation operator. 
Use original plan"); + return false; + } + Operator op = rsop.getChildOperators().get(0); + if (!childrenOfDispatch.contains(op)) { + LOG.info("Add :" + op.getIdentifier() + " " + op.getName() + + " to the children list of dispatch operator"); + childrenOfDispatch.add(op); + } + } + + int opTag = 0; + Map operationPath2CorrelationReduceSinkOps = + new HashMap(); + for (Entry> entry : correlation + .getTableToCorrelatedRSops().entrySet()) { + + // 2: Create select operator for shared operation paths + LOG.info("apply correlation step 2: create select operator for shared operation path for " + + "the table of " + entry.getKey()); + ReduceSinkOperator bottomRSop = entry.getValue().get(0); + TableScanOperator oldTSop = correlation.getBottomRSopToTSops().get(bottomRSop).get(0); + curr = unionUsedColumnsAndMakeNewSelect(entry.getValue(), correlation, + oldTSOP2newTSOP.get(oldTSop), originalOpParseCtx); + + // 3: Create CorrelationCompositeOperator, CorrelationReduceSinkOperator + LOG.info("apply correlation step 3: create correlation composite Operator and correlation " + + "reduce sink operator for the table of " + + entry.getKey()); + Operator input = curr; + RowResolver intputRR = pGraphContext.getOpParseCtx().get(curr).getRowResolver(); + curr = createCorrelationCompositeOperatorAndReducesinkOperaotr( + correlation.getTableToCorrelatedTSops().get(entry.getKey()), entry.getValue(), + correlation, input, intputRR, + childrenOfDispatch, entry.getKey(), opTag, originalOpParseCtx); + + operationPath2CorrelationReduceSinkOps.put(new Integer(opTag), (ReduceSinkOperator) curr); + opTag++; + } + + + // 4: Create correlation dispatch operator for operation paths + LOG.info("apply correlation step 4: create correlation dispatch operator for operation paths"); + RowResolver outputRS = new RowResolver(); + List> correlationReduceSinkOps = + new ArrayList>(); + for (Entry entry : operationPath2CorrelationReduceSinkOps + .entrySet()) { + curr = entry.getValue(); + correlationReduceSinkOps.add(curr); + RowResolver inputRS = pGraphContext.getOpParseCtx().get(curr).getRowResolver(); + for (Entry> e1 : inputRS.getRslvMap().entrySet()) { + for (Entry e2 : e1.getValue().entrySet()) { + outputRS.put(e1.getKey(), e2.getKey(), e2.getValue()); + } + } + } + + Operator dispatchOp = putOpInsertMap(OperatorFactory.get( + new CorrelationReducerDispatchDesc(correlation.getDispatchConf(), correlation + .getDispatchKeySelectDescConf(), correlation.getDispatchValueSelectDescConf()), + new RowSchema(outputRS.getColumnInfos())), + outputRS, pGraphContext.getOpParseCtx()); + + dispatchOp.setParentOperators(correlationReduceSinkOps); + for (Operator thisOp : correlationReduceSinkOps) { + thisOp.setChildOperators(Utilities.makeList(dispatchOp)); + } + + // 5: Replace the old plan in the original plan tree with new plan + LOG.info("apply correlation step 5: Replace the old plan in the original plan tree with " + + "the new plan"); + Set> processed = + new HashSet>(); + for (Operator op : childrenOfDispatch) { + List> parents = + new ArrayList>(); + for (Operator oldParent : op.getParentOperators()) { + if (!correlation.getBottomReduceSinkOperators().contains(oldParent)) { + parents.add(oldParent); + } + } + parents.add(dispatchOp); + op.setParentOperators(parents); + } + dispatchOp.setChildOperators(childrenOfDispatch); + HashMap> newTopOps = + new HashMap>(); + for (Entry> entry : oldTopOps.entrySet()) { + if (addedTopOps.containsKey(entry.getKey())) { + newTopOps.put(entry.getKey(), addedTopOps.get(entry.getKey())); + } else { 
+ newTopOps.put(entry.getKey(), entry.getValue()); + } + } + pGraphContext.setTopOps(newTopOps); + HashMap newTopToTable = new HashMap(); + for (Entry entry : oldTopToTable.entrySet()) { + if (addedTopToTable.containsKey(oldTSOP2newTSOP.get(entry.getKey()))) { + newTopToTable.put(oldTSOP2newTSOP.get(entry.getKey()), + addedTopToTable.get(oldTSOP2newTSOP.get(entry.getKey()))); + } else { + newTopToTable.put(entry.getKey(), entry.getValue()); + } + } + pGraphContext.setTopToTable(newTopToTable); + + // 6: Change every JFC related ReduceSinkOperator to a + // CorrelationLocalSimulativeReduceSinkOperator + LOG.info("apply correlation step 6: Change every JFC related reduce sink operator to a " + + "CorrelationLocalSimulativeReduceSinkOperator"); + for (ReduceSinkOperator rsop : correlation.getAllReduceSinkOperators()) { + if (!correlation.getBottomReduceSinkOperators().contains(rsop)) { + Operator childOP = rsop.getChildOperators().get(0); + Operator parentOP = rsop.getParentOperators().get(0); + Operator correlationLocalSimulativeReduceSinkOperator = + putOpInsertMap( + OperatorFactory.get( + new CorrelationLocalSimulativeReduceSinkDesc(rsop.getConf()), + new RowSchema(pGraphContext.getOpParseCtx().get(rsop).getRowResolver() + .getColumnInfos())), + pGraphContext.getOpParseCtx().get(rsop).getRowResolver(), + pGraphContext.getOpParseCtx()); + correlationLocalSimulativeReduceSinkOperator.setChildOperators(Utilities.makeList(childOP)); + correlationLocalSimulativeReduceSinkOperator.setParentOperators(Utilities + .makeList(parentOP)); + parentOP.getChildOperators().set(parentOP.getChildOperators().indexOf(rsop), + correlationLocalSimulativeReduceSinkOperator); + childOP.getParentOperators().set(childOP.getParentOperators().indexOf(rsop), + correlationLocalSimulativeReduceSinkOperator); + } + } + return true; + } + + public static Operator + createCorrelationCompositeOperatorAndReducesinkOperaotr( + List tsops, + List rsops, + IntraQueryCorrelation correlation, + Operator input, + RowResolver inputRR, + List> childrenOfDispatch, + String tableName, + int newTag, + LinkedHashMap, OpParseContext> originalOpParseCtx) + throws SemanticException { + + // Create CorrelationCompositeOperator + List> tops = + new ArrayList>(); + List> bottoms = + new ArrayList>(); + List opTags = new ArrayList(); + + for (ReduceSinkOperator rsop : rsops) { + TableScanOperator tsop = correlation.getBottomRSopToTSops().get(rsop).get(0); + Operator curr = tsop.getChildOperators().get(0); + if (curr == rsop) { + // no filter needed, just forward + ForwardDesc forwardCtx = new ForwardDesc(); + Operator forwardOp = OperatorFactory.get(ForwardDesc.class); + forwardOp.setConf(forwardCtx); + tops.add(forwardOp); + bottoms.add(forwardOp); + opTags.add(correlation.getBottomReduceSink2OperationPathMap().get(rsop)); + } else { + // Add filter operator + FilterOperator currFilOp = null; + while (curr != rsop) { + if (curr.getName().equals("FIL")) { + FilterOperator fil = (FilterOperator) curr; + FilterDesc filterCtx = new FilterDesc(fil.getConf().getPredicate(), false); + Operator nowFilOp = OperatorFactory.get(FilterDesc.class); + nowFilOp.setConf(filterCtx); + if (currFilOp == null) { + currFilOp = (FilterOperator) nowFilOp; + tops.add(currFilOp); + } else { + nowFilOp.setParentOperators(Utilities.makeList(currFilOp)); + currFilOp.setChildOperators(Utilities.makeList(nowFilOp)); + currFilOp = (FilterOperator) nowFilOp; + } + } + curr = curr.getChildOperators().get(0); + } + if (currFilOp == null) { + ForwardDesc forwardCtx = new 
ForwardDesc(); + Operator forwardOp = OperatorFactory.get(ForwardDesc.class); + forwardOp.setConf(forwardCtx); + tops.add(forwardOp); + bottoms.add(forwardOp); + } else { + bottoms.add(currFilOp); + } + opTags.add(correlation.getBottomReduceSink2OperationPathMap().get(rsop)); + + } + } + + int[] opTagsArray = new int[opTags.size()]; + for (int i = 0; i < opTags.size(); i++) { + opTagsArray[i] = opTags.get(i).intValue(); + } + + for (Operator op : bottoms) { + op.setParentOperators(Utilities.makeList(input)); + } + input.setChildOperators(bottoms); + + CorrelationCompositeDesc ycoCtx = new CorrelationCompositeDesc(); + ycoCtx.setAllOperationPathTags(opTagsArray); + + Operator ycop = putOpInsertMap(OperatorFactory.get(ycoCtx, + new RowSchema(inputRR.getColumnInfos())), + inputRR, originalOpParseCtx); + ycop.setParentOperators(tops); + for (Operator op : tops) { + op.setChildOperators(Utilities.makeList(ycop)); + } + + // Create CorrelationReduceSinkOperator + ArrayList partitionCols = new ArrayList(); + ArrayList keyCols = new ArrayList(); + Map colExprMap = new HashMap(); + ArrayList keyOutputColumnNames = new ArrayList(); + ReduceSinkOperator firstRSop = rsops.get(0); + + RowResolver orginalFirstRSopRR = originalOpParseCtx.get(firstRSop).getRowResolver(); + RowResolver outputRR = new RowResolver(); + Map keyColToExprForDispatch = + new HashMap(); + Map valueColToExprForDispatch = + new HashMap(); + + for (ExprNodeDesc expr : firstRSop.getConf().getKeyCols()) { + assert expr instanceof ExprNodeColumnDesc; + ExprNodeColumnDesc encd = (ExprNodeColumnDesc) expr; + String ouputName = getColumnName(firstRSop.getColumnExprMap(), expr); + ColumnInfo cinfo = orginalFirstRSopRR.getColumnInfos().get( + orginalFirstRSopRR.getPosition(ouputName)); + + String col = SemanticAnalyzer.getColumnInternalName(keyCols.size()); + keyOutputColumnNames.add(col); + ColumnInfo newColInfo = new ColumnInfo(col, cinfo.getType(), tableName, cinfo + .getIsVirtualCol(), cinfo.isHiddenVirtualCol()); + + colExprMap.put(newColInfo.getInternalName(), expr); + + outputRR.put(tableName, newColInfo.getInternalName(), newColInfo); + keyCols.add(expr); + + keyColToExprForDispatch.put(encd.getColumn(), new ExprNodeColumnDesc(cinfo.getType(), col, + tableName, + encd.getIsPartitionColOrVirtualCol())); + + } + + ArrayList valueCols = new ArrayList(); + ArrayList valueOutputColumnNames = new ArrayList(); + + correlation.addOperationPathToDispatchConf(newTag); + correlation.addOperationPathToDispatchKeySelectDescConf(newTag); + correlation.addOperationPathToDispatchValueSelectDescConf(newTag); + + + for (ReduceSinkOperator rsop : rsops) { + LOG.debug("Analyzing ReduceSinkOperator " + rsop.getIdentifier()); + RowResolver orginalRS = originalOpParseCtx.get(rsop).getRowResolver(); + Integer childOpIndex = childrenOfDispatch.indexOf(rsop.getChildOperators().get(0)); + int outputTag = rsop.getConf().getTag(); + if (outputTag == -1) { + outputTag = 0; + } + if (!correlation.getDispatchConfForOperationPath(newTag).containsKey(childOpIndex)) { + correlation.getDispatchConfForOperationPath(newTag).put(childOpIndex, + new ArrayList()); + } + correlation.getDispatchConfForOperationPath(newTag).get(childOpIndex).add(outputTag); + + ArrayList thisKeyColsInDispatch = new ArrayList(); + ArrayList outputKeyNamesInDispatch = new ArrayList(); + for (ExprNodeDesc expr : rsop.getConf().getKeyCols()) { + assert expr instanceof ExprNodeColumnDesc; + ExprNodeColumnDesc encd = (ExprNodeColumnDesc) expr; + String outputName = 
getColumnName(rsop.getColumnExprMap(), expr); + LOG.debug("key column: " + outputName); + thisKeyColsInDispatch.add(keyColToExprForDispatch.get(encd.getColumn())); + String[] names = outputName.split("\\."); + String outputKeyName = ""; + switch (names.length) { + case 1: + outputKeyName = names[0]; + break; + case 2: + outputKeyName = names[1]; + break; + default: + throw (new SemanticException("found a un-sopported internal key name structure")); + } + outputKeyNamesInDispatch.add(outputKeyName); + } + + if (!correlation.getDispatchKeySelectDescConfForOperationPath(newTag).containsKey( + childOpIndex)) { + correlation.getDispatchKeySelectDescConfForOperationPath(newTag).put(childOpIndex, + new ArrayList()); + } + correlation.getDispatchKeySelectDescConfForOperationPath(newTag).get(childOpIndex). + add(new SelectDesc(thisKeyColsInDispatch, outputKeyNamesInDispatch, false)); + + ArrayList thisValueColsInDispatch = new ArrayList(); + ArrayList outputValueNamesInDispatch = new ArrayList(); + for (ExprNodeDesc expr : rsop.getConf().getValueCols()) { + + String outputName = getColumnName(rsop.getColumnExprMap(), expr); + LOG.debug("value column: " + outputName); + LOG.debug("originalOpColumnExprMap.get(rsop):" + rsop.getColumnExprMap() + + " expr:" + expr.toString() + + " orginalRS.getColumnInfos().toString:" + orginalRS.getColumnInfos().toString() + " " + + outputName); + ColumnInfo cinfo = orginalRS.getColumnInfos().get(orginalRS.getPosition(outputName)); + if (!valueColToExprForDispatch.containsKey(expr.getExprString())) { + + String col = SemanticAnalyzer.getColumnInternalName(keyCols.size() + valueCols.size()); + valueOutputColumnNames.add(col); + ColumnInfo newColInfo = new ColumnInfo(col, cinfo.getType(), tableName, cinfo + .getIsVirtualCol(), cinfo.isHiddenVirtualCol()); + colExprMap.put(newColInfo.getInternalName(), expr); + outputRR.put(tableName, newColInfo.getInternalName(), newColInfo); + valueCols.add(expr); + + valueColToExprForDispatch.put(expr.getExprString(), new ExprNodeColumnDesc( + cinfo.getType(), col, tableName, + false)); + } + + thisValueColsInDispatch.add(valueColToExprForDispatch.get(expr.getExprString())); + String[] names = outputName.split("\\."); + String outputValueName = ""; + switch (names.length) { + case 1: + outputValueName = names[0]; + break; + case 2: + outputValueName = names[1]; + break; + default: + throw (new SemanticException("found a un-sopported internal value name structure")); + } + outputValueNamesInDispatch.add(outputValueName); + } + + if (!correlation.getDispatchValueSelectDescConfForOperationPath(newTag).containsKey( + childOpIndex)) { + correlation.getDispatchValueSelectDescConfForOperationPath(newTag).put(childOpIndex, + new ArrayList()); + } + correlation.getDispatchValueSelectDescConfForOperationPath(newTag).get(childOpIndex). + add(new SelectDesc(thisValueColsInDispatch, outputValueNamesInDispatch, false)); + } + + ReduceSinkOperator rsop = null; + rsop = (ReduceSinkOperator) putOpInsertMap( + OperatorFactory.getAndMakeChild(getReduceSinkDesc(keyCols, + keyCols.size(), valueCols, new ArrayList>(), + keyOutputColumnNames, valueOutputColumnNames, true, newTag, keyCols.size(), + -1), new RowSchema(outputRR + .getColumnInfos()), ycop), outputRR, originalOpParseCtx); + rsop.setColumnExprMap(colExprMap); + ((CorrelationCompositeOperator) ycop).getConf().setCorrespondingReduceSinkOperator(rsop); + + return rsop; + } + + + /** + * Generate reduce sink descriptor. 
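+ * When includeKey is true, the names in outputKeyColumnNames become the key schema of the
+ * generated descriptor; otherwise the key columns are named reducesinkkey0..reducesinkkeyN.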
+ * + * @param keyCols + * The columns to be stored in the key + * @param numKeys + * number of distribution keys. Equals to group-by-key + * numbers usually. + * @param valueCols + * The columns to be stored in the value + * @param distinctColIndices + * column indices for distinct aggregates + * @param outputKeyColumnNames + * The output key columns names + * @param outputValueColumnNames + * The output value columns names + * @param tag + * The tag for this ReduceSinkOperator + * @param numPartitionFields + * The first numPartitionFields of keyCols will be partition columns. + * If numPartitionFields=-1, then partition randomly. + * @param numReducers + * The number of reducers, set to -1 for automatic inference based on + * input data size. + * @return ReduceSinkDesc. + */ + public static ReduceSinkDesc getReduceSinkDesc( + ArrayList keyCols, int numKeys, + ArrayList valueCols, + List> distinctColIndices, + ArrayList outputKeyColumnNames, ArrayList outputValueColumnNames, + boolean includeKey, int tag, + int numPartitionFields, int numReducers) throws SemanticException { + ArrayList partitionCols = null; + + if (numPartitionFields >= keyCols.size()) { + partitionCols = keyCols; + } else if (numPartitionFields >= 0) { + partitionCols = new ArrayList(numPartitionFields); + for (int i = 0; i < numPartitionFields; i++) { + partitionCols.add(keyCols.get(i)); + } + } else { + // numPartitionFields = -1 means random partitioning + partitionCols = new ArrayList(1); + partitionCols.add(TypeCheckProcFactory.DefaultExprProcessor + .getFuncExprNodeDesc("rand")); + } + + StringBuilder order = new StringBuilder(); + for (int i = 0; i < keyCols.size(); i++) { + order.append("+"); + } + + TableDesc keyTable = null; + TableDesc valueTable = null; + ArrayList outputKeyCols = new ArrayList(); + ArrayList outputValCols = new ArrayList(); + if (includeKey) { + keyTable = PlanUtils.getReduceKeyTableDesc(PlanUtils.getFieldSchemasFromColumnListWithLength( + keyCols, distinctColIndices, outputKeyColumnNames, numKeys, ""), + order.toString()); + outputKeyCols.addAll(outputKeyColumnNames); + } else { + keyTable = PlanUtils.getReduceKeyTableDesc(PlanUtils.getFieldSchemasFromColumnList( + keyCols, "reducesinkkey"), order.toString()); + for (int i = 0; i < keyCols.size(); i++) { + outputKeyCols.add("reducesinkkey" + i); + } + } + valueTable = PlanUtils.getReduceValueTableDesc(PlanUtils.getFieldSchemasFromColumnList( + valueCols, outputValueColumnNames, 0, "")); + outputValCols.addAll(outputValueColumnNames); + + return new ReduceSinkDesc(keyCols, numKeys, valueCols, outputKeyCols, + distinctColIndices, outputValCols, + tag, partitionCols, numReducers, keyTable, + valueTable, true); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java index edde378..cf1f865 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java @@ -118,6 +118,11 @@ public final class GenMapRedUtils { } if (reducer.getClass() == JoinOperator.class) { plan.setNeedsTagging(true); + plan.setNeedsOperationPathTagging(false); + } + if (op.getConf().getNeedsOperationPathTagging()) { + plan.setNeedsTagging(true); + plan.setNeedsOperationPathTagging(true); } assert currTopOp != null; @@ -184,6 +189,7 @@ public final class GenMapRedUtils { opTaskMap.put(reducer, currTask); if (reducer.getClass() == JoinOperator.class) { plan.setNeedsTagging(true); + 
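// a plain join reducer relies on the standard join tags, so operation-path tagging is turned off here +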
plan.setNeedsOperationPathTagging(false); } ReduceSinkDesc desc = (ReduceSinkDesc) op.getConf(); plan.setNumReduceTasks(desc.getNumReducers()); @@ -318,6 +324,7 @@ public final class GenMapRedUtils { if (reducer.getClass() == JoinOperator.class) { plan.setNeedsTagging(true); + plan.setNeedsOperationPathTagging(false); } initUnionPlan(opProcCtx, unionTask, false); @@ -1078,6 +1085,7 @@ public final class GenMapRedUtils { // dependent on the redTask if (reducer.getClass() == JoinOperator.class) { cplan.setNeedsTagging(true); + cplan.setNeedsOperationPathTagging(false); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java index d1555e2..8803c35 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java @@ -88,6 +88,12 @@ public class Optimizer { if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVELIMITOPTENABLE)) { transformations.add(new GlobalLimitOptimizer()); } + if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTCORRELATION) && + !HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEGROUPBYSKEW) && + !HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_OPTIMIZE_SKEWJOIN_COMPILETIME)) { + // TODO: make CorrelationOptimizer compatible with SkewJoinOptimizer + transformations.add(new CorrelationOptimizer()); + } transformations.add(new SimpleFetchOptimizer()); // must be called last } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java index 2bf284d..d0e5177 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java @@ -34,6 +34,7 @@ import org.apache.hadoop.hive.ql.exec.GroupByOperator; import org.apache.hadoop.hive.ql.exec.JoinOperator; import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.hooks.LineageInfo; @@ -86,6 +87,11 @@ public class ParseContext { private Map> groupOpToInputTables; private Map prunedPartitions; + //a map from non-map-side group by pattern (RS-GBY) to map-side group by pattern (GBY-RS-GBY) + Map groupbyNonMapSide2MapSide; + //a map from map-side group by pattern (GBY-RS-GBY) to non-map-side group by pattern (RS-GBY) + Map groupbyMapSide2NonMapSide; + /** * The lineage information. 
*/ @@ -169,7 +175,9 @@ public class ParseContext { GlobalLimitCtx globalLimitCtx, HashMap nameToSplitSample, HashSet semanticInputs, List> rootTasks, - Map> opToPartToSkewedPruner) { + Map> opToPartToSkewedPruner, + Map groupbyNonMapSide2MapSide, + Map groupbyMapSide2NonMapSide) { this.conf = conf; this.qb = qb; this.ast = ast; @@ -196,6 +204,8 @@ public class ParseContext { this.semanticInputs = semanticInputs; this.rootTasks = rootTasks; this.opToPartToSkewedPruner = opToPartToSkewedPruner; + this.groupbyNonMapSide2MapSide = groupbyNonMapSide2MapSide; + this.groupbyMapSide2NonMapSide = groupbyMapSide2NonMapSide; } /** @@ -578,4 +588,11 @@ public class ParseContext { this.opToPartToSkewedPruner = opToPartToSkewedPruner; } + public Map getGroupbyNonMapSide2MapSide() { + return groupbyNonMapSide2MapSide; + } + + public Map getGroupbyMapSide2NonMapSide() { + return groupbyMapSide2NonMapSide; + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 330aa52..d7ad269 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -192,7 +192,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { private List loadTableWork; private List loadFileWork; private Map joinContext; - private final HashMap topToTable; + private HashMap topToTable; private QB qb; private ASTNode ast; private int destTableId; @@ -214,6 +214,11 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { private final UnparseTranslator unparseTranslator; private final GlobalLimitCtx globalLimitCtx = new GlobalLimitCtx(); + // a map from non-map-side group by pattern (RS-GBY) to map-side group by pattern (GBY-RS-GBY) + Map groupbyNonMapSide2MapSide; + // a map from map-side group by pattern (GBY-RS-GBY) to non-map-side group by pattern (RS-GBY) + Map groupbyMapSide2NonMapSide; + //prefix for column names auto generated by hive private final String autogenColAliasPrfxLbl; private final boolean autogenColAliasPrfxIncludeFuncName; @@ -252,6 +257,8 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { HiveConf.ConfVars.HIVE_AUTOGEN_COLUMNALIAS_PREFIX_INCLUDEFUNCNAME); queryProperties = new QueryProperties(); opToPartToSkewedPruner = new HashMap>(); + groupbyNonMapSide2MapSide = new HashMap(); + groupbyMapSide2NonMapSide = new HashMap(); } @Override @@ -270,6 +277,9 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { opParseCtx.clear(); groupOpToInputTables.clear(); prunedPartitions.clear(); + topToTable.clear(); + groupbyNonMapSide2MapSide.clear(); + groupbyMapSide2NonMapSide.clear(); } public void initParseCtx(ParseContext pctx) { @@ -277,6 +287,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { opToPartList = pctx.getOpToPartList(); opToSamplePruner = pctx.getOpToSamplePruner(); topOps = pctx.getTopOps(); + topToTable = pctx.getTopToTable(); topSelOps = pctx.getTopSelOps(); opParseCtx = pctx.getOpParseCtx(); loadTableWork = pctx.getLoadTableWork(); @@ -292,6 +303,8 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { prunedPartitions = pctx.getPrunedPartitions(); fetchTask = pctx.getFetchTask(); setLineageInfo(pctx.getLineageInfo()); + groupbyNonMapSide2MapSide = pctx.getGroupbyNonMapSide2MapSide(); + groupbyMapSide2NonMapSide = pctx.getGroupbyMapSide2NonMapSide(); } public ParseContext getParseContext() { @@ -300,7 +313,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer 
{ loadFileWork, ctx, idToTableNameMap, destTableId, uCtx, listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions, opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks, - opToPartToSkewedPruner); + opToPartToSkewedPruner, groupbyNonMapSide2MapSide, groupbyMapSide2NonMapSide); } @SuppressWarnings("nls") @@ -3216,7 +3229,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { } List> distinctColIndices = getDistinctColIndicesForReduceSink(parseInfo, dest, - reduceKeys, reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputKeyColumnNames); + reduceKeys, reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputKeyColumnNames, colExprMap); ArrayList reduceValues = new ArrayList(); HashMap aggregationTrees = parseInfo @@ -3224,7 +3237,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { if (!mapAggrDone) { getReduceValuesForReduceSinkNoMapAgg(parseInfo, dest, reduceSinkInputRowResolver, - reduceSinkOutputRowResolver, outputValueColumnNames, reduceValues); + reduceSinkOutputRowResolver, outputValueColumnNames, reduceValues, colExprMap); } else { // Put partial aggregation results in reduceValues int inputField = reduceKeys.size(); @@ -3233,14 +3246,16 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { TypeInfo type = reduceSinkInputRowResolver.getColumnInfos().get( inputField).getType(); - reduceValues.add(new ExprNodeColumnDesc(type, - getColumnInternalName(inputField), "", false)); + ExprNodeDesc expr = new ExprNodeColumnDesc(type, + getColumnInternalName(inputField), "", false); + reduceValues.add(expr); inputField++; outputValueColumnNames.add(getColumnInternalName(reduceValues.size() - 1)); String field = Utilities.ReduceField.VALUE.toString() + "." + getColumnInternalName(reduceValues.size() - 1); - reduceSinkOutputRowResolver.putExpression(entry.getValue(), - new ColumnInfo(field, type, null, false)); + ColumnInfo colInfo = new ColumnInfo(field, type, null, false); + reduceSinkOutputRowResolver.putExpression(entry.getValue(), colInfo); + colExprMap.put(colInfo.getInternalName(), expr); } } @@ -3288,7 +3303,8 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { private List> getDistinctColIndicesForReduceSink(QBParseInfo parseInfo, String dest, List reduceKeys, RowResolver reduceSinkInputRowResolver, - RowResolver reduceSinkOutputRowResolver, List outputKeyColumnNames) + RowResolver reduceSinkOutputRowResolver, List outputKeyColumnNames, + Map colExprMap) throws SemanticException { List> distinctColIndices = new ArrayList>(); @@ -3327,6 +3343,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { ColumnInfo colInfo = new ColumnInfo(field, expr.getTypeInfo(), null, false); reduceSinkOutputRowResolver.putExpression(parameter, colInfo); numExprs++; + colExprMap.put(colInfo.getInternalName(), expr); } distinctColIndices.add(distinctIndices); } @@ -3337,7 +3354,8 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { private void getReduceValuesForReduceSinkNoMapAgg(QBParseInfo parseInfo, String dest, RowResolver reduceSinkInputRowResolver, RowResolver reduceSinkOutputRowResolver, - List outputValueColumnNames, ArrayList reduceValues) + List outputValueColumnNames, ArrayList reduceValues, + Map colExprMap) throws SemanticException { HashMap aggregationTrees = parseInfo .getAggregationExprsForClause(dest); @@ -3349,15 +3367,16 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { for (int i = 1; i < value.getChildCount(); i++) { ASTNode parameter = (ASTNode) value.getChild(i); if 
(reduceSinkOutputRowResolver.getExpression(parameter) == null) { - reduceValues.add(genExprNodeDesc(parameter, - reduceSinkInputRowResolver)); + ExprNodeDesc expr = genExprNodeDesc(parameter, reduceSinkInputRowResolver); + reduceValues.add(expr); outputValueColumnNames .add(getColumnInternalName(reduceValues.size() - 1)); String field = Utilities.ReduceField.VALUE.toString() + "." + getColumnInternalName(reduceValues.size() - 1); - reduceSinkOutputRowResolver.putExpression(parameter, new ColumnInfo(field, - reduceValues.get(reduceValues.size() - 1).getTypeInfo(), null, - false)); + ColumnInfo colInfo = new ColumnInfo(field, + reduceValues.get(reduceValues.size() - 1).getTypeInfo(), null, false); + reduceSinkOutputRowResolver.putExpression(parameter, colInfo); + colExprMap.put(colInfo.getInternalName(), expr); } } } @@ -3388,7 +3407,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { colExprMap); List> distinctColIndices = getDistinctColIndicesForReduceSink(parseInfo, dest, - reduceKeys, reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputKeyColumnNames); + reduceKeys, reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputKeyColumnNames, colExprMap); ArrayList reduceValues = new ArrayList(); @@ -3397,7 +3416,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { for (String destination : dests) { getReduceValuesForReduceSinkNoMapAgg(parseInfo, destination, reduceSinkInputRowResolver, - reduceSinkOutputRowResolver, outputValueColumnNames, reduceValues); + reduceSinkOutputRowResolver, outputValueColumnNames, reduceValues, colExprMap); // Need to pass all of the columns used in the where clauses as reduce values ASTNode whereClause = parseInfo.getWhrForClause(destination); @@ -3407,15 +3426,18 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { for (int i = 0; i < columnExprs.size(); i++) { ASTNode parameter = columnExprs.get(i); if (reduceSinkOutputRowResolver.getExpression(parameter) == null) { - reduceValues.add(genExprNodeDesc(parameter, - reduceSinkInputRowResolver)); + ExprNodeDesc expr = genExprNodeDesc(parameter, + reduceSinkInputRowResolver); + reduceValues.add(expr); outputValueColumnNames .add(getColumnInternalName(reduceValues.size() - 1)); String field = Utilities.ReduceField.VALUE.toString() + "." + getColumnInternalName(reduceValues.size() - 1); - reduceSinkOutputRowResolver.putExpression(parameter, new ColumnInfo(field, + ColumnInfo colInfo = new ColumnInfo(field, reduceValues.get(reduceValues.size() - 1).getTypeInfo(), null, - false)); + false); + reduceSinkOutputRowResolver.putExpression(parameter, colInfo); + colExprMap.put(colInfo.getInternalName(), expr); } } } @@ -3528,13 +3550,16 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { ASTNode t = entry.getValue(); TypeInfo typeInfo = reduceSinkInputRowResolver2.getExpression(t) .getType(); - reduceValues.add(new ExprNodeColumnDesc(typeInfo, field, "", false)); + ExprNodeColumnDesc inputExpr = new ExprNodeColumnDesc(typeInfo, field, + "", false); + reduceValues.add(inputExpr); inputField++; String col = getColumnInternalName(reduceValues.size() - 1); outputColumnNames.add(col); - reduceSinkOutputRowResolver2.putExpression(t, new ColumnInfo( - Utilities.ReduceField.VALUE.toString() + "." + col, typeInfo, "", - false)); + ColumnInfo colInfo = new ColumnInfo(Utilities.ReduceField.VALUE.toString() + + "." 
+ col, typeInfo, "", false); + reduceSinkOutputRowResolver2.putExpression(t, colInfo); + colExprMap.put(colInfo.getInternalName(), inputExpr); } ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap( @@ -6444,6 +6469,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { reduceValues.size() - 1).getTypeInfo(), "", false); reduceSinkOutputRowResolver.putExpression(grpbyExpr, colInfo); outputColumnNames.add(getColumnInternalName(reduceValues.size() - 1)); + colExprMap.put(colInfo.getInternalName(), grpByExprNode); } } @@ -6469,6 +6495,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { reduceSinkOutputRowResolver.putExpression(paraExpr, colInfo); outputColumnNames .add(getColumnInternalName(reduceValues.size() - 1)); + colExprMap.put(colInfo.getInternalName(), paraExprNode); } } } @@ -6707,7 +6734,23 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { curr = insertSelectAllPlanForGroupBy(curr); if (conf.getBoolVar(HiveConf.ConfVars.HIVEMAPSIDEAGGREGATE)) { if (!conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) { + Operator rsopInNonMapSidePattern = null; + Operator mapSideGroupBy = null; + if (conf.getBoolVar(HiveConf.ConfVars.HIVEOPTCORRELATION)) { + Operator nonMapSidePattern = genGroupByPlan1MR(dest, qb, curr); + rsopInNonMapSidePattern = (Operator) nonMapSidePattern + .getParentOperators().get(0); + curr.getChildOperators().remove(rsopInNonMapSidePattern); + } curr = genGroupByPlanMapAggr1MR(dest, qb, curr); + mapSideGroupBy = (Operator) ((Operator) curr.getParentOperators().get(0)) + .getParentOperators().get(0); + if (conf.getBoolVar(HiveConf.ConfVars.HIVEOPTCORRELATION)) { + groupbyNonMapSide2MapSide.put((ReduceSinkOperator) rsopInNonMapSidePattern, + (GroupByOperator) mapSideGroupBy); + groupbyMapSide2NonMapSide.put((GroupByOperator) mapSideGroupBy, + (ReduceSinkOperator) rsopInNonMapSidePattern); + } } else { curr = genGroupByPlanMapAggr2MR(dest, qb, curr); } @@ -8149,7 +8192,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { loadTableWork, loadFileWork, ctx, idToTableNameMap, destTableId, uCtx, listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions, opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks, - opToPartToSkewedPruner); + opToPartToSkewedPruner, groupbyNonMapSide2MapSide, groupbyMapSide2NonMapSide); // Generate table access stats if required if (HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_TABLEKEYS) == true) { diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/BaseReduceSinkDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/BaseReduceSinkDesc.java new file mode 100644 index 0000000..75ccee8 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/plan/BaseReduceSinkDesc.java @@ -0,0 +1,193 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import java.util.ArrayList; +import java.util.List; + +/** + * BaseReduceSinkDesc. + * + */ +@Explain(displayName = "Base Reduce Output Operator") +public class BaseReduceSinkDesc extends AbstractOperatorDesc { + private static final long serialVersionUID = 1L; + /** + * Key columns are passed to reducer in the "key". + */ + protected ArrayList keyCols; + protected ArrayList outputKeyColumnNames; + protected List> distinctColumnIndices; + /** + * Value columns are passed to reducer in the "value". + */ + protected ArrayList valueCols; + protected ArrayList outputValueColumnNames; + /** + * Describe how to serialize the key. + */ + protected TableDesc keySerializeInfo; + /** + * Describe how to serialize the value. + */ + protected TableDesc valueSerializeInfo; + + /** + * The tag for this reducesink descriptor. + */ + protected int tag; + + /** + * Number of distribution keys. + */ + protected int numDistributionKeys; + + /** + * The partition columns (CLUSTER BY or DISTRIBUTE BY in Hive language). + * Partition columns decide the reducer that the current row goes to. + * Partition columns are not passed to reducer. + */ + protected ArrayList partitionCols; + + protected int numReducers; + + public BaseReduceSinkDesc() { + } + + public ArrayList getOutputKeyColumnNames() { + return outputKeyColumnNames; + } + + public void setOutputKeyColumnNames( + ArrayList outputKeyColumnNames) { + this.outputKeyColumnNames = outputKeyColumnNames; + } + + public ArrayList getOutputValueColumnNames() { + return outputValueColumnNames; + } + + public void setOutputValueColumnNames( + ArrayList outputValueColumnNames) { + this.outputValueColumnNames = outputValueColumnNames; + } + + @Explain(displayName = "key expressions") + public ArrayList getKeyCols() { + return keyCols; + } + + public void setKeyCols(final ArrayList keyCols) { + this.keyCols = keyCols; + } + + public int getNumDistributionKeys() { + return this.numDistributionKeys; + } + + public void setNumDistributionKeys(int numKeys) { + this.numDistributionKeys = numKeys; + } + + @Explain(displayName = "value expressions") + public ArrayList getValueCols() { + return valueCols; + } + + public void setValueCols(final ArrayList valueCols) { + this.valueCols = valueCols; + } + + @Explain(displayName = "Map-reduce partition columns") + public ArrayList getPartitionCols() { + return partitionCols; + } + + public void setPartitionCols( + final ArrayList partitionCols) { + this.partitionCols = partitionCols; + } + + @Explain(displayName = "tag") + public int getTag() { + return tag; + } + + public void setTag(int tag) { + this.tag = tag; + } + + /** + * Returns the number of reducers for the map-reduce job. -1 means to decide + * the number of reducers at runtime. This enables Hive to estimate the number + * of reducers based on the map-reduce input data size, which is only + * available right before we start the map-reduce job. 
+ */ + public int getNumReducers() { + return numReducers; + } + + public void setNumReducers(int numReducers) { + this.numReducers = numReducers; + } + + public TableDesc getKeySerializeInfo() { + return keySerializeInfo; + } + + public void setKeySerializeInfo(TableDesc keySerializeInfo) { + this.keySerializeInfo = keySerializeInfo; + } + + public TableDesc getValueSerializeInfo() { + return valueSerializeInfo; + } + + public void setValueSerializeInfo(TableDesc valueSerializeInfo) { + this.valueSerializeInfo = valueSerializeInfo; + } + + /** + * Returns the sort order of the key columns. + * + * @return null, which means ascending order for all key columns, or a String + * of the same length as key columns, that consists of only "+" + * (ascending order) and "-" (descending order). + */ + @Explain(displayName = "sort order") + public String getOrder() { + return keySerializeInfo.getProperties().getProperty( + org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_SORT_ORDER); + } + + public void setOrder(String orderStr) { + keySerializeInfo.getProperties().setProperty( + org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_SORT_ORDER, + orderStr); + } + + public List> getDistinctColumnIndices() { + return distinctColumnIndices; + } + + public void setDistinctColumnIndices( + List> distinctColumnIndices) { + this.distinctColumnIndices = distinctColumnIndices; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/CorrelationCompositeDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/CorrelationCompositeDesc.java new file mode 100644 index 0000000..1ae438f --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/plan/CorrelationCompositeDesc.java @@ -0,0 +1,62 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; + + +/** + * Correlation composite operator Descriptor implementation. 
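+ * A correlation composite operator merges the per-table operation paths (forward or filter
+ * chains) built over a shared table scan; it records the operation path tags it covers and
+ * the correlation reduce sink operator that consumes its output.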
+ * + */ +@Explain(displayName = "Correlation Composite Operator") +public class CorrelationCompositeDesc extends AbstractOperatorDesc { + + private static final long serialVersionUID = 1L; + + private ReduceSinkOperator correspondingReduceSinkOperator; + + public CorrelationCompositeDesc() { + + } + + public CorrelationCompositeDesc(ReduceSinkOperator correspondingReduceSinkOperator) { + this.correspondingReduceSinkOperator = correspondingReduceSinkOperator; + } + + public void setCorrespondingReduceSinkOperator( + ReduceSinkOperator correspondingReduceSinkOperator) { + this.correspondingReduceSinkOperator = correspondingReduceSinkOperator; + } + + public ReduceSinkOperator getCorrespondingReduceSinkOperator() { + return correspondingReduceSinkOperator; + } + + private int[] allOperationPathTags; + + public void setAllOperationPathTags(int[] allOperationPathTags) { + this.allOperationPathTags = allOperationPathTags; + } + + public int[] getAllOperationPathTags() { + return allOperationPathTags; + } + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/CorrelationLocalSimulativeReduceSinkDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/CorrelationLocalSimulativeReduceSinkDesc.java new file mode 100644 index 0000000..80629af --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/plan/CorrelationLocalSimulativeReduceSinkDesc.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +/** + * CorrelationLocalSimulativeReduceSinkDesc. + * + */ +@Explain(displayName = "Correlation Local Simulative Reduce Output Operator") +public class CorrelationLocalSimulativeReduceSinkDesc extends BaseReduceSinkDesc { + private static final long serialVersionUID = 1L; + + public CorrelationLocalSimulativeReduceSinkDesc() { + } + + // A CorrelationLocalSimulativeReduceSinkDesc is only generated from a corresponding + // ReduceSinkDesc. 
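+  // It copies the key, value, partition and serialization settings of that ReduceSinkDesc, so
+  // the operator can reproduce the same grouping inside the shared correlation reducer without
+  // starting another MapReduce shuffle.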
+ public CorrelationLocalSimulativeReduceSinkDesc(ReduceSinkDesc reduceSinkDesc){ + this.keyCols = reduceSinkDesc.getKeyCols(); + this.numDistributionKeys = reduceSinkDesc.getNumDistributionKeys(); + this.valueCols = reduceSinkDesc.getValueCols(); + this.outputKeyColumnNames = reduceSinkDesc.getOutputKeyColumnNames(); + this.outputValueColumnNames = reduceSinkDesc.getOutputValueColumnNames(); + this.tag = reduceSinkDesc.getTag(); + this.numReducers = reduceSinkDesc.getNumReducers(); + this.partitionCols = reduceSinkDesc.getPartitionCols(); + this.keySerializeInfo = reduceSinkDesc.getKeySerializeInfo(); + this.valueSerializeInfo = reduceSinkDesc.getValueSerializeInfo(); + this.distinctColumnIndices = reduceSinkDesc.getDistinctColumnIndices(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/CorrelationReducerDispatchDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/CorrelationReducerDispatchDesc.java new file mode 100644 index 0000000..a72e73d --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/plan/CorrelationReducerDispatchDesc.java @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + + +/** + * Correlation dispatch operator Descriptor implementation. 
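+ * For each operation path tag, dispatchConf maps a child operator index to the output tags to
+ * emit, and dispatchKeySelectDescConf / dispatchValueSelectDescConf hold the SelectDesc
+ * projections used to rebuild that child's key and value columns.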
+ * + */ +@Explain(displayName = "Correlation Dispatch Operator") +public class CorrelationReducerDispatchDesc extends AbstractOperatorDesc { + + private static final long serialVersionUID = 1L; + + private Map>> dispatchConf; + private Map>> dispatchValueSelectDescConf; + private Map>> dispatchKeySelectDescConf; + + public CorrelationReducerDispatchDesc(){ + this.dispatchConf = new HashMap>>(); + this.dispatchValueSelectDescConf = new HashMap>>(); + this.dispatchKeySelectDescConf = new HashMap>>(); + + } + + public CorrelationReducerDispatchDesc(Map>> dispatchConf){ + this.dispatchConf = dispatchConf; + this.dispatchValueSelectDescConf = new HashMap>>(); + this.dispatchKeySelectDescConf = new HashMap>>(); + for(Entry>> entry: this.dispatchConf.entrySet()){ + HashMap> tmp = new HashMap>(); + for(Integer child: entry.getValue().keySet()){ + tmp.put(child, new ArrayList()); + tmp.get(child).add(new SelectDesc(true)); + } + this.dispatchValueSelectDescConf.put(entry.getKey(), tmp); + this.dispatchKeySelectDescConf.put(entry.getKey(), tmp); + } + } + + public CorrelationReducerDispatchDesc(Map>> dispatchConf, + Map>> dispatchKeySelectDescConf, + Map>> dispatchValueSelectDescConf){ + this.dispatchConf = dispatchConf; + this.dispatchValueSelectDescConf = dispatchValueSelectDescConf; + this.dispatchKeySelectDescConf = dispatchKeySelectDescConf; + } + + public void setDispatchConf(Map>> dispatchConf){ + this.dispatchConf = dispatchConf; + } + + public Map>> getDispatchConf(){ + return this.dispatchConf; + } + + public void setDispatchValueSelectDescConf(Map>> dispatchValueSelectDescConf){ + this.dispatchValueSelectDescConf = dispatchValueSelectDescConf; + } + + public Map>> getDispatchValueSelectDescConf(){ + return this.dispatchValueSelectDescConf; + } + + public void setDispatchKeySelectDescConf(Map>> dispatchKeySelectDescConf){ + this.dispatchKeySelectDescConf = dispatchKeySelectDescConf; + } + + public Map>> getDispatchKeySelectDescConf() { + return this.dispatchKeySelectDescConf; + } + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java index 5a9f064..1dbb368 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java @@ -74,6 +74,7 @@ public class MapredWork extends AbstractOperatorDesc { private Long minSplitSizePerRack; private boolean needsTagging; + private boolean needsOperationPathTagging; private boolean hadoopSupportsSplittable; private MapredLocalWork mapLocalWork; @@ -388,6 +389,16 @@ public class MapredWork extends AbstractOperatorDesc { this.needsTagging = needsTagging; } + //TODO: enable the annotation shown below + // @Explain(displayName = "Needs Operation Paths Tagging", normalExplain = false) + public boolean getNeedsOperationPathTagging() { + return needsOperationPathTagging; + } + + public void setNeedsOperationPathTagging(boolean needsOperationPathTagging) { + this.needsOperationPathTagging = needsOperationPathTagging; + } + public boolean getHadoopSupportsSplittable() { return hadoopSupportsSplittable; } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java index b33d616..42237af 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java @@ -27,58 +27,44 @@ import java.util.List; * */ @Explain(displayName = "Reduce Output Operator") -public class 
ReduceSinkDesc extends AbstractOperatorDesc { +public class ReduceSinkDesc extends BaseReduceSinkDesc { private static final long serialVersionUID = 1L; - /** - * Key columns are passed to reducer in the "key". - */ - private java.util.ArrayList keyCols; - private java.util.ArrayList outputKeyColumnNames; - private List> distinctColumnIndices; - /** - * Value columns are passed to reducer in the "value". - */ - private java.util.ArrayList valueCols; - private java.util.ArrayList outputValueColumnNames; - /** - * Describe how to serialize the key. - */ - private TableDesc keySerializeInfo; - /** - * Describe how to serialize the value. - */ - private TableDesc valueSerializeInfo; - /** - * The tag for this reducesink descriptor. - */ - private int tag; + private boolean needsOperationPathTagging; - /** - * Number of distribution keys. - */ - private int numDistributionKeys; - - /** - * The partition columns (CLUSTER BY or DISTRIBUTE BY in Hive language). - * Partition columns decide the reducer that the current row goes to. - * Partition columns are not passed to reducer. - */ - private java.util.ArrayList partitionCols; + public boolean getNeedsOperationPathTagging() { + return needsOperationPathTagging; + } - private int numReducers; + public void setNeedsOperationPathTagging(boolean isOperationPathTagged) { + this.needsOperationPathTagging = isOperationPathTagged; + } public ReduceSinkDesc() { } - public ReduceSinkDesc(java.util.ArrayList keyCols, + public ReduceSinkDesc(ArrayList keyCols, + int numDistributionKeys, + ArrayList valueCols, + ArrayList outputKeyColumnNames, + List> distinctColumnIndices, + ArrayList outputValueColumnNames, int tag, + ArrayList partitionCols, int numReducers, + final TableDesc keySerializeInfo, final TableDesc valueSerializeInfo) { + this(keyCols, numDistributionKeys, valueCols, + outputKeyColumnNames, distinctColumnIndices, outputValueColumnNames, tag, + partitionCols, numReducers, keySerializeInfo, valueSerializeInfo, false); + } + + public ReduceSinkDesc(ArrayList keyCols, int numDistributionKeys, - java.util.ArrayList valueCols, - java.util.ArrayList outputKeyColumnNames, + ArrayList valueCols, + ArrayList outputKeyColumnNames, List> distinctColumnIndices, - java.util.ArrayList outputValueColumnNames, int tag, - java.util.ArrayList partitionCols, int numReducers, - final TableDesc keySerializeInfo, final TableDesc valueSerializeInfo) { + ArrayList outputValueColumnNames, int tag, + ArrayList partitionCols, int numReducers, + final TableDesc keySerializeInfo, final TableDesc valueSerializeInfo, + boolean needsOperationPathTagging) { this.keyCols = keyCols; this.numDistributionKeys = numDistributionKeys; this.valueCols = valueCols; @@ -90,6 +76,7 @@ public class ReduceSinkDesc extends AbstractOperatorDesc { this.keySerializeInfo = keySerializeInfo; this.valueSerializeInfo = valueSerializeInfo; this.distinctColumnIndices = distinctColumnIndices; + this.needsOperationPathTagging = needsOperationPathTagging; } @Override @@ -112,127 +99,7 @@ public class ReduceSinkDesc extends AbstractOperatorDesc { desc.setPartitionCols((ArrayList) getPartitionCols().clone()); desc.setKeySerializeInfo((TableDesc) getKeySerializeInfo().clone()); desc.setValueSerializeInfo((TableDesc) getValueSerializeInfo().clone()); + desc.setNeedsOperationPathTagging(needsOperationPathTagging); return desc; } - - public java.util.ArrayList getOutputKeyColumnNames() { - return outputKeyColumnNames; - } - - public void setOutputKeyColumnNames( - java.util.ArrayList outputKeyColumnNames) { - 
this.outputKeyColumnNames = outputKeyColumnNames; - } - - public java.util.ArrayList getOutputValueColumnNames() { - return outputValueColumnNames; - } - - public void setOutputValueColumnNames( - java.util.ArrayList outputValueColumnNames) { - this.outputValueColumnNames = outputValueColumnNames; - } - - @Explain(displayName = "key expressions") - public java.util.ArrayList getKeyCols() { - return keyCols; - } - - public void setKeyCols(final java.util.ArrayList keyCols) { - this.keyCols = keyCols; - } - - public int getNumDistributionKeys() { - return this.numDistributionKeys; - } - - public void setNumDistributionKeys(int numKeys) { - this.numDistributionKeys = numKeys; - } - - @Explain(displayName = "value expressions") - public java.util.ArrayList getValueCols() { - return valueCols; - } - - public void setValueCols(final java.util.ArrayList valueCols) { - this.valueCols = valueCols; - } - - @Explain(displayName = "Map-reduce partition columns") - public java.util.ArrayList getPartitionCols() { - return partitionCols; - } - - public void setPartitionCols( - final java.util.ArrayList partitionCols) { - this.partitionCols = partitionCols; - } - - @Explain(displayName = "tag") - public int getTag() { - return tag; - } - - public void setTag(int tag) { - this.tag = tag; - } - - /** - * Returns the number of reducers for the map-reduce job. -1 means to decide - * the number of reducers at runtime. This enables Hive to estimate the number - * of reducers based on the map-reduce input data size, which is only - * available right before we start the map-reduce job. - */ - public int getNumReducers() { - return numReducers; - } - - public void setNumReducers(int numReducers) { - this.numReducers = numReducers; - } - - public TableDesc getKeySerializeInfo() { - return keySerializeInfo; - } - - public void setKeySerializeInfo(TableDesc keySerializeInfo) { - this.keySerializeInfo = keySerializeInfo; - } - - public TableDesc getValueSerializeInfo() { - return valueSerializeInfo; - } - - public void setValueSerializeInfo(TableDesc valueSerializeInfo) { - this.valueSerializeInfo = valueSerializeInfo; - } - - /** - * Returns the sort order of the key columns. - * - * @return null, which means ascending order for all key columns, or a String - * of the same length as key columns, that consists of only "+" - * (ascending order) and "-" (descending order). 
- */ - @Explain(displayName = "sort order") - public String getOrder() { - return keySerializeInfo.getProperties().getProperty( - org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_SORT_ORDER); - } - - public void setOrder(String orderStr) { - keySerializeInfo.getProperties().setProperty( - org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_SORT_ORDER, - orderStr); - } - - public List> getDistinctColumnIndices() { - return distinctColumnIndices; - } - - public void setDistinctColumnIndices( - List> distinctColumnIndices) { - this.distinctColumnIndices = distinctColumnIndices; - } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java index 9a95efd..0289805 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java @@ -50,6 +50,8 @@ public class TableScanDesc extends AbstractOperatorDesc { private boolean gatherStats; private boolean statsReliable; + private boolean forwardRowNumber = false; + private ExprNodeDesc filterExpr; public static final String FILTER_EXPR_CONF_STR = @@ -103,6 +105,14 @@ public class TableScanDesc extends AbstractOperatorDesc { return partColumns; } + public boolean isForwardRowNumber() { + return forwardRowNumber; + } + + public void setForwardRowNumber(boolean forwardRowNumber) { + this.forwardRowNumber = forwardRowNumber; + } + public void setGatherStats(boolean gatherStats) { this.gatherStats = gatherStats; } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/TestExecDriver.java ql/src/test/org/apache/hadoop/hive/ql/exec/TestExecDriver.java index 6f8bc47..ffbe655 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/TestExecDriver.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/TestExecDriver.java @@ -277,6 +277,7 @@ public class TestExecDriver extends TestCase { private void populateMapRedPlan3(Table src, Table src2) throws SemanticException { mr.setNumReduceTasks(Integer.valueOf(5)); mr.setNeedsTagging(true); + mr.setNeedsOperationPathTagging(false); ArrayList outputColumns = new ArrayList(); for (int i = 0; i < 2; i++) { outputColumns.add("_col" + i); diff --git ql/src/test/queries/clientpositive/correlationoptimizer1.q ql/src/test/queries/clientpositive/correlationoptimizer1.q new file mode 100644 index 0000000..13ffe3e --- /dev/null +++ ql/src/test/queries/clientpositive/correlationoptimizer1.q @@ -0,0 +1,38 @@ +-- the query is from auto_join26.q + +CREATE TABLE dest_co1(key INT, cnt INT); +CREATE TABLE dest_co2(key INT, cnt INT); + +set hive.optimize.correlation=false; +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key; + +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key; + +set hive.optimize.correlation=true; +EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key; + +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key; + +-- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key; +SELECT * FROM dest_co2 x ORDER BY x.key; + +-- enable hive.auto.convert.join. 
The query should not be optimized by correlation optimizer +CREATE TABLE dest_j1(key INT, cnt INT); +set hive.optimize.correlation=true; +set hive.auto.convert.join = true; + +EXPLAIN +INSERT OVERWRITE TABLE dest_j1 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key; + +INSERT OVERWRITE TABLE dest_j1 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key; + +select * from dest_j1 x order by x.key; diff --git ql/src/test/queries/clientpositive/correlationoptimizer2.q ql/src/test/queries/clientpositive/correlationoptimizer2.q new file mode 100644 index 0000000..98532d4 --- /dev/null +++ ql/src/test/queries/clientpositive/correlationoptimizer2.q @@ -0,0 +1,44 @@ +-- the query is modified from join18.q + +CREATE TABLE dest_co1(key1 INT, cnt1 INT, key2 INT, cnt2 INT); +CREATE TABLE dest_co2(key1 INT, cnt1 INT, key2 INT, cnt2 INT); + +set hive.optimize.correlation=false; +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT a.key, a.cnt, b.key, b.cnt +FROM +(SELECT x.key as key, count(x.value) AS cnt FROM src x group by x.key) a +JOIN +(SELECT y.key as key, count(y.value) AS cnt FROM src1 y group by y.key) b +ON (a.key = b.key); + +INSERT OVERWRITE TABLE dest_co1 +SELECT a.key, a.cnt, b.key, b.cnt +FROM +(SELECT x.key as key, count(x.value) AS cnt FROM src x group by x.key) a +JOIN +(SELECT y.key as key, count(y.value) AS cnt FROM src1 y group by y.key) b +ON (a.key = b.key); + +set hive.optimize.correlation=true; +EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT a.key, a.cnt, b.key, b.cnt +FROM +(SELECT x.key as key, count(x.value) AS cnt FROM src x group by x.key) a +JOIN +(SELECT y.key as key, count(y.value) AS cnt FROM src1 y group by y.key) b +ON (a.key = b.key); + +INSERT OVERWRITE TABLE dest_co2 +SELECT a.key, a.cnt, b.key, b.cnt +FROM +(SELECT x.key as key, count(x.value) AS cnt FROM src x group by x.key) a +JOIN +(SELECT y.key as key, count(y.value) AS cnt FROM src1 y group by y.key) b +ON (a.key = b.key); + +-- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key1, x.key2, x.cnt1, x.cnt2; +SELECT * FROM dest_co2 x ORDER BY x.key1, x.key2, x.cnt1, x.cnt2; \ No newline at end of file diff --git ql/src/test/queries/clientpositive/correlationoptimizer3.q ql/src/test/queries/clientpositive/correlationoptimizer3.q new file mode 100644 index 0000000..37e0e7d --- /dev/null +++ ql/src/test/queries/clientpositive/correlationoptimizer3.q @@ -0,0 +1,43 @@ +CREATE TABLE dest_co1(key INT, cnt INT, value STRING); +CREATE TABLE dest_co2(key INT, cnt INT, value STRING); + + +set hive.optimize.correlation=false; +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT b.key, b.cnt, d.value +FROM +(SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b +JOIN +(SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d +ON b.key = d.key; + +INSERT OVERWRITE TABLE dest_co1 +SELECT b.key, b.cnt, d.value +FROM +(SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b +JOIN +(SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d +ON b.key = d.key; + +set hive.optimize.correlation=true; +EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT b.key, b.cnt, d.value +FROM +(SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b +JOIN +(SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d +ON b.key = d.key; + +INSERT OVERWRITE TABLE dest_co2 +SELECT b.key, b.cnt, d.value +FROM +(SELECT x.key, count(1) AS cnt FROM 
src1 x JOIN src y ON (x.key = y.key) group by x.key) b +JOIN +(SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d +ON b.key = d.key; + +-- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt, x.value; +SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt, x.value; \ No newline at end of file diff --git ql/src/test/queries/clientpositive/correlationoptimizer4.q ql/src/test/queries/clientpositive/correlationoptimizer4.q new file mode 100644 index 0000000..d473f4a --- /dev/null +++ ql/src/test/queries/clientpositive/correlationoptimizer4.q @@ -0,0 +1,112 @@ +CREATE TABLE T1(key INT, val STRING); +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; +CREATE TABLE T2(key INT, val STRING); +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +CREATE TABLE dest_co1(key INT, cnt INT); +CREATE TABLE dest_co2(key INT, cnt INT); + +set hive.optimize.correlation=false; +-- INNER JOIN should be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) GROUP BY y.key; +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) GROUP BY y.key; +set hive.optimize.correlation=true; +EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) GROUP BY y.key; +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) GROUP BY y.key; +-- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt; +SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt; + +set hive.optimize.correlation=false; +-- The case that GROUP BY key is from the left table of LEFT OUTER JOIN should be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key; +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key; +set hive.optimize.correlation=true; +EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key; +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key; +-- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt; +SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt; + + +set hive.optimize.correlation=false; +-- The case that GROUP BY key is from the right table of RIGHT OUTER JOIN should be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key; +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key; +set hive.optimize.correlation=true; +EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key; +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key; +-- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt; +SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt; + + +set hive.optimize.correlation=false; +-- The case that GROUP BY key is from the right table of LEFT OUTER JOIN should not be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key; +INSERT OVERWRITE TABLE 
dest_co1 +SELECT y.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key; +set hive.optimize.correlation=true; +EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key; +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key; +-- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt; +SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt; + + +set hive.optimize.correlation=false; +-- The case that GROUP BY key is from the left table of RIGHT OUTER JOIN should not be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key; +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key; +set hive.optimize.correlation=true; +EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key; +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key; +-- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt; +SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt; + +set hive.optimize.correlation=false; +-- FULL OUTER JOIN will not be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x FULL OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key; +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x FULL OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key; +set hive.optimize.correlation=true; +EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x FULL OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key; +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x FULL OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key; +-- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt; +SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt; diff --git ql/src/test/queries/clientpositive/correlationoptimizer5.q ql/src/test/queries/clientpositive/correlationoptimizer5.q new file mode 100644 index 0000000..d625dda --- /dev/null +++ ql/src/test/queries/clientpositive/correlationoptimizer5.q @@ -0,0 +1,74 @@ +CREATE TABLE T1(key INT, val STRING); +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; +CREATE TABLE T2(key INT, val STRING); +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; +CREATE TABLE T3(key INT, val STRING); +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T3; + +CREATE TABLE dest_co1(key INT, cnt INT); +CREATE TABLE dest_co2(key INT, cnt INT); + +set hive.optimize.correlation=false; +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) JOIN T3 z ON (y.key = z.key) GROUP BY y.key; +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) JOIN T3 z ON (y.key = z.key) GROUP BY y.key; +set hive.optimize.correlation=true; +EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) JOIN T3 z ON (y.key = z.key) GROUP BY y.key; +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) JOIN T3 z ON (y.key = z.key) GROUP BY y.key; +-- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt; +SELECT * FROM 
dest_co2 x ORDER BY x.key, x.cnt; + +set hive.optimize.correlation=false; +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) LEFT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY x.key; +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) LEFT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY x.key; +set hive.optimize.correlation=true; +EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) LEFT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY x.key; +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) LEFT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY x.key; +-- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt; +SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt; + +set hive.optimize.correlation=false; +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT z.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) RIGHT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY z.key; +INSERT OVERWRITE TABLE dest_co1 +SELECT z.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) RIGHT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY z.key; +set hive.optimize.correlation=true; +EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT z.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) RIGHT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY z.key; +INSERT OVERWRITE TABLE dest_co2 +SELECT z.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) RIGHT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY z.key; +-- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt; +SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt; + +set hive.optimize.correlation=false; +-- FULL OUTER JOIN will not be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) FULL OUTER JOIN T3 z ON (y.key = z.key) GROUP BY y.key; +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) FULL OUTER JOIN T3 z ON (y.key = z.key) GROUP BY y.key; +set hive.optimize.correlation=true; +EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) FULL OUTER JOIN T3 z ON (y.key = z.key) GROUP BY y.key; +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) FULL OUTER JOIN T3 z ON (y.key = z.key) GROUP BY y.key; +-- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt; +SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt; diff --git ql/src/test/results/clientpositive/correlationoptimizer1.q.out ql/src/test/results/clientpositive/correlationoptimizer1.q.out new file mode 100644 index 0000000..7b118cf --- /dev/null +++ ql/src/test/results/clientpositive/correlationoptimizer1.q.out @@ -0,0 +1,710 @@ +PREHOOK: query: -- the query is from auto_join26.q + +CREATE TABLE dest_co1(key INT, cnt INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- the query is from auto_join26.q + +CREATE TABLE dest_co1(key INT, cnt INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest_co1 +PREHOOK: query: CREATE TABLE dest_co2(key INT, cnt INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest_co2(key INT, cnt INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest_co2 +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) 
FROM src1 x JOIN src y ON (x.key = y.key) group by x.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src1) x) (TOK_TABREF (TOK_TABNAME src) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +PREHOOK: Output: 
default@dest_co1 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +POSTHOOK: Output: default@dest_co1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src1) x) (TOK_TABREF (TOK_TABNAME src) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co2))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + y + TableScan + alias: y + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Correlation Dispatch Operator + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Correlation Local Simulative Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: 1 + type: int + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: complete + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +PREHOOK: Output: default@dest_co2 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +POSTHOOK: Output: default@dest_co2 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co2.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co2.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +66 1 +98 2 +128 3 +146 2 +150 1 +213 2 +224 2 +238 2 +255 2 +273 3 +278 2 +311 3 +369 3 +401 5 +406 4 +PREHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co2.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +66 1 +98 2 +128 3 +146 2 +150 1 +213 2 +224 2 +238 2 +255 2 +273 3 +278 2 +311 3 +369 3 +401 5 +406 4 +PREHOOK: query: -- enable hive.auto.convert.join. The query should not be optimized by correlation optimizer +CREATE TABLE dest_j1(key INT, cnt INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- enable hive.auto.convert.join. 
The query should not be optimized by correlation optimizer +CREATE TABLE dest_j1(key INT, cnt INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest_j1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co2.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_j1 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_j1 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co2.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src1) x) (TOK_TABREF (TOK_TABNAME src) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_j1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key)))) + +STAGE DEPENDENCIES: + Stage-7 is a root stage , consists of Stage-8, Stage-9, Stage-1 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-2 depends on stages: Stage-1, Stage-5, Stage-6 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-9 has a backup stage: Stage-1 + Stage-6 depends on stages: Stage-9 + Stage-1 + +STAGE PLANS: + Stage: Stage-7 + Conditional Operator + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + y + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + y + TableScan + alias: y + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: 
count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_j1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_j1 + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-9 + Map Reduce Local Work + Alias -> Map Local Tables: + x + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + x + TableScan + alias: x + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-6 + Map Reduce + Alias -> Map Operator Tree: + y + TableScan + alias: y + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 1 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_j1 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +PREHOOK: Output: default@dest_j1 +POSTHOOK: query: INSERT OVERWRITE TABLE 
dest_j1 +SELECT x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +POSTHOOK: Output: default@dest_j1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co2.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_j1.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_j1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: select * from dest_j1 x order by x.key +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_j1 +#### A masked pattern was here #### +POSTHOOK: query: select * from dest_j1 x order by x.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_j1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co2.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_j1.cnt EXPRESSION [(src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_j1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +66 1 +98 2 +128 3 +146 2 +150 1 +213 2 +224 2 +238 2 +255 2 +273 3 +278 2 +311 3 +369 3 +401 5 +406 4 diff --git ql/src/test/results/clientpositive/correlationoptimizer2.q.out ql/src/test/results/clientpositive/correlationoptimizer2.q.out new file mode 100644 index 0000000..0b37632 --- /dev/null +++ ql/src/test/results/clientpositive/correlationoptimizer2.q.out @@ -0,0 +1,601 @@ +PREHOOK: query: -- the query is modified from join18.q + +CREATE TABLE dest_co1(key1 INT, cnt1 INT, key2 INT, cnt2 INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- the query is modified from join18.q + +CREATE TABLE dest_co1(key1 INT, cnt1 INT, key2 INT, cnt2 INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest_co1 +PREHOOK: query: CREATE TABLE dest_co2(key1 INT, cnt1 INT, key2 INT, cnt2 INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest_co2(key1 INT, cnt1 INT, key2 INT, cnt2 INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest_co2 +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT a.key, a.cnt, b.key, b.cnt +FROM +(SELECT x.key as key, count(x.value) AS cnt FROM src x group by x.key) a +JOIN +(SELECT y.key as key, count(y.value) AS cnt FROM src1 y group by y.key) b +ON (a.key = b.key) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT a.key, a.cnt, b.key, b.cnt +FROM +(SELECT x.key as key, count(x.value) AS cnt FROM src x group by x.key) a +JOIN +(SELECT y.key as key, count(y.value) AS cnt FROM src1 y group by y.key) b +ON (a.key = b.key) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src) x)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key) key) (TOK_SELEXPR (TOK_FUNCTION count (. (TOK_TABLE_OR_COL x) value)) cnt)) (TOK_GROUPBY (. 
(TOK_TABLE_OR_COL x) key)))) a) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src1) y)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) key) key) (TOK_SELEXPR (TOK_FUNCTION count (. (TOK_TABLE_OR_COL y) value)) cnt)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL y) key)))) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) cnt)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) cnt))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:x + TableScan + alias: x + Select Operator + expressions: + expr: key + type: string + expr: value + type: string + outputColumnNames: key, value + Group By Operator + aggregations: + expr: count(value) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + $INTNAME1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: UDFToInteger(_col1) + type: int + expr: UDFToInteger(_col2) + type: int + expr: UDFToInteger(_col3) + type: int + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-0 + Move Operator + tables: + 
replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + b:y + TableScan + alias: y + Select Operator + expressions: + expr: key + type: string + expr: value + type: string + outputColumnNames: key, value + Group By Operator + aggregations: + expr: count(value) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT a.key, a.cnt, b.key, b.cnt +FROM +(SELECT x.key as key, count(x.value) AS cnt FROM src x group by x.key) a +JOIN +(SELECT y.key as key, count(y.value) AS cnt FROM src1 y group by y.key) b +ON (a.key = b.key) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +PREHOOK: Output: default@dest_co1 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT a.key, a.cnt, b.key, b.cnt +FROM +(SELECT x.key as key, count(x.value) AS cnt FROM src x group by x.key) a +JOIN +(SELECT y.key as key, count(y.value) AS cnt FROM src1 y group by y.key) b +ON (a.key = b.key) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +POSTHOOK: Output: default@dest_co1 +POSTHOOK: Lineage: dest_co1.cnt1 EXPRESSION [(src)x.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.cnt2 EXPRESSION [(src1)y.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.key1 EXPRESSION [(src)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.key2 EXPRESSION [(src1)y.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT a.key, a.cnt, b.key, b.cnt +FROM +(SELECT x.key as key, count(x.value) AS cnt FROM src x group by x.key) a +JOIN +(SELECT y.key as key, count(y.value) AS cnt FROM src1 y group by y.key) b +ON (a.key = b.key) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT a.key, a.cnt, b.key, b.cnt +FROM +(SELECT x.key as key, count(x.value) AS cnt FROM src x group by x.key) a +JOIN +(SELECT y.key as key, count(y.value) AS cnt FROM src1 y group by y.key) b +ON (a.key = b.key) +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt1 EXPRESSION [(src)x.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.cnt2 EXPRESSION [(src1)y.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.key1 EXPRESSION [(src)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: 
Lineage: dest_co1.key2 EXPRESSION [(src1)y.FieldSchema(name:key, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src) x)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key) key) (TOK_SELEXPR (TOK_FUNCTION count (. (TOK_TABLE_OR_COL x) value)) cnt)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key)))) a) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src1) y)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) key) key) (TOK_SELEXPR (TOK_FUNCTION count (. (TOK_TABLE_OR_COL y) value)) cnt)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL y) key)))) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co2))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) cnt)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) cnt))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:x + TableScan + alias: x + Select Operator + expressions: + expr: value + type: string + expr: key + type: string + outputColumnNames: value, key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: value + type: string + b:y + TableScan + alias: y + Select Operator + expressions: + expr: value + type: string + expr: key + type: string + outputColumnNames: value, key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: value + type: string + Reduce Operator Tree: + Correlation Dispatch Operator + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: complete + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Correlation Local Simulative Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: UDFToInteger(_col1) + type: int + expr: UDFToInteger(_col2) + type: int + expr: UDFToInteger(_col3) + type: int + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: complete + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Correlation Local Simulative Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: UDFToInteger(_col1) + type: int + expr: UDFToInteger(_col2) + type: int + expr: UDFToInteger(_col3) + type: int + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT a.key, a.cnt, b.key, b.cnt +FROM +(SELECT x.key as key, count(x.value) AS cnt FROM src x group by x.key) a +JOIN +(SELECT y.key as key, count(y.value) AS cnt FROM src1 y group by y.key) b +ON (a.key = b.key) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +PREHOOK: Output: default@dest_co2 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT a.key, a.cnt, b.key, b.cnt +FROM +(SELECT x.key as key, count(x.value) AS cnt FROM src x group by x.key) a +JOIN +(SELECT y.key as key, count(y.value) AS cnt FROM src1 y group by y.key) b +ON (a.key = b.key) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +POSTHOOK: Output: default@dest_co2 +POSTHOOK: Lineage: dest_co1.cnt1 EXPRESSION [(src)x.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.cnt2 EXPRESSION [(src1)y.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.key1 EXPRESSION [(src)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.key2 EXPRESSION [(src1)y.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt1 EXPRESSION [(src)x.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt2 EXPRESSION [(src1)y.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.key1 EXPRESSION [(src)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.key2 EXPRESSION [(src1)y.FieldSchema(name:key, type:string, 
comment:default), ] +PREHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key1, x.key2, x.cnt1, x.cnt2 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key1, x.key2, x.cnt1, x.cnt2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt1 EXPRESSION [(src)x.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.cnt2 EXPRESSION [(src1)y.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.key1 EXPRESSION [(src)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.key2 EXPRESSION [(src1)y.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt1 EXPRESSION [(src)x.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt2 EXPRESSION [(src1)y.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.key1 EXPRESSION [(src)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.key2 EXPRESSION [(src1)y.FieldSchema(name:key, type:string, comment:default), ] +66 1 66 1 +98 2 98 1 +128 3 128 1 +146 2 146 1 +150 1 150 1 +213 2 213 1 +224 2 224 1 +238 2 238 1 +255 2 255 1 +273 3 273 1 +278 2 278 1 +311 3 311 1 +369 3 369 1 +401 5 401 1 +406 4 406 1 +PREHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key1, x.key2, x.cnt1, x.cnt2 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key1, x.key2, x.cnt1, x.cnt2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt1 EXPRESSION [(src)x.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.cnt2 EXPRESSION [(src1)y.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.key1 EXPRESSION [(src)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.key2 EXPRESSION [(src1)y.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt1 EXPRESSION [(src)x.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt2 EXPRESSION [(src1)y.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.key1 EXPRESSION [(src)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.key2 EXPRESSION [(src1)y.FieldSchema(name:key, type:string, comment:default), ] +66 1 66 1 +98 2 98 1 +128 3 128 1 +146 2 146 1 +150 1 150 1 +213 2 213 1 +224 2 224 1 +238 2 238 1 +255 2 255 1 +273 3 273 1 +278 2 278 1 +311 3 311 1 +369 3 369 1 +401 5 401 1 +406 4 406 1 diff --git ql/src/test/results/clientpositive/correlationoptimizer3.q.out ql/src/test/results/clientpositive/correlationoptimizer3.q.out new file mode 100644 index 0000000..81e7b2d --- /dev/null +++ ql/src/test/results/clientpositive/correlationoptimizer3.q.out @@ -0,0 +1,705 @@ +PREHOOK: query: CREATE TABLE dest_co1(key INT, cnt INT, value STRING) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest_co1(key INT, cnt INT, value STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest_co1 +PREHOOK: query: CREATE TABLE dest_co2(key INT, cnt INT, value STRING) +PREHOOK: type: 
CREATETABLE +POSTHOOK: query: CREATE TABLE dest_co2(key INT, cnt INT, value STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest_co2 +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT b.key, b.cnt, d.value +FROM +(SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b +JOIN +(SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d +ON b.key = d.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT b.key, b.cnt, d.value +FROM +(SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b +JOIN +(SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d +ON b.key = d.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src1) x) (TOK_TABREF (TOK_TABNAME src) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key)))) b) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src1) x) (TOK_TABREF (TOK_TABNAME src) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) value))))) d) (= (. (TOK_TABLE_OR_COL b) key) (. (TOK_TABLE_OR_COL d) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) cnt)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL d) value))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-6 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-5 is a root stage + Stage-6 depends on stages: Stage-5 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + d:x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: value + type: string + d:y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 + handleSkewJoin: false + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + value expressions: + expr: _col1 + type: string + $INTNAME1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + 
expr: _col0 + type: string + expr: _col1 + type: bigint + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col3 + type: string + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: UDFToInteger(_col1) + type: int + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b:x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + b:y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-6 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT b.key, b.cnt, d.value +FROM +(SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b +JOIN +(SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d +ON b.key = d.key +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 
+PREHOOK: Output: default@dest_co1 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT b.key, b.cnt, d.value +FROM +(SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b +JOIN +(SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d +ON b.key = d.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +POSTHOOK: Output: default@dest_co1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.value SIMPLE [(src1)x.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT b.key, b.cnt, d.value +FROM +(SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b +JOIN +(SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d +ON b.key = d.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT b.key, b.cnt, d.value +FROM +(SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b +JOIN +(SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d +ON b.key = d.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.value SIMPLE [(src1)x.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src1) x) (TOK_TABREF (TOK_TABNAME src) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key)))) b) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src1) x) (TOK_TABREF (TOK_TABNAME src) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) value))))) d) (= (. (TOK_TABLE_OR_COL b) key) (. (TOK_TABLE_OR_COL d) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co2))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) cnt)) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL d) value))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + d:x + TableScan + alias: x + Select Operator + expressions: + expr: key + type: string + expr: value + type: string + outputColumnNames: key, value + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: value + type: string + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: value + type: string + d:y + TableScan + alias: y + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Correlation Dispatch Operator + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Correlation Local Simulative Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: 1 + type: int + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: complete + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Correlation Local Simulative Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col3 + type: string + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: UDFToInteger(_col1) + type: int + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 + handleSkewJoin: false + outputColumnNames: _col0, _col1 + Select Operator + 
expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Correlation Local Simulative Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + value expressions: + expr: _col1 + type: string + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col3 + type: string + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: UDFToInteger(_col1) + type: int + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT b.key, b.cnt, d.value +FROM +(SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b +JOIN +(SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d +ON b.key = d.key +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +PREHOOK: Output: default@dest_co2 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT b.key, b.cnt, d.value +FROM +(SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b +JOIN +(SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d +ON b.key = d.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +POSTHOOK: Output: default@dest_co2 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.value SIMPLE [(src1)x.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(src1)x.null, (src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co2.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.value SIMPLE [(src1)x.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt, x.value +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt, x.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.value SIMPLE 
[(src1)x.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(src1)x.null, (src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co2.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.value SIMPLE [(src1)x.FieldSchema(name:value, type:string, comment:default), ] +66 1 val_66 +98 2 val_98 +98 2 val_98 +128 3 +128 3 +128 3 +146 2 val_146 +146 2 val_146 +150 1 val_150 +213 2 val_213 +213 2 val_213 +224 2 +224 2 +238 2 val_238 +238 2 val_238 +255 2 val_255 +255 2 val_255 +273 3 val_273 +273 3 val_273 +273 3 val_273 +278 2 val_278 +278 2 val_278 +311 3 val_311 +311 3 val_311 +311 3 val_311 +369 3 +369 3 +369 3 +401 5 val_401 +401 5 val_401 +401 5 val_401 +401 5 val_401 +401 5 val_401 +406 4 val_406 +406 4 val_406 +406 4 val_406 +406 4 val_406 +PREHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt, x.value +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt, x.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(src1)x.null, (src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co1.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co1.value SIMPLE [(src1)x.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(src1)x.null, (src1)x.null, (src)y.null, ] +POSTHOOK: Lineage: dest_co2.key EXPRESSION [(src1)x.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dest_co2.value SIMPLE [(src1)x.FieldSchema(name:value, type:string, comment:default), ] +66 1 val_66 +98 2 val_98 +98 2 val_98 +128 3 +128 3 +128 3 +146 2 val_146 +146 2 val_146 +150 1 val_150 +213 2 val_213 +213 2 val_213 +224 2 +224 2 +238 2 val_238 +238 2 val_238 +255 2 val_255 +255 2 val_255 +273 3 val_273 +273 3 val_273 +273 3 val_273 +278 2 val_278 +278 2 val_278 +311 3 val_311 +311 3 val_311 +311 3 val_311 +369 3 +369 3 +369 3 +401 5 val_401 +401 5 val_401 +401 5 val_401 +401 5 val_401 +401 5 val_401 +406 4 val_406 +406 4 val_406 +406 4 val_406 +406 4 val_406 diff --git ql/src/test/results/clientpositive/correlationoptimizer4.q.out ql/src/test/results/clientpositive/correlationoptimizer4.q.out new file mode 100644 index 0000000..e086195 --- /dev/null +++ ql/src/test/results/clientpositive/correlationoptimizer4.q.out @@ -0,0 +1,2477 @@ +PREHOOK: query: CREATE TABLE T1(key INT, val STRING) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key INT, val STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key INT, val STRING) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key INT, val STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: CREATE TABLE dest_co1(key INT, cnt INT) +PREHOOK: type: CREATETABLE 
+POSTHOOK: query: CREATE TABLE dest_co1(key INT, cnt INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest_co1 +PREHOOK: query: CREATE TABLE dest_co2(key INT, cnt INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest_co2(key INT, cnt INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest_co2 +PREHOOK: query: -- INNER JOIN should be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) GROUP BY y.key +PREHOOK: type: QUERY +POSTHOOK: query: -- INNER JOIN should be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) GROUP BY y.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL y) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + value expressions: + expr: key + type: int + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 {VALUE._col0} + handleSkewJoin: false + outputColumnNames: _col4 + Select Operator + expressions: + expr: _col4 + type: int + outputColumnNames: _col4 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col4 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output 
format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) GROUP BY y.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Output: default@dest_co1 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest_co1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) GROUP BY y.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co2))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL y) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + y + TableScan + alias: y + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + Reduce Operator Tree: + Correlation Dispatch Operator + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 {VALUE._col0} + handleSkewJoin: false + outputColumnNames: _col4 + Select Operator + expressions: + expr: _col4 + type: int + outputColumnNames: _col4 + Correlation Local Simulative Reduce Output Operator + key expressions: + expr: _col4 + type: int + sort order: + + Map-reduce partition columns: + expr: _col4 + type: int + tag: -1 + value expressions: + expr: 1 + type: int + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: complete + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + 
GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) GROUP BY y.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Output: default@dest_co2 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest_co2 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +2 1 +3 1 +8 4 +PREHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +2 1 +3 1 +8 4 +PREHOOK: query: -- The case that GROUP BY key is from the left table of LEFT OUTER JOIN should be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +PREHOOK: type: QUERY +POSTHOOK: query: -- The case that GROUP BY key is from the left table of LEFT OUTER JOIN should be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION 
[(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Output: default@dest_co1 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key 
+POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest_co1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co2))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. 
(TOK_TABLE_OR_COL x) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + value expressions: + expr: key + type: int + y + TableScan + alias: y + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + Reduce Operator Tree: + Correlation Dispatch Operator + Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Correlation Local Simulative Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: 1 + type: int + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: complete + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Output: default@dest_co2 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest_co2 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key 
SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +2 1 +3 1 +4 1 +5 1 +8 4 +PREHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +2 1 +3 1 +4 1 +5 1 +8 4 +PREHOOK: query: -- The case that GROUP BY key is from the right table of RIGHT OUTER JOIN should be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +PREHOOK: type: QUERY +POSTHOOK: query: -- The case that GROUP BY key is from the right table of RIGHT OUTER JOIN should be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. 
(TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL y) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + value expressions: + expr: key + type: int + Reduce Operator Tree: + Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 + 1 {VALUE._col0} + handleSkewJoin: false + outputColumnNames: _col4 + Select Operator + expressions: + expr: _col4 + type: int + outputColumnNames: _col4 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col4 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Output: default@dest_co1 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest_co1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: 
dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co2))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. 
(TOK_TABLE_OR_COL y) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + y + TableScan + alias: y + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + Reduce Operator Tree: + Correlation Dispatch Operator + Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 + 1 {VALUE._col0} + handleSkewJoin: false + outputColumnNames: _col4 + Select Operator + expressions: + expr: _col4 + type: int + outputColumnNames: _col4 + Correlation Local Simulative Reduce Output Operator + key expressions: + expr: _col4 + type: int + sort order: + + Map-reduce partition columns: + expr: _col4 + type: int + tag: -1 + value expressions: + expr: 1 + type: int + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: complete + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Output: default@dest_co2 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest_co2 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt 
EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 4 +PREHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 4 +PREHOOK: query: -- The case that GROUP BY key is from the right table of LEFT OUTER JOIN should not be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +PREHOOK: type: QUERY +POSTHOOK: query: -- The case that GROUP BY key is from the right table of LEFT OUTER JOIN should not be optimized 
+EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL y) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + value expressions: + expr: key + type: int + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 + 1 {VALUE._col0} + handleSkewJoin: false + outputColumnNames: _col4 + Select Operator + expressions: + expr: _col4 + type: int + outputColumnNames: _col4 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col4 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + 
outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Output: default@dest_co1 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest_co1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] 
+POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co2))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL y) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + value expressions: + expr: key + type: int + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 + 1 {VALUE._col0} + handleSkewJoin: false + outputColumnNames: _col4 + Select Operator + expressions: + expr: _col4 + type: int + outputColumnNames: _col4 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col4 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: 
Output: default@dest_co2 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest_co2 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +NULL 2 +2 1 +3 1 +8 4 +PREHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: query: 
SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +NULL 2 +2 1 +3 1 +8 4 +PREHOOK: query: -- The case that GROUP BY key is from the left table of RIGHT OUTER JOIN should not be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +PREHOOK: type: QUERY +POSTHOOK: query: -- The case that GROUP BY key is from the left table of RIGHT OUTER JOIN should not be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. 
(TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Output: default@dest_co1 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest_co1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: 
dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co2))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. 
(TOK_TABLE_OR_COL x) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Output: default@dest_co2 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest_co2 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: 
Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +NULL 2 +2 1 +3 1 +8 4 +PREHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co2 +#### A masked pattern was here #### 
+POSTHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +NULL 2 +2 1 +3 1 +8 4 +PREHOOK: query: -- FULL OUTER JOIN will not be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x FULL OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +PREHOOK: type: QUERY +POSTHOOK: query: -- FULL OUTER JOIN will not be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x FULL OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, 
comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_FULLOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Outer Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x FULL OUTER JOIN T1 y ON (x.key = 
y.key) GROUP BY x.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Output: default@dest_co1 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x FULL OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest_co1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x FULL OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x FULL OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, 
comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_FULLOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co2))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Outer Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 
+ + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x FULL OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Output: default@dest_co2 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x FULL OUTER JOIN T1 y ON (x.key = y.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest_co2 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: 
dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +NULL 2 +2 1 +3 1 +4 1 +5 1 +8 4 +PREHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, 
type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +NULL 2 +2 1 +3 1 +4 1 +5 1 +8 4 diff --git ql/src/test/results/clientpositive/correlationoptimizer5.q.out ql/src/test/results/clientpositive/correlationoptimizer5.q.out new file mode 100644 index 0000000..f631b54 --- /dev/null +++ ql/src/test/results/clientpositive/correlationoptimizer5.q.out @@ -0,0 +1,1711 @@ +PREHOOK: query: CREATE TABLE T1(key INT, val STRING) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key INT, val STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key INT, val STRING) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key INT, val STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: CREATE TABLE T3(key INT, val STRING) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T3(key INT, val STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T3 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T3 +PREHOOK: type: LOAD +PREHOOK: Output: default@t3 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T3 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t3 +PREHOOK: query: CREATE TABLE dest_co1(key INT, cnt INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest_co1(key INT, cnt INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest_co1 +PREHOOK: query: CREATE TABLE dest_co2(key INT, cnt INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest_co2(key INT, cnt INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest_co2 +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) JOIN T3 z ON (y.key = z.key) GROUP BY y.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) JOIN T3 z ON (y.key = z.key) GROUP BY y.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key))) (TOK_TABREF (TOK_TABNAME T3) z) (= (. (TOK_TABLE_OR_COL y) key) (. (TOK_TABLE_OR_COL z) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. 
(TOK_TABLE_OR_COL y) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + value expressions: + expr: key + type: int + z + TableScan + alias: z + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 2 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 1 to 2 + condition expressions: + 0 + 1 {VALUE._col0} + 2 + handleSkewJoin: false + outputColumnNames: _col4 + Select Operator + expressions: + expr: _col4 + type: int + outputColumnNames: _col4 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col4 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) JOIN T3 z ON (y.key = z.key) GROUP BY y.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +PREHOOK: Output: default@dest_co1 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) JOIN T3 z ON (y.key = z.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +POSTHOOK: Output: default@dest_co1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] 
+POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) JOIN T3 z ON (y.key = z.key) GROUP BY y.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) JOIN T3 z ON (y.key = z.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key))) (TOK_TABREF (TOK_TABNAME T3) z) (= (. (TOK_TABLE_OR_COL y) key) (. (TOK_TABLE_OR_COL z) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co2))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL y) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 2 + y + TableScan + alias: y + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + z + TableScan + alias: z + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Reduce Operator Tree: + Correlation Dispatch Operator + Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 1 to 2 + condition expressions: + 0 + 1 {VALUE._col0} + 2 + handleSkewJoin: false + outputColumnNames: _col4 + Select Operator + expressions: + expr: _col4 + type: int + outputColumnNames: _col4 + Correlation Local Simulative Reduce Output Operator + key expressions: + expr: _col4 + type: int + sort order: + + Map-reduce partition columns: + expr: _col4 + type: int + tag: -1 + value expressions: + expr: 1 + type: int + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: complete + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: 
Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) JOIN T3 z ON (y.key = z.key) GROUP BY y.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +PREHOOK: Output: default@dest_co2 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) JOIN T3 z ON (y.key = z.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +POSTHOOK: Output: default@dest_co2 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +2 1 +3 1 +8 8 +PREHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +2 1 +3 1 +8 8 +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) LEFT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY x.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) LEFT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + 
(TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key))) (TOK_TABREF (TOK_TABNAME T3) z) (= (. (TOK_TABLE_OR_COL y) key) (. (TOK_TABLE_OR_COL z) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + z + TableScan + alias: z + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 2 + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + Left Outer Join1 to 2 + condition expressions: + 0 {VALUE._col0} + 1 + 2 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) LEFT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY x.key +PREHOOK: type: QUERY +PREHOOK: Input: 
default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +PREHOOK: Output: default@dest_co1 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) LEFT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +POSTHOOK: Output: default@dest_co1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) LEFT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY x.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) LEFT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key))) (TOK_TABREF (TOK_TABNAME T3) z) (= (. (TOK_TABLE_OR_COL y) key) (. (TOK_TABLE_OR_COL z) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co2))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. 
(TOK_TABLE_OR_COL x) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 2 + value expressions: + expr: key + type: int + y + TableScan + alias: y + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + z + TableScan + alias: z + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Reduce Operator Tree: + Correlation Dispatch Operator + Join Operator + condition map: + Left Outer Join0 to 1 + Left Outer Join1 to 2 + condition expressions: + 0 {VALUE._col0} + 1 + 2 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Correlation Local Simulative Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: 1 + type: int + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: complete + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) LEFT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY x.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +PREHOOK: Output: default@dest_co2 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT x.key, count(1) FROM T2 x LEFT OUTER JOIN T1 y ON (x.key = y.key) LEFT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY x.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +POSTHOOK: Output: default@dest_co2 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, 
(t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +2 1 +3 1 +4 1 +5 1 +8 8 +PREHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +2 1 +3 1 +4 1 +5 1 +8 8 +PREHOOK: query: -- FULL OUTER JOIN will not be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT z.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) RIGHT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY z.key +PREHOOK: type: QUERY +POSTHOOK: query: -- FULL OUTER JOIN will not be optimized +EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT z.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) RIGHT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY z.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: 
Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_RIGHTOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key))) (TOK_TABREF (TOK_TABNAME T3) z) (= (. (TOK_TABLE_OR_COL y) key) (. (TOK_TABLE_OR_COL z) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL z) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL z) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + z + TableScan + alias: z + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 2 + value expressions: + expr: key + type: int + Reduce Operator Tree: + Join Operator + condition map: + Right Outer Join0 to 1 + Right Outer Join1 to 2 + condition expressions: + 0 + 1 + 2 {VALUE._col0} + handleSkewJoin: false + outputColumnNames: _col8 + Select Operator + expressions: + expr: _col8 + type: int + outputColumnNames: _col8 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col8 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input 
format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT z.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) RIGHT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY z.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +PREHOOK: Output: default@dest_co1 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT z.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) RIGHT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY z.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +POSTHOOK: Output: default@dest_co1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT z.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) RIGHT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY z.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT z.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) RIGHT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY z.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_RIGHTOUTERJOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key))) (TOK_TABREF (TOK_TABNAME T3) z) (= (. (TOK_TABLE_OR_COL y) key) (. (TOK_TABLE_OR_COL z) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co2))) (TOK_SELECT (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL z) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL z) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 2 + y + TableScan + alias: y + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + z + TableScan + alias: z + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Forward + Correlation Composite Operator + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + value expressions: + expr: key + type: int + Reduce Operator Tree: + Correlation Dispatch Operator + Join Operator + condition map: + Right Outer Join0 to 1 + Right Outer Join1 to 2 + condition expressions: + 0 + 1 + 2 {VALUE._col0} + handleSkewJoin: false + outputColumnNames: _col8 + Select Operator + expressions: + expr: _col8 + type: int + outputColumnNames: _col8 + Correlation Local Simulative Reduce Output Operator + key expressions: + expr: _col8 + type: int + sort order: + + Map-reduce partition columns: + expr: _col8 + type: int + tag: -1 + value expressions: + expr: 1 + type: int + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: complete + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT z.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) RIGHT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY z.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +PREHOOK: Output: default@dest_co2 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT z.key, count(1) FROM T2 x RIGHT OUTER JOIN T1 y ON (x.key = y.key) RIGHT OUTER JOIN T3 z ON (y.key = z.key) GROUP BY z.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +POSTHOOK: Output: default@dest_co2 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, 
(t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 8 +PREHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: 
dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 8 +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) FULL OUTER JOIN T3 z ON (y.key = z.key) GROUP BY y.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) FULL OUTER JOIN T3 z ON (y.key = z.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_FULLOUTERJOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key))) (TOK_TABREF (TOK_TABNAME T3) z) (= (. (TOK_TABLE_OR_COL y) key) (. (TOK_TABLE_OR_COL z) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. 
(TOK_TABLE_OR_COL y) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + value expressions: + expr: key + type: int + z + TableScan + alias: z + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 2 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + Outer Join 1 to 2 + condition expressions: + 0 + 1 {VALUE._col0} + 2 + handleSkewJoin: false + outputColumnNames: _col4 + Select Operator + expressions: + expr: _col4 + type: int + outputColumnNames: _col4 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col4 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co1 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) FULL OUTER JOIN T3 z ON (y.key = z.key) GROUP BY y.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +PREHOOK: Output: default@dest_co1 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co1 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) FULL OUTER JOIN T3 z ON (y.key = z.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +POSTHOOK: Output: default@dest_co1 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, 
(t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) FULL OUTER JOIN T3 z ON (y.key = z.key) GROUP BY y.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) FULL OUTER JOIN T3 z ON (y.key = z.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_FULLOUTERJOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME T2) x) (TOK_TABREF (TOK_TABNAME T1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key))) (TOK_TABREF (TOK_TABNAME T3) z) (= (. (TOK_TABLE_OR_COL y) key) (. (TOK_TABLE_OR_COL z) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest_co2))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. 
(TOK_TABLE_OR_COL y) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + y + TableScan + alias: y + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + value expressions: + expr: key + type: int + z + TableScan + alias: z + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 2 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + Outer Join 1 to 2 + condition expressions: + 0 + 1 {VALUE._col0} + 2 + handleSkewJoin: false + outputColumnNames: _col4 + Select Operator + expressions: + expr: _col4 + type: int + outputColumnNames: _col4 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col4 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_co2 + + Stage: Stage-3 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) FULL OUTER JOIN T3 z ON (y.key = z.key) GROUP BY y.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +PREHOOK: Output: default@dest_co2 +POSTHOOK: query: INSERT OVERWRITE TABLE dest_co2 +SELECT y.key, count(1) FROM T2 x JOIN T1 y ON (x.key = y.key) FULL OUTER JOIN T3 z ON (y.key = z.key) GROUP BY y.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +POSTHOOK: Output: default@dest_co2 +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, 
(t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: query: -- dest_co1 and dest_co2 should be same +SELECT * FROM dest_co1 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +NULL 2 +2 1 +3 1 +8 8 +PREHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_co2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM dest_co2 x ORDER BY x.key, x.cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_co2 +#### A masked pattern was here #### 
+POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co1.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.cnt EXPRESSION [(t2)x.null, (t1)y.null, (t3)z.null, ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t2)x.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t3)z.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest_co2.key SIMPLE [(t1)y.FieldSchema(name:key, type:int, comment:null), ] +NULL 2 +2 1 +3 1 +8 8 diff --git ql/src/test/results/compiler/plan/groupby1.q.xml ql/src/test/results/compiler/plan/groupby1.q.xml old mode 100755 new mode 100644 index cd0d6e4..e941847 --- ql/src/test/results/compiler/plan/groupby1.q.xml +++ ql/src/test/results/compiler/plan/groupby1.q.xml @@ -351,6 +351,24 @@ + + VALUE._col0 + + + _col1 + + + + + + + + double + + + + + @@ -423,21 +441,7 @@ - - - _col1 - - - - - - - - double - - - - + @@ -534,7 +538,7 @@ _col0 - + key @@ -633,7 +637,7 @@ - + @@ -1303,7 +1307,7 @@ _col1 - + _col1 @@ -1317,7 +1321,7 @@ _col0 - + _col0 @@ -1336,10 +1340,10 @@ - + - + @@ -1417,7 +1421,7 @@ _col0 - + KEY._col0 @@ -1469,7 +1473,7 @@ - + diff --git ql/src/test/results/compiler/plan/groupby2.q.xml ql/src/test/results/compiler/plan/groupby2.q.xml old mode 100755 new mode 100644 index 7b07f02..aad1cc0 --- ql/src/test/results/compiler/plan/groupby2.q.xml +++ ql/src/test/results/compiler/plan/groupby2.q.xml @@ -217,6 +217,56 @@ + + VALUE._col1 + + + _col3 + + + + + + + + double + + + + + + + KEY._col1:0._col0 + + + _col1 + + + + + + + + + + + VALUE._col0 + + + _col2 + + + + + + + + bigint + + + + + @@ -238,17 +288,7 @@ - - - _col1 - - - - - - - - + @@ -320,38 +360,10 @@ - - - _col2 - - - - - - - - bigint - - - - + - - - _col3 - - - - - - - - double - - - - + @@ -442,7 +454,7 @@ VALUE._col0 - + @@ -452,7 +464,7 @@ VALUE._col1 - + @@ -830,7 +842,7 @@ - + @@ -843,7 +855,7 @@ - + @@ -870,7 +882,7 @@ src - + @@ -1093,7 +1105,7 @@ src - + @@ -1432,7 +1444,7 @@ - + @@ -1486,7 +1498,7 @@ - + @@ -1512,7 +1524,7 @@ _col1 - + _col1 @@ -1520,13 +1532,13 @@ - + _col0 - + _col0 @@ -1545,10 +1557,10 @@ - + - + @@ -1622,7 +1634,7 @@ _col1 - + @@ -1651,7 +1663,7 @@ _col0 - + KEY._col0 @@ -1719,7 +1731,7 @@ VALUE._col1 - + @@ -1735,7 +1747,7 @@ - + @@ -1817,7 +1829,7 @@ - + @@ -1830,7 +1842,7 @@ - + diff --git ql/src/test/results/compiler/plan/groupby3.q.xml ql/src/test/results/compiler/plan/groupby3.q.xml index a6a1986..967d75f 100644 --- ql/src/test/results/compiler/plan/groupby3.q.xml +++ ql/src/test/results/compiler/plan/groupby3.q.xml @@ -198,7 +198,125 @@ - + + + 
VALUE._col4 + + + _col5 + + + + + + + + string + + + + + + + VALUE._col3 + + + _col4 + + + + + + + + + + + VALUE._col2 + + + _col3 + + + + + + + + + + count + + + sum + + + + + + + + + bigint + + + + + + + double + + + + + + + + + + + VALUE._col1 + + + _col2 + + + + + + + + + + + VALUE._col0 + + + _col1 + + + + + + + + + + + KEY._col0:0._col0 + + + _col0 + + + + + + + + + + @@ -216,21 +334,7 @@ - - - _col0 - - - - - - - - string - - - - + @@ -301,98 +405,19 @@ - - - _col1 - - - - - - - - double - - - - + - - - _col2 - - - - - - - - - - count - - - sum - - - - - - - - - bigint - - - - - - - - - - - + - - - _col3 - - - - - - - - + - - - _col4 - - - - - - - - + - - - _col5 - - - - - - - - + @@ -473,7 +498,7 @@ VALUE._col0 - + @@ -1012,7 +1037,7 @@ - + @@ -1091,7 +1116,7 @@ src - + @@ -1292,7 +1317,7 @@ src - + @@ -1618,7 +1643,7 @@ - + @@ -1631,7 +1656,7 @@ - + @@ -1644,7 +1669,7 @@ - + @@ -1686,7 +1711,7 @@ _col4 - + _col4 @@ -1700,7 +1725,7 @@ _col3 - + _col3 @@ -1714,7 +1739,7 @@ _col2 - + _col2 @@ -1722,13 +1747,13 @@ - + _col1 - + _col1 @@ -1736,13 +1761,13 @@ - + _col0 - + _col0 @@ -1750,7 +1775,7 @@ - + @@ -1761,19 +1786,19 @@ - + - + - + - + - + @@ -1837,7 +1862,7 @@ _col0 - + @@ -1850,7 +1875,7 @@ _col1 - + @@ -1863,7 +1888,7 @@ _col2 - + @@ -1929,7 +1954,7 @@ VALUE._col0 - + @@ -2129,7 +2154,7 @@ - + @@ -2142,7 +2167,7 @@ - + @@ -2155,7 +2180,7 @@ - + diff --git ql/src/test/results/compiler/plan/groupby5.q.xml ql/src/test/results/compiler/plan/groupby5.q.xml index 25e3583..9b035f5 100644 --- ql/src/test/results/compiler/plan/groupby5.q.xml +++ ql/src/test/results/compiler/plan/groupby5.q.xml @@ -217,6 +217,24 @@ + + VALUE._col0 + + + _col1 + + + + + + + + double + + + + + @@ -289,21 +307,7 @@ - - - _col1 - - - - - - - - double - - - - + @@ -400,7 +404,7 @@ _col0 - + key @@ -499,7 +503,7 @@ - + @@ -1190,7 +1194,7 @@ _col1 - + _col1 @@ -1204,7 +1208,7 @@ _col0 - + _col0 @@ -1223,10 +1227,10 @@ - + - + @@ -1310,7 +1314,7 @@ _col0 - + KEY._col0 @@ -1362,7 +1366,7 @@ - +