Index: data/files/in4.txt
===================================================================
--- data/files/in4.txt (revision 0)
+++ data/files/in4.txt (revision 0)
@@ -0,0 +1,7 @@
+35236
+101000501
+100100103
+12802
+101005
+10100454
+12100757
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java (revision 1026947)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java (working copy)
@@ -23,6 +23,8 @@
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.StructField;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion;

 /**
  * This evaluator gets the column from the row object.
@@ -33,6 +35,7 @@

   transient StructObjectInspector[] inspectors;
   transient StructField[] fields;
+  transient boolean[] unionField;

   public ExprNodeColumnEvaluator(ExprNodeColumnDesc expr) {
     this.expr = expr;
@@ -46,15 +49,32 @@
     String[] names = expr.getColumn().split("\\.");
     inspectors = new StructObjectInspector[names.length];
     fields = new StructField[names.length];
+    unionField = new boolean[names.length];
+    int unionIndex = -1;

     for (int i = 0; i < names.length; i++) {
       if (i == 0) {
         inspectors[0] = (StructObjectInspector) rowInspector;
       } else {
-        inspectors[i] = (StructObjectInspector) fields[i - 1]
+        if (unionIndex != -1) {
+          inspectors[i] = (StructObjectInspector) (
+              (UnionObjectInspector)fields[i-1].getFieldObjectInspector()).
+              getObjectInspectors().get(unionIndex);
+        } else {
+          inspectors[i] = (StructObjectInspector) fields[i - 1]
             .getFieldObjectInspector();
+        }
+      }
+      // to support names like _colx:1._coly
+      String[] unionfields = names[i].split("\\:");
+      fields[i] = inspectors[i].getStructFieldRef(unionfields[0]);
+      if (unionfields.length > 1) {
+        unionIndex = Integer.parseInt(unionfields[1]);
+        unionField[i] = true;
+      } else {
+        unionIndex = -1;
+        unionField[i] = false;
       }
-      fields[i] = inspectors[i].getStructFieldRef(names[i]);
     }
     return fields[names.length - 1].getFieldObjectInspector();
   }
@@ -64,6 +84,9 @@
     Object o = row;
     for (int i = 0; i < fields.length; i++) {
       o = inspectors[i].getStructFieldData(o, fields[i]);
+      if (unionField[i]) {
+        o = ((StandardUnion)o).getObject();
+      }
     }
     return o;
   }
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (revision 1026947)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (working copy)
@@ -27,6 +27,7 @@
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -35,22 +36,27 @@
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.parse.OpParseContext;
 import org.apache.hadoop.hive.ql.plan.AggregationDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
 import org.apache.hadoop.hive.ql.plan.GroupByDesc;
 import org.apache.hadoop.hive.ql.plan.api.OperatorType;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer;
-import org.apache.hadoop.hive.serde2.objectinspector.ListObjectsEqualComparer;
 import org.apache.hadoop.hive.serde2.lazy.LazyPrimitive;
 import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectsEqualComparer;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.UnionObject;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

 import org.apache.hadoop.io.Text;

 /**
@@ -77,6 +83,16 @@
   // the same SQL clause,
   // so aggregationIsDistinct is a boolean array instead of a single number.
   protected transient boolean[] aggregationIsDistinct;
+  // Map from integer tag to distinct aggrs
+  transient protected Map<Integer, Set<Integer>> distinctKeyAggrs =
+    new HashMap<Integer, Set<Integer>>();
+  // Map from integer tag to non-distinct aggrs with key parameters.
+  transient protected Map<Integer, Set<Integer>> nonDistinctKeyAggrs =
+    new HashMap<Integer, Set<Integer>>();
+  // List of non-distinct aggrs.
+ transient protected List nonDistinctAggrs = new ArrayList(); + // Union expr for distinct keys + transient ExprNodeEvaluator unionExprEval = null; transient GenericUDAFEvaluator[] aggregationEvaluators; @@ -187,17 +203,45 @@ } newKeys = new ArrayList(keyFields.length); + // initialize unionExpr for reduce-side + // reduce KEY has union field as the last field if there are distinct + // aggregates in group-by. + List sfs = + ((StandardStructObjectInspector) rowInspector).getAllStructFieldRefs(); + if (sfs.size() > 0) { + StructField keyField = sfs.get(0); + if (keyField.getFieldName().toUpperCase().equals( + Utilities.ReduceField.KEY.name())) { + ObjectInspector keyObjInspector = keyField.getFieldObjectInspector(); + if (keyObjInspector instanceof StandardStructObjectInspector) { + List keysfs = + ((StandardStructObjectInspector) keyObjInspector).getAllStructFieldRefs(); + if (keysfs.size() > 0) { + // the last field is the union field, if any + StructField sf = keysfs.get(keysfs.size() - 1); + if (sf.getFieldObjectInspector().getCategory().equals( + ObjectInspector.Category.UNION)) { + unionExprEval = ExprNodeEvaluatorFactory.get( + new ExprNodeColumnDesc(TypeInfoUtils.getTypeInfoFromObjectInspector( + sf.getFieldObjectInspector()), + keyField.getFieldName() + "." + sf.getFieldName(), null, + false)); + unionExprEval.initialize(rowInspector); + } + } + } + } + } // init aggregationParameterFields - aggregationParameterFields = new ExprNodeEvaluator[conf.getAggregators() - .size()][]; - aggregationParameterObjectInspectors = new ObjectInspector[conf - .getAggregators().size()][]; - aggregationParameterStandardObjectInspectors = new ObjectInspector[conf - .getAggregators().size()][]; - aggregationParameterObjects = new Object[conf.getAggregators().size()][]; - for (int i = 0; i < aggregationParameterFields.length; i++) { - ArrayList parameters = conf.getAggregators().get(i) - .getParameters(); + ArrayList aggrs = conf.getAggregators(); + aggregationParameterFields = new ExprNodeEvaluator[aggrs.size()][]; + aggregationParameterObjectInspectors = new ObjectInspector[aggrs.size()][]; + aggregationParameterStandardObjectInspectors = new ObjectInspector[aggrs.size()][]; + aggregationParameterObjects = new Object[aggrs.size()][]; + aggregationIsDistinct = new boolean[aggrs.size()]; + for (int i = 0; i < aggrs.size(); i++) { + AggregationDesc aggr = aggrs.get(i); + ArrayList parameters = aggr.getParameters(); aggregationParameterFields[i] = new ExprNodeEvaluator[parameters.size()]; aggregationParameterObjectInspectors[i] = new ObjectInspector[parameters .size()]; @@ -209,17 +253,55 @@ .get(parameters.get(j)); aggregationParameterObjectInspectors[i][j] = aggregationParameterFields[i][j] .initialize(rowInspector); + if (unionExprEval != null) { + String[] names = parameters.get(j).getExprString().split("\\."); + // parameters of the form : KEY.colx:t.coly + if (Utilities.ReduceField.KEY.name().equals(names[0])) { + String name = names[names.length - 2]; + int tag = Integer.parseInt(name.split("\\:")[1]); + if (aggr.getDistinct()) { + // is distinct + Set set = distinctKeyAggrs.get(tag); + if (null == set) { + set = new HashSet(); + distinctKeyAggrs.put(tag, set); + } + if (!set.contains(i)) { + set.add(i); + } + } else { + Set set = nonDistinctKeyAggrs.get(tag); + if (null == set) { + set = new HashSet(); + nonDistinctKeyAggrs.put(tag, set); + } + if (!set.contains(i)) { + set.add(i); + } + } + } else { + // will be VALUE._COLx + if (!nonDistinctAggrs.contains(i)) { + nonDistinctAggrs.add(i); + } + 
} + } else { + if (aggr.getDistinct()) { + aggregationIsDistinct[i] = true; + } + } aggregationParameterStandardObjectInspectors[i][j] = ObjectInspectorUtils .getStandardObjectInspector( aggregationParameterObjectInspectors[i][j], ObjectInspectorCopyOption.WRITABLE); aggregationParameterObjects[i][j] = null; } - } - // init aggregationIsDistinct - aggregationIsDistinct = new boolean[conf.getAggregators().size()]; - for (int i = 0; i < aggregationIsDistinct.length; i++) { - aggregationIsDistinct[i] = conf.getAggregators().get(i).getDistinct(); + if (parameters.size() == 0) { + // for ex: count(*) + if (!nonDistinctAggrs.contains(i)) { + nonDistinctAggrs.add(i); + } + } } // init aggregationClasses @@ -482,37 +564,108 @@ protected void updateAggregations(AggregationBuffer[] aggs, Object row, ObjectInspector rowInspector, boolean hashAggr, boolean newEntryForHashAggr, Object[][] lastInvoke) throws HiveException { - - for (int ai = 0; ai < aggs.length; ai++) { + if (unionExprEval == null) { + for (int ai = 0; ai < aggs.length; ai++) { + // Calculate the parameters + Object[] o = new Object[aggregationParameterFields[ai].length]; + for (int pi = 0; pi < aggregationParameterFields[ai].length; pi++) { + o[pi] = aggregationParameterFields[ai][pi].evaluate(row); + } - // Calculate the parameters - Object[] o = new Object[aggregationParameterFields[ai].length]; - for (int pi = 0; pi < aggregationParameterFields[ai].length; pi++) { - o[pi] = aggregationParameterFields[ai][pi].evaluate(row); + // Update the aggregations. + if (aggregationIsDistinct[ai]) { + if (hashAggr) { + if (newEntryForHashAggr) { + aggregationEvaluators[ai].aggregate(aggs[ai], o); + } + } else { + if (lastInvoke[ai] == null) { + lastInvoke[ai] = new Object[o.length]; + } + if (ObjectInspectorUtils.compare(o, + aggregationParameterObjectInspectors[ai], lastInvoke[ai], + aggregationParameterStandardObjectInspectors[ai]) != 0) { + aggregationEvaluators[ai].aggregate(aggs[ai], o); + for (int pi = 0; pi < o.length; pi++) { + lastInvoke[ai][pi] = ObjectInspectorUtils.copyToStandardObject( + o[pi], aggregationParameterObjectInspectors[ai][pi], + ObjectInspectorCopyOption.WRITABLE); + } + } + } + } else { + aggregationEvaluators[ai].aggregate(aggs[ai], o); + } } + return; + } - // Update the aggregations. - if (aggregationIsDistinct[ai]) { - if (hashAggr) { - if (newEntryForHashAggr) { - aggregationEvaluators[ai].aggregate(aggs[ai], o); + if (distinctKeyAggrs.size() > 0) { + // evaluate union object + UnionObject uo = (UnionObject) (unionExprEval.evaluate(row)); + int unionTag = uo.getTag(); + + // update non-distinct key aggregations : "KEY._colx:t._coly" + if (nonDistinctKeyAggrs.get(unionTag) != null) { + for (int pos : nonDistinctKeyAggrs.get(unionTag)) { + Object[] o = new Object[aggregationParameterFields[pos].length]; + for (int pi = 0; pi < aggregationParameterFields[pos].length; pi++) { + o[pi] = aggregationParameterFields[pos][pi].evaluate(row); } - } else { - if (lastInvoke[ai] == null) { - lastInvoke[ai] = new Object[o.length]; + aggregationEvaluators[pos].aggregate(aggs[pos], o); + } + } + // there may be multi distinct clauses for one column + // update them all. 
+ if (distinctKeyAggrs.get(unionTag) != null) { + for (int i : distinctKeyAggrs.get(unionTag)) { + Object[] o = new Object[aggregationParameterFields[i].length]; + for (int pi = 0; pi < aggregationParameterFields[i].length; pi++) { + o[pi] = aggregationParameterFields[i][pi].evaluate(row); } - if (ObjectInspectorUtils.compare(o, - aggregationParameterObjectInspectors[ai], lastInvoke[ai], - aggregationParameterStandardObjectInspectors[ai]) != 0) { - aggregationEvaluators[ai].aggregate(aggs[ai], o); - for (int pi = 0; pi < o.length; pi++) { - lastInvoke[ai][pi] = ObjectInspectorUtils.copyToStandardObject( - o[pi], aggregationParameterObjectInspectors[ai][pi], - ObjectInspectorCopyOption.WRITABLE); + + if (hashAggr) { + if (newEntryForHashAggr) { + aggregationEvaluators[i].aggregate(aggs[i], o); + } + } else { + if (lastInvoke[i] == null) { + lastInvoke[i] = new Object[o.length]; + } + if (ObjectInspectorUtils.compare(o, + aggregationParameterObjectInspectors[i], + lastInvoke[i], + aggregationParameterStandardObjectInspectors[i]) != 0) { + aggregationEvaluators[i].aggregate(aggs[i], o); + for (int pi = 0; pi < o.length; pi++) { + lastInvoke[i][pi] = ObjectInspectorUtils.copyToStandardObject( + o[pi], aggregationParameterObjectInspectors[i][pi], + ObjectInspectorCopyOption.WRITABLE); + } } } } - } else { + } + + // update non-distinct value aggregations: 'VALUE._colx' + // these aggregations should be updated only once. + if (unionTag == 0) { + for (int pos : nonDistinctAggrs) { + Object[] o = new Object[aggregationParameterFields[pos].length]; + for (int pi = 0; pi < aggregationParameterFields[pos].length; pi++) { + o[pi] = aggregationParameterFields[pos][pi].evaluate(row); + } + aggregationEvaluators[pos].aggregate(aggs[pos], o); + } + } + } else { + for (int ai = 0; ai < aggs.length; ai++) { + // there is no distinct aggregation, + // update all aggregations + Object[] o = new Object[aggregationParameterFields[ai].length]; + for (int pi = 0; pi < aggregationParameterFields[ai].length; pi++) { + o[pi] = aggregationParameterFields[ai][pi].evaluate(row); + } aggregationEvaluators[ai].aggregate(aggs[ai], o); } } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (revision 1026947) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (working copy) @@ -824,6 +824,20 @@ } /** + * Initialize an array of ExprNodeEvaluator from start, for specified length + * and return the result ObjectInspectors. + */ + protected static ObjectInspector[] initEvaluators(ExprNodeEvaluator[] evals, + int start, int length, + ObjectInspector rowInspector) throws HiveException { + ObjectInspector[] result = new ObjectInspector[length]; + for (int i = 0; i < length; i++) { + result[i] = evals[start + i].initialize(rowInspector); + } + return result; + } + + /** * Initialize an array of ExprNodeEvaluator and put the return values into a * StructObjectInspector with integer field names. 
*/ Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java (revision 1026947) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java (working copy) @@ -20,9 +20,13 @@ import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.Random; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.HiveKey; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; @@ -33,8 +37,11 @@ import org.apache.hadoop.hive.serde2.Serializer; import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; @@ -71,6 +78,8 @@ transient Serializer valueSerializer; transient int tag; transient byte[] tagByte = new byte[1]; + transient protected int numDistributionKeys; + transient protected int numDistinctExprs; @Override protected void initializeOp(Configuration hconf) throws HiveException { @@ -82,6 +91,10 @@ keyEval[i++] = ExprNodeEvaluatorFactory.get(e); } + numDistributionKeys = conf.getNumDistributionKeys(); + distinctColIndices = conf.getDistinctColumnIndices(); + numDistinctExprs = distinctColIndices.size(); + valueEval = new ExprNodeEvaluator[conf.getValueCols().size()]; i = 0; for (ExprNodeDesc e : conf.getValueCols()) { @@ -125,61 +138,76 @@ transient StructObjectInspector valueObjectInspector; transient ObjectInspector[] partitionObjectInspectors; - transient Object[] cachedKeys; + transient Object[][] cachedKeys; transient Object[] cachedValues; + transient List> distinctColIndices; boolean firstRow; transient Random random; + /** + * Initializes array of ExprNodeEvaluator. Adds Union field for distinct + * column indices for group by. + * Puts the return values into a StructObjectInspector with output column + * names. + * + * If distinctColIndices is empty, the object inspector is same as + * {@link Operator#initEvaluatorsAndReturnStruct(ExprNodeEvaluator[], List, ObjectInspector)} + */ + protected static StructObjectInspector initEvaluatorsAndReturnStruct( + ExprNodeEvaluator[] evals, List> distinctColIndices, + List outputColNames, + int length, ObjectInspector rowInspector) + throws HiveException { + int inspectorLen = evals.length > length ? 
length + 1 : evals.length; + List sois = new ArrayList(inspectorLen); + + // keys + ObjectInspector[] fieldObjectInspectors = initEvaluators(evals, 0, length, rowInspector); + sois.addAll(Arrays.asList(fieldObjectInspectors)); + + if (evals.length > length) { + // union keys + List uois = new ArrayList(); + for (List distinctCols : distinctColIndices) { + List names = new ArrayList(); + List eois = new ArrayList(); + int numExprs = 0; + for (int i : distinctCols) { + names.add(HiveConf.getColumnInternalName(numExprs)); + eois.add(evals[i].initialize(rowInspector)); + numExprs++; + } + uois.add(ObjectInspectorFactory.getStandardStructObjectInspector(names, eois)); + } + UnionObjectInspector uoi = + ObjectInspectorFactory.getStandardUnionObjectInspector(uois); + sois.add(uoi); + } + return ObjectInspectorFactory.getStandardStructObjectInspector(outputColNames, sois ); + } + @Override public void processOp(Object row, int tag) throws HiveException { try { ObjectInspector rowInspector = inputObjInspectors[tag]; if (firstRow) { firstRow = false; - keyObjectInspector = initEvaluatorsAndReturnStruct(keyEval, conf - .getOutputKeyColumnNames(), rowInspector); + keyObjectInspector = initEvaluatorsAndReturnStruct(keyEval, + distinctColIndices, + conf.getOutputKeyColumnNames(), numDistributionKeys, rowInspector); valueObjectInspector = initEvaluatorsAndReturnStruct(valueEval, conf .getOutputValueColumnNames(), rowInspector); partitionObjectInspectors = initEvaluators(partitionEval, rowInspector); - - cachedKeys = new Object[keyEval.length]; + int numKeys = numDistinctExprs > 0 ? numDistinctExprs : 1; + int keyLen = numDistinctExprs > 0 ? numDistributionKeys + 1 : + numDistributionKeys; + cachedKeys = new Object[numKeys][keyLen]; cachedValues = new Object[valueEval.length]; } - // Evaluate the keys - for (int i = 0; i < keyEval.length; i++) { - cachedKeys[i] = keyEval[i].evaluate(row); - } - - // Serialize the keys and append the tag - if (keyIsText) { - Text key = (Text) keySerializer.serialize(cachedKeys, - keyObjectInspector); - if (tag == -1) { - keyWritable.set(key.getBytes(), 0, key.getLength()); - } else { - int keyLength = key.getLength(); - keyWritable.setSize(keyLength + 1); - System.arraycopy(key.getBytes(), 0, keyWritable.get(), 0, keyLength); - keyWritable.get()[keyLength] = tagByte[0]; - } - } else { - // Must be BytesWritable - BytesWritable key = (BytesWritable) keySerializer.serialize(cachedKeys, - keyObjectInspector); - if (tag == -1) { - keyWritable.set(key.get(), 0, key.getSize()); - } else { - int keyLength = key.getSize(); - keyWritable.setSize(keyLength + 1); - System.arraycopy(key.get(), 0, keyWritable.get(), 0, keyLength); - keyWritable.get()[keyLength] = tagByte[0]; - } - } - - // Set the HashCode + // Evaluate the HashCode int keyHashCode = 0; if (partitionEval.length == 0) { // If no partition cols, just distribute the data uniformly to provide @@ -199,7 +227,6 @@ + ObjectInspectorUtils.hashCode(o, partitionObjectInspectors[i]); } } - keyWritable.setHashCode(keyHashCode); // Evaluate the value for (int i = 0; i < valueEval.length; i++) { @@ -208,23 +235,71 @@ // Serialize the value value = valueSerializer.serialize(cachedValues, valueObjectInspector); - } catch (SerDeException e) { - throw new HiveException(e); - } + // Evaluate the keys + Object[] distributionKeys = new Object[numDistributionKeys]; + for (int i = 0; i < numDistributionKeys; i++) { + distributionKeys[i] = keyEval[i].evaluate(row); + } - try { - if (out != null) { - out.collect(keyWritable, value); - // Since 
this is a terminal operator, update counters explicitly - - // forward is not called - if (counterNameToEnum != null) { - ++outputRows; - if (outputRows % 1000 == 0) { - incrCounter(numOutputRowsCntr, outputRows); - outputRows = 0; + if (numDistinctExprs > 0) { + // with distinct key(s) + for (int i = 0; i < numDistinctExprs; i++) { + System.arraycopy(distributionKeys, 0, cachedKeys[i], 0, numDistributionKeys); + Object[] distinctParameters = + new Object[distinctColIndices.get(i).size()]; + for (int j = 0; j < distinctParameters.length; j++) { + distinctParameters[j] = + keyEval[distinctColIndices.get(i).get(j)].evaluate(row); } + cachedKeys[i][numDistributionKeys] = + new StandardUnion((byte)i, distinctParameters); } + } else { + // no distinct key + System.arraycopy(distributionKeys, 0, cachedKeys[0], 0, numDistributionKeys); } + // Serialize the keys and append the tag + for (int i = 0; i < cachedKeys.length; i++) { + if (keyIsText) { + Text key = (Text) keySerializer.serialize(cachedKeys[i], + keyObjectInspector); + if (tag == -1) { + keyWritable.set(key.getBytes(), 0, key.getLength()); + } else { + int keyLength = key.getLength(); + keyWritable.setSize(keyLength + 1); + System.arraycopy(key.getBytes(), 0, keyWritable.get(), 0, keyLength); + keyWritable.get()[keyLength] = tagByte[0]; + } + } else { + // Must be BytesWritable + BytesWritable key = (BytesWritable) keySerializer.serialize( + cachedKeys[i], keyObjectInspector); + if (tag == -1) { + keyWritable.set(key.getBytes(), 0, key.getLength()); + } else { + int keyLength = key.getLength(); + keyWritable.setSize(keyLength + 1); + System.arraycopy(key.getBytes(), 0, keyWritable.get(), 0, keyLength); + keyWritable.get()[keyLength] = tagByte[0]; + } + } + keyWritable.setHashCode(keyHashCode); + if (out != null) { + out.collect(keyWritable, value); + // Since this is a terminal operator, update counters explicitly - + // forward is not called + if (counterNameToEnum != null) { + ++outputRows; + if (outputRows % 1000 == 0) { + incrCounter(numOutputRowsCntr, outputRows); + outputRows = 0; + } + } + } + } + } catch (SerDeException e) { + throw new HiveException(e); } catch (IOException e) { throw new HiveException(e); } Index: ql/src/java/org/apache/hadoop/hive/ql/parse/ErrorMsg.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/ErrorMsg.java (revision 1026947) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/ErrorMsg.java (working copy) @@ -59,7 +59,6 @@ INVALID_JOIN_CONDITION_3("OR not supported in Join currently"), INVALID_TRANSFORM("TRANSFORM with Other Select Columns not Supported"), DUPLICATE_GROUPBY_KEY("Repeated Key in Group By"), - UNSUPPORTED_MULTIPLE_DISTINCTS("DISTINCT on Different Columns not Supported"), NO_SUBQUERY_ALIAS("No Alias For Subquery"), NO_INSERT_INSUBQUERY("Cannot insert in a Subquery. 
Inserting to table "), NON_KEY_EXPR_IN_GROUPBY("Expression Not In Group By Key"), Index: ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java (revision 1026947) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java (working copy) @@ -81,7 +81,7 @@ // used by GroupBy private final LinkedHashMap> destToAggregationExprs; - private final HashMap destToDistinctFuncExpr; + private final HashMap> destToDistinctFuncExprs; @SuppressWarnings("unused") private static final Log LOG = LogFactory.getLog(QBParseInfo.class.getName()); @@ -100,7 +100,7 @@ destToLimit = new HashMap(); destToAggregationExprs = new LinkedHashMap>(); - destToDistinctFuncExpr = new HashMap(); + destToDistinctFuncExprs = new HashMap>(); this.alias = alias; this.isSubQ = isSubQ; @@ -120,12 +120,12 @@ return destToAggregationExprs.get(clause); } - public void setDistinctFuncExprForClause(String clause, ASTNode ast) { - destToDistinctFuncExpr.put(clause, ast); + public void setDistinctFuncExprsForClause(String clause, List ast) { + destToDistinctFuncExprs.put(clause, ast); } - public ASTNode getDistinctFuncExprForClause(String clause) { - return destToDistinctFuncExpr.get(clause); + public List getDistinctFuncExprsForClause(String clause) { + return destToDistinctFuncExprs.get(clause); } public void setSelExprForClause(String clause, ASTNode ast) { @@ -340,12 +340,12 @@ } } - if (!destToDistinctFuncExpr.isEmpty()) { - Iterator> distn = destToDistinctFuncExpr + if (!destToDistinctFuncExprs.isEmpty()) { + Iterator>> distn = destToDistinctFuncExprs .entrySet().iterator(); while (distn.hasNext()) { - ASTNode ct = distn.next().getValue(); - if (ct != null) { + List ct = distn.next().getValue(); + if (!ct.isEmpty()) { return false; } } Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (revision 1026947) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy) @@ -344,22 +344,17 @@ } } - private ASTNode doPhase1GetDistinctFuncExpr( + private List doPhase1GetDistinctFuncExprs( HashMap aggregationTrees) throws SemanticException { - ASTNode expr = null; + List exprs = new ArrayList(); for (Map.Entry entry : aggregationTrees.entrySet()) { ASTNode value = entry.getValue(); assert (value != null); if (value.getToken().getType() == HiveParser.TOK_FUNCTIONDI) { - if (expr == null) { - expr = value; - } else { - throw new SemanticException(ErrorMsg.UNSUPPORTED_MULTIPLE_DISTINCTS - .getMsg()); - } + exprs.add(value); } } - return expr; + return exprs; } /** @@ -591,8 +586,8 @@ LinkedHashMap aggregations = doPhase1GetAggregationsFromSelect(ast); qbp.setAggregationExprsForClause(ctx_1.dest, aggregations); - qbp.setDistinctFuncExprForClause(ctx_1.dest, - doPhase1GetDistinctFuncExpr(aggregations)); + qbp.setDistinctFuncExprsForClause(ctx_1.dest, + doPhase1GetDistinctFuncExprs(aggregations)); break; case HiveParser.TOK_WHERE: @@ -2188,11 +2183,24 @@ HashMap aggregationTrees = parseInfo .getAggregationExprsForClause(dest); assert (aggregationTrees != null); + // get the last colName for the reduce KEY + // it represents the column name corresponding to distinct aggr, if any + String lastKeyColName = null; + if (reduceSinkOperatorInfo.getConf() instanceof ReduceSinkDesc) { + List inputKeyCols = ((ReduceSinkDesc) + 
reduceSinkOperatorInfo.getConf()).getOutputKeyColumnNames(); + if (inputKeyCols.size() > 0) { + lastKeyColName = inputKeyCols.get(inputKeyCols.size()-1); + } + } + int numDistinctUDFs = 0; for (Map.Entry entry : aggregationTrees.entrySet()) { ASTNode value = entry.getValue(); // This is the GenericUDAF name String aggName = value.getChild(0).getText(); + boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI; + boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR; // Convert children to aggParameters ArrayList aggParameters = new ArrayList(); @@ -2207,13 +2215,22 @@ String paraExpression = paraExprInfo.getInternalName(); assert (paraExpression != null); + if (isDistinct && lastKeyColName != null) { + // if aggr is distinct, the parameter is name is constructed as + // KEY.lastKeyColName:._colx + paraExpression = Utilities.ReduceField.KEY.name() + "." + + lastKeyColName + ":" + numDistinctUDFs + "." + + getColumnInternalName(i-1); + + } aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), - paraExprInfo.getInternalName(), paraExprInfo.getTabAlias(), + paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol())); } - boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI; - boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR; + if (isDistinct) { + numDistinctUDFs++; + } Mode amode = groupByDescModeToUDAFMode(mode, isDistinct); GenericUDAFEvaluator genericUDAFEvaluator = getGenericUDAFEvaluator( aggName, aggParameters, value, isDistinct, isAllColumns); @@ -2290,10 +2307,22 @@ HashMap aggregationTrees = parseInfo .getAggregationExprsForClause(dest); + // get the last colName for the reduce KEY + // it represents the column name corresponding to distinct aggr, if any + String lastKeyColName = null; + if (reduceSinkOperatorInfo.getConf() instanceof ReduceSinkDesc) { + List inputKeyCols = ((ReduceSinkDesc) + reduceSinkOperatorInfo.getConf()).getOutputKeyColumnNames(); + if (inputKeyCols.size() > 0) { + lastKeyColName = inputKeyCols.get(inputKeyCols.size()-1); + } + } + int numDistinctUDFs = 0; for (Map.Entry entry : aggregationTrees.entrySet()) { ASTNode value = entry.getValue(); String aggName = value.getChild(0).getText(); ArrayList aggParameters = new ArrayList(); + boolean isDistinct = (value.getType() == HiveParser.TOK_FUNCTIONDI); // If the function is distinct, partial aggregartion has not been done on // the client side. @@ -2305,8 +2334,7 @@ // Otherwise, we look for b+c. // For distincts, partial aggregation is never performed on the client // side, so always look for the parameters: d+e - boolean partialAggDone = !(distPartAgg - || (value.getToken().getType() == HiveParser.TOK_FUNCTIONDI)); + boolean partialAggDone = !(distPartAgg || isDistinct); if (!partialAggDone) { // 0 is the function name for (int i = 1; i < value.getChildCount(); i++) { @@ -2320,8 +2348,16 @@ String paraExpression = paraExprInfo.getInternalName(); assert (paraExpression != null); + if (isDistinct && lastKeyColName != null) { + // if aggr is distinct, the parameter is name is constructed as + // KEY.lastKeyColName:._colx + paraExpression = Utilities.ReduceField.KEY.name() + "." + + lastKeyColName + ":" + numDistinctUDFs + "." 
+ + getColumnInternalName(i-1); + + } aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), - paraExprInfo.getInternalName(), paraExprInfo.getTabAlias(), + paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol())); } } else { @@ -2335,7 +2371,9 @@ paraExpression, paraExprInfo.getTabAlias(), paraExprInfo .getIsVirtualCol())); } - boolean isDistinct = (value.getType() == HiveParser.TOK_FUNCTIONDI); + if (isDistinct) { + numDistinctUDFs++; + } boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR; Mode amode = groupByDescModeToUDAFMode(mode, isDistinct); GenericUDAFEvaluator genericUDAFEvaluator = null; @@ -2414,22 +2452,25 @@ } // If there is a distinctFuncExp, add all parameters to the reduceKeys. - if (parseInfo.getDistinctFuncExprForClause(dest) != null) { - ASTNode value = parseInfo.getDistinctFuncExprForClause(dest); + if (!parseInfo.getDistinctFuncExprsForClause(dest).isEmpty()) { + List list = parseInfo.getDistinctFuncExprsForClause(dest); int numDistn = 0; - // 0 is function name - for (int i = 1; i < value.getChildCount(); i++) { - ASTNode parameter = (ASTNode) value.getChild(i); - if (groupByOutputRowResolver.getExpression(parameter) == null) { - ExprNodeDesc distExprNode = genExprNodeDesc(parameter, - groupByInputRowResolver); - groupByKeys.add(distExprNode); - numDistn++; - String field = getColumnInternalName(grpByExprs.size() + numDistn - 1); - outputColumnNames.add(field); - groupByOutputRowResolver.putExpression(parameter, new ColumnInfo(field, - distExprNode.getTypeInfo(), "", false)); - colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1)); + for(ASTNode value: list) { + // 0 is function name + for (int i = 1; i < value.getChildCount(); i++) { + ASTNode parameter = (ASTNode) value.getChild(i); + if (groupByOutputRowResolver.getExpression(parameter) == null) { + ExprNodeDesc distExprNode = genExprNodeDesc(parameter, + groupByInputRowResolver); + groupByKeys.add(distExprNode); + numDistn++; + String field = getColumnInternalName(grpByExprs.size() + numDistn - + 1); + outputColumnNames.add(field); + groupByOutputRowResolver.putExpression(parameter, new ColumnInfo( + field, distExprNode.getTypeInfo(), "", false)); + colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1)); + } } } } @@ -2513,7 +2554,8 @@ ArrayList reduceKeys = new ArrayList(); // Pre-compute group-by keys and store in reduceKeys - List outputColumnNames = new ArrayList(); + List outputKeyColumnNames = new ArrayList(); + List outputValueColumnNames = new ArrayList(); List grpByExprs = getGroupByForClause(parseInfo, dest); for (int i = 0; i < grpByExprs.size(); ++i) { ASTNode grpbyExpr = grpByExprs.get(i); @@ -2521,7 +2563,7 @@ reduceSinkInputRowResolver); reduceKeys.add(inputExpr); if (reduceSinkOutputRowResolver.getExpression(grpbyExpr) == null) { - outputColumnNames.add(getColumnInternalName(reduceKeys.size() - 1)); + outputKeyColumnNames.add(getColumnInternalName(reduceKeys.size() - 1)); String field = Utilities.ReduceField.KEY.toString() + "." + getColumnInternalName(reduceKeys.size() - 1); ColumnInfo colInfo = new ColumnInfo(field, reduceKeys.get( @@ -2534,24 +2576,43 @@ } } + List> distinctColIndices = new ArrayList>(); // If there is a distinctFuncExp, add all parameters to the reduceKeys. 
- if (parseInfo.getDistinctFuncExprForClause(dest) != null) { - ASTNode value = parseInfo.getDistinctFuncExprForClause(dest); - // 0 is function name - for (int i = 1; i < value.getChildCount(); i++) { - ASTNode parameter = (ASTNode) value.getChild(i); - if (reduceSinkOutputRowResolver.getExpression(parameter) == null) { - reduceKeys - .add(genExprNodeDesc(parameter, reduceSinkInputRowResolver)); - outputColumnNames.add(getColumnInternalName(reduceKeys.size() - 1)); - String field = Utilities.ReduceField.KEY.toString() + "." - + getColumnInternalName(reduceKeys.size() - 1); - ColumnInfo colInfo = new ColumnInfo(field, reduceKeys.get( - reduceKeys.size() - 1).getTypeInfo(), null, false); + if (!parseInfo.getDistinctFuncExprsForClause(dest).isEmpty()) { + List distFuncs = parseInfo.getDistinctFuncExprsForClause(dest); + String colName = getColumnInternalName(reduceKeys.size()); + outputKeyColumnNames.add(colName); + for (int i = 0; i < distFuncs.size(); i++) { + ASTNode value = distFuncs.get(i); + int numExprs = 0; + List distinctIndices = new ArrayList(); + // 0 is function name + for (int j = 1; j < value.getChildCount(); j++) { + ASTNode parameter = (ASTNode) value.getChild(j); + ExprNodeDesc expr = genExprNodeDesc(parameter, reduceSinkInputRowResolver); + // see if expr is already present in reduceKeys. + // get index of expr in reduceKeys + int ri; + for (ri = 0; ri < reduceKeys.size(); ri++) { + if (reduceKeys.get(ri).getExprString().equals(expr.getExprString())) { + break; + } + } + // add the expr to reduceKeys if it is not present + if (ri == reduceKeys.size()) { + reduceKeys.add(expr); + } + // add the index of expr in reduceKeys to distinctIndices + distinctIndices.add(ri); + String name = getColumnInternalName(numExprs); + String field = Utilities.ReduceField.KEY.toString() + "." + colName + + ":" + i + + "." + name; + ColumnInfo colInfo = new ColumnInfo(field, expr.getTypeInfo(), null, false); reduceSinkOutputRowResolver.putExpression(parameter, colInfo); - colExprMap.put(colInfo.getInternalName(), reduceKeys.get(reduceKeys - .size() - 1)); + numExprs++; } + distinctColIndices.add(distinctIndices); } } @@ -2569,7 +2630,7 @@ if (reduceSinkOutputRowResolver.getExpression(parameter) == null) { reduceValues.add(genExprNodeDesc(parameter, reduceSinkInputRowResolver)); - outputColumnNames + outputValueColumnNames .add(getColumnInternalName(reduceValues.size() - 1)); String field = Utilities.ReduceField.VALUE.toString() + "." + getColumnInternalName(reduceValues.size() - 1); @@ -2590,7 +2651,7 @@ reduceValues.add(new ExprNodeColumnDesc(type, getColumnInternalName(inputField), "", false)); inputField++; - outputColumnNames.add(getColumnInternalName(reduceValues.size() - 1)); + outputValueColumnNames.add(getColumnInternalName(reduceValues.size() - 1)); String field = Utilities.ReduceField.VALUE.toString() + "." 
+ getColumnInternalName(reduceValues.size() - 1); reduceSinkOutputRowResolver.putExpression(entry.getValue(), @@ -2600,7 +2661,8 @@ ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap( OperatorFactory.getAndMakeChild(PlanUtils.getReduceSinkDesc(reduceKeys, - reduceValues, outputColumnNames, true, -1, numPartitionFields, + grpByExprs.size(), reduceValues, distinctColIndices, + outputKeyColumnNames, outputValueColumnNames, true, -1, numPartitionFields, numReducers), new RowSchema(reduceSinkOutputRowResolver .getColumnInfos()), inputOperatorInfo), reduceSinkOutputRowResolver); rsOp.setColumnExprMap(colExprMap); @@ -2788,7 +2850,7 @@ * * Generate a Group-By plan using 1 map-reduce job. Spray by the * group by key, and sort by the distinct key (if any), and compute - * aggregates * The agggregation evaluation functions are as + * aggregates * The aggregation evaluation functions are as * follows: Partitioning Key: grouping key * * Sorting Key: grouping key if no DISTINCT grouping + distinct key @@ -2796,7 +2858,7 @@ * * Reducer: iterate/merge (mode = COMPLETE) **/ - @SuppressWarnings({"unused", "nls"}) + @SuppressWarnings({"nls"}) private Operator genGroupByPlan1MR(String dest, QB qb, Operator input) throws SemanticException { @@ -2940,8 +3002,8 @@ // operator. We set the numPartitionColumns to -1 for this purpose. This is // captured by WritableComparableHiveObject.hashCode() function. Operator reduceSinkOperatorInfo = genGroupByPlanReduceSinkOperator(qb, - dest, input, (parseInfo.getDistinctFuncExprForClause(dest) == null ? -1 - : Integer.MAX_VALUE), -1, false); + dest, input, (parseInfo.getDistinctFuncExprsForClause(dest).isEmpty() ? + -1 : Integer.MAX_VALUE), -1, false); // ////// 2. Generate GroupbyOperator Map genericUDAFEvaluators = @@ -2974,7 +3036,7 @@ return false; } - if (qb.getParseInfo().getDistinctFuncExprForClause(dest) != null) { + if (!qb.getParseInfo().getDistinctFuncExprsForClause(dest).isEmpty()) { return false; } @@ -3093,7 +3155,7 @@ // ////// Generate ReduceSink Operator Operator reduceSinkOperatorInfo = genGroupByPlanReduceSinkOperator(qb, dest, groupByOperatorInfo, (parseInfo - .getDistinctFuncExprForClause(dest) == null ? -1 + .getDistinctFuncExprsForClause(dest).isEmpty() ? 
-1 : Integer.MAX_VALUE), -1, true); // ////// Generate GroupbyOperator for a partial aggregation @@ -5060,35 +5122,36 @@ } // All distinct expressions must be the same - ASTNode value = qbp.getDistinctFuncExprForClause(dest); - if (value == null) { + List list = qbp.getDistinctFuncExprsForClause(dest); + if (list.isEmpty()) { return null; } List currDestList = new ArrayList(); List currASTList = new ArrayList(); - try { - // 0 is function name - for (int i = 1; i < value.getChildCount(); i++) { - ASTNode parameter = (ASTNode) value.getChild(i); - currDestList.add(genExprNodeDesc(parameter, inputRR)); - currASTList.add(parameter); - } - } catch (SemanticException e) { - return null; - } - - if (oldList == null) { - oldList = currDestList; - oldASTList = currASTList; - } else { - if (oldList.size() != currDestList.size()) { + for (ASTNode value: list) { + try { + // 0 is function name + for (int i = 1; i < value.getChildCount(); i++) { + ASTNode parameter = (ASTNode) value.getChild(i); + currDestList.add(genExprNodeDesc(parameter, inputRR)); + currASTList.add(parameter); + } + } catch (SemanticException e) { return null; } - for (int pos = 0; pos < oldList.size(); pos++) { - if (!oldList.get(pos).isSame(currDestList.get(pos))) { + if (oldList == null) { + oldList = currDestList; + oldASTList = currASTList; + } else { + if (oldList.size() != currDestList.size()) { return null; } + for (int pos = 0; pos < oldList.size(); pos++) { + if (!oldList.get(pos).isSame(currDestList.get(pos))) { + return null; + } + } } } } @@ -5243,13 +5306,15 @@ if (conf.getVar(HiveConf.ConfVars.HIVEMAPSIDEAGGREGATE) .equalsIgnoreCase("true")) { if (conf.getVar(HiveConf.ConfVars.HIVEGROUPBYSKEW) - .equalsIgnoreCase("false")) { + .equalsIgnoreCase("false") || + qbp.getDistinctFuncExprsForClause(dest).size() > 1) { curr = genGroupByPlanMapAggr1MR(dest, qb, curr); } else { curr = genGroupByPlanMapAggr2MR(dest, qb, curr); } } else if (conf.getVar(HiveConf.ConfVars.HIVEGROUPBYSKEW) - .equalsIgnoreCase("true")) { + .equalsIgnoreCase("true") && + qbp.getDistinctFuncExprsForClause(dest).size() <= 1) { curr = genGroupByPlan2MR(dest, qb, curr); } else { curr = genGroupByPlan1MR(dest, qb, curr); Index: ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java (revision 1026947) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java (working copy) @@ -29,8 +29,9 @@ * PARTIAL1: partial aggregation - first phase: iterate, terminatePartial * PARTIAL2: partial aggregation - second phase: merge, terminatePartial * PARTIALS: For non-distinct the same as PARTIAL2, for distinct the same as - * PARTIAL1 FINAL: partial aggregation - final phase: merge, terminate HASH: - * For non-distinct the same as PARTIAL1 but use hash-table-based aggregation + * PARTIAL1 + * FINAL: partial aggregation - final phase: merge, terminate + * HASH: For non-distinct the same as PARTIAL1 but use hash-table-based aggregation * MERGEPARTIAL: FINAL for non-distinct aggregations, COMPLETE for distinct * aggregations. 
*/ Index: ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java (revision 1026947) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java (working copy) @@ -54,6 +54,8 @@ import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe; import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat; @@ -357,6 +359,43 @@ /** * Convert the ColumnList to FieldSchema list. + * + * Adds uniontype for distinctColIndices. + */ + public static List getFieldSchemasFromColumnListWithLength( + List cols, List> distinctColIndices, + List outputColumnNames, int length, + String fieldPrefix) { + // last one for union column. + List schemas = new ArrayList(length + 1); + for (int i = 0; i < length; i++) { + schemas.add(MetaStoreUtils.getFieldSchemaFromTypeInfo( + fieldPrefix + outputColumnNames.get(i), cols.get(i).getTypeInfo())); + } + + List unionTypes = new ArrayList(); + for (List distinctCols : distinctColIndices) { + List names = new ArrayList(); + List types = new ArrayList(); + int numExprs = 0; + for (int i : distinctCols) { + names.add(HiveConf.getColumnInternalName(numExprs)); + types.add(cols.get(i).getTypeInfo()); + numExprs++; + } + unionTypes.add(TypeInfoFactory.getStructTypeInfo(names, types)); + } + if (cols.size() - length > 0) { + schemas.add(MetaStoreUtils.getFieldSchemaFromTypeInfo( + fieldPrefix + outputColumnNames.get(length), + TypeInfoFactory.getUnionTypeInfo(unionTypes))); + } + + return schemas; + } + + /** + * Convert the ColumnList to FieldSchema list. */ public static List getFieldSchemasFromColumnList( List cols, List outputColumnNames, int start, @@ -446,33 +485,70 @@ ArrayList keyCols, ArrayList valueCols, List outputColumnNames, boolean includeKeyCols, int tag, ArrayList partitionCols, String order, int numReducers) { + return getReduceSinkDesc(keyCols, keyCols.size(), valueCols, + new ArrayList>(), + includeKeyCols ? outputColumnNames.subList(0, keyCols.size()) : + new ArrayList(), + includeKeyCols ? outputColumnNames.subList(keyCols.size(), + outputColumnNames.size()) : outputColumnNames, + includeKeyCols, tag, partitionCols, order, numReducers); + } + + /** + * Create the reduce sink descriptor. + * + * @param keyCols + * The columns to be stored in the key + * @param numKeys + * number of distribution key numbers. Equals to group-by-key + * numbers usually. + * @param valueCols + * The columns to be stored in the value + * @param distinctColIndices + * column indices for distinct aggregate parameters + * @param outputKeyColumnNames + * The output key columns names + * @param outputValueColumnNames + * The output value columns names + * @param tag + * The tag for this reducesink + * @param partitionCols + * The columns for partitioning. + * @param numReducers + * The number of reducers, set to -1 for automatic inference based on + * input data size. + * @return The reduceSinkDesc object. 
+ */ + public static ReduceSinkDesc getReduceSinkDesc( + final ArrayList keyCols, int numKeys, + ArrayList valueCols, + List> distinctColIndices, + List outputKeyColumnNames, + List outputValueColumnNames, + boolean includeKeyCols, int tag, + ArrayList partitionCols, String order, int numReducers) { TableDesc keyTable = null; TableDesc valueTable = null; ArrayList outputKeyCols = new ArrayList(); ArrayList outputValCols = new ArrayList(); if (includeKeyCols) { - keyTable = getReduceKeyTableDesc(getFieldSchemasFromColumnList(keyCols, - outputColumnNames, 0, ""), order); - outputKeyCols.addAll(outputColumnNames.subList(0, keyCols.size())); - valueTable = getReduceValueTableDesc(getFieldSchemasFromColumnList( - valueCols, outputColumnNames, keyCols.size(), "")); - outputValCols.addAll(outputColumnNames.subList(keyCols.size(), - outputColumnNames.size())); + keyTable = getReduceKeyTableDesc(getFieldSchemasFromColumnListWithLength( + keyCols, distinctColIndices, outputKeyColumnNames, numKeys, ""), + order); + outputKeyCols.addAll(outputKeyColumnNames); } else { - keyTable = getReduceKeyTableDesc(getFieldSchemasFromColumnList(keyCols, - "reducesinkkey"), order); - for (int i = 0; i < keyCols.size(); i++) { + keyTable = getReduceKeyTableDesc(getFieldSchemasFromColumnList( + keyCols, "reducesinkkey"),order); + for (int i = 0; i < keyCols.size(); i++) { outputKeyCols.add("reducesinkkey" + i); } - valueTable = getReduceValueTableDesc(getFieldSchemasFromColumnList( - valueCols, outputColumnNames, 0, "")); - outputValCols.addAll(outputColumnNames); } - return new ReduceSinkDesc(keyCols, valueCols, outputKeyCols, outputValCols, + valueTable = getReduceValueTableDesc(getFieldSchemasFromColumnList( + valueCols, outputValueColumnNames, 0, "")); + outputValCols.addAll(outputValueColumnNames); + return new ReduceSinkDesc(keyCols, numKeys, valueCols, outputKeyCols, + distinctColIndices, outputValCols, tag, partitionCols, numReducers, keyTable, - // Revert to DynamicSerDe: - // getBinaryTableDesc(getFieldSchemasFromColumnList(valueCols, - // "reducesinkvalue"))); valueTable); } @@ -499,6 +575,48 @@ ArrayList keyCols, ArrayList valueCols, List outputColumnNames, boolean includeKey, int tag, int numPartitionFields, int numReducers) throws SemanticException { + return getReduceSinkDesc(keyCols, keyCols.size(), valueCols, + new ArrayList>(), + includeKey ? outputColumnNames.subList(0, keyCols.size()) : + new ArrayList(), + includeKey ? + outputColumnNames.subList(keyCols.size(), outputColumnNames.size()) + : outputColumnNames, + includeKey, tag, numPartitionFields, numReducers); + } + + /** + * Create the reduce sink descriptor. + * + * @param keyCols + * The columns to be stored in the key + * @param numKeys number of distribution keys. Equals to group-by-key + * numbers usually. + * @param valueCols + * The columns to be stored in the value + * @param distinctColIndices + * column indices for distinct aggregates + * @param outputKeyColumnNames + * The output key columns names + * @param outputValueColumnNames + * The output value columns names + * @param tag + * The tag for this reducesink + * @param numPartitionFields + * The first numPartitionFields of keyCols will be partition columns. + * If numPartitionFields=-1, then partition randomly. + * @param numReducers + * The number of reducers, set to -1 for automatic inference based on + * input data size. + * @return The reduceSinkDesc object. 
+ */ + public static ReduceSinkDesc getReduceSinkDesc( + ArrayList keyCols, int numKeys, + ArrayList valueCols, + List> distinctColIndices, + List outputKeyColumnNames, List outputValueColumnNames, + boolean includeKey, int tag, + int numPartitionFields, int numReducers) throws SemanticException { ArrayList partitionCols = null; if (numPartitionFields >= keyCols.size()) { @@ -519,8 +637,9 @@ for (int i = 0; i < keyCols.size(); i++) { order.append("+"); } - return getReduceSinkDesc(keyCols, valueCols, outputColumnNames, includeKey, - tag, partitionCols, order.toString(), numReducers); + return getReduceSinkDesc(keyCols, numKeys, valueCols, distinctColIndices, + outputKeyColumnNames, outputValueColumnNames, includeKey, tag, + partitionCols, order.toString(), numReducers); } /** Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java (revision 1026947) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java (working copy) @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.plan; import java.io.Serializable; +import java.util.List; /** * ReduceSinkDesc. @@ -32,6 +33,7 @@ */ private java.util.ArrayList keyCols; private java.util.ArrayList outputKeyColumnNames; + private List> distinctColumnIndices; /** * Value columns are passed to reducer in the "value". */ @@ -52,6 +54,11 @@ private int tag; /** + * Number of distribution keys. + */ + private int numDistributionKeys; + + /** * The partition columns (CLUSTER BY or DISTRIBUTE BY in Hive language). * Partition columns decide the reducer that the current row goes to. * Partition columns are not passed to reducer. @@ -64,20 +71,24 @@ } public ReduceSinkDesc(java.util.ArrayList keyCols, + int numDistributionKeys, java.util.ArrayList valueCols, java.util.ArrayList outputKeyColumnNames, - java.util.ArrayList outputValueolumnNames, int tag, + List> distinctColumnIndices, + java.util.ArrayList outputValueColumnNames, int tag, java.util.ArrayList partitionCols, int numReducers, final TableDesc keySerializeInfo, final TableDesc valueSerializeInfo) { this.keyCols = keyCols; + this.numDistributionKeys = numDistributionKeys; this.valueCols = valueCols; this.outputKeyColumnNames = outputKeyColumnNames; - outputValueColumnNames = outputValueolumnNames; + this.outputValueColumnNames = outputValueColumnNames; this.tag = tag; this.numReducers = numReducers; this.partitionCols = partitionCols; this.keySerializeInfo = keySerializeInfo; this.valueSerializeInfo = valueSerializeInfo; + this.distinctColumnIndices = distinctColumnIndices; } public java.util.ArrayList getOutputKeyColumnNames() { @@ -107,6 +118,14 @@ this.keyCols = keyCols; } + public int getNumDistributionKeys() { + return this.numDistributionKeys; + } + + public void setNumDistributionKeys(int numKeys) { + this.numDistributionKeys = numKeys; + } + @Explain(displayName = "value expressions") public java.util.ArrayList getValueCols() { return valueCols; @@ -184,4 +203,12 @@ orderStr); } + public List> getDistinctColumnIndices() { + return distinctColumnIndices; + } + + public void setDistinctColumnIndices( + List> distinctColumnIndices) { + this.distinctColumnIndices = distinctColumnIndices; + } } Index: ql/src/test/queries/clientpositive/count.q =================================================================== --- ql/src/test/queries/clientpositive/count.q (revision 0) +++ ql/src/test/queries/clientpositive/count.q (revision 0) @@ -0,0 +1,17 @@ 
+create table abcd (a int, b int, c int, d int); +LOAD DATA LOCAL INPATH '../data/files/in4.txt' INTO TABLE abcd; + +select * from abcd; +set hive.map.aggr=true; +explain select a, count(distinct b), count(distinct c), sum(d) from abcd group by a; +select a, count(distinct b), count(distinct c), sum(d) from abcd group by a; + +explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd; +select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd; + +set hive.map.aggr=false; +explain select a, count(distinct b), count(distinct c), sum(d) from abcd group by a; +select a, count(distinct b), count(distinct c), sum(d) from abcd group by a; + +explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd; +select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd; Index: ql/src/test/queries/clientpositive/groupby2_map_multi_distinct.q =================================================================== --- ql/src/test/queries/clientpositive/groupby2_map_multi_distinct.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby2_map_multi_distinct.q (revision 0) @@ -0,0 +1,14 @@ +set hive.map.aggr=true; +set hive.groupby.skewindata=false; +set mapred.reduce.tasks=31; + +CREATE TABLE dest1(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE; + +EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1); + +FROM src +INSERT OVERWRITE TABLE dest1 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1); + +SELECT dest1.* FROM dest1; Index: ql/src/test/queries/clientpositive/groupby2_map_skew_multi_distinct.q =================================================================== --- ql/src/test/queries/clientpositive/groupby2_map_skew_multi_distinct.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby2_map_skew_multi_distinct.q (revision 0) @@ -0,0 +1,14 @@ +set hive.map.aggr=true; +set hive.groupby.skewindata=true; +set 
mapred.reduce.tasks=31; + +CREATE TABLE dest1(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE; + +EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1); + +FROM src +INSERT OVERWRITE TABLE dest1 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1); + +SELECT dest1.* FROM dest1; Index: ql/src/test/queries/clientpositive/groupby2_multi_distinct.q =================================================================== --- ql/src/test/queries/clientpositive/groupby2_multi_distinct.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby2_multi_distinct.q (revision 0) @@ -0,0 +1,13 @@ +set hive.map.aggr=false; +set hive.groupby.skewindata=true; + +CREATE TABLE dest_g2(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE; + +EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest_g2 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1); + +FROM src +INSERT OVERWRITE TABLE dest_g2 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1); + +SELECT dest_g2.* FROM dest_g2; Index: ql/src/test/queries/clientpositive/groupby2_noskew_multi_distinct.q =================================================================== --- ql/src/test/queries/clientpositive/groupby2_noskew_multi_distinct.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby2_noskew_multi_distinct.q (revision 0) @@ -0,0 +1,14 @@ +set hive.map.aggr=false; +set hive.groupby.skewindata=false; +set mapred.reduce.tasks=31; + +CREATE TABLE dest_g2(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE; + +EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest_g2 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1); + +FROM src +INSERT OVERWRITE TABLE dest_g2 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1); + +SELECT dest_g2.* FROM dest_g2; Index: ql/src/test/queries/clientpositive/groupby3_map_multi_distinct.q =================================================================== --- ql/src/test/queries/clientpositive/groupby3_map_multi_distinct.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby3_map_multi_distinct.q (revision 0) @@ -0,0 +1,36 @@ +set hive.map.aggr=true; +set hive.groupby.skewindata=false; +set mapred.reduce.tasks=31; + +CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE, c10 DOUBLE, c11 DOUBLE) STORED AS TEXTFILE; + +EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + 
variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)); + +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)); + +SELECT dest1.* FROM dest1; Index: ql/src/test/queries/clientpositive/groupby3_map_skew_multi_distinct.q =================================================================== --- ql/src/test/queries/clientpositive/groupby3_map_skew_multi_distinct.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby3_map_skew_multi_distinct.q (revision 0) @@ -0,0 +1,36 @@ +set hive.map.aggr=true; +set hive.groupby.skewindata=true; +set mapred.reduce.tasks=31; + +CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE, c10 DOUBLE, c11 DOUBLE) STORED AS TEXTFILE; + +EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)); + +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)); + +SELECT dest1.* FROM dest1; Index: ql/src/test/queries/clientpositive/groupby3_multi_distinct.q =================================================================== --- ql/src/test/queries/clientpositive/groupby3_multi_distinct.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby3_multi_distinct.q (revision 0) @@ -0,0 +1,36 @@ +set hive.map.aggr=false; +set hive.groupby.skewindata=true; + +CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE, c10 DOUBLE, c11 DOUBLE) STORED AS TEXTFILE; + +EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)); + + +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)); + +SELECT dest1.* FROM dest1; Index: ql/src/test/queries/clientpositive/groupby3_noskew_multi_distinct.q =================================================================== --- 
ql/src/test/queries/clientpositive/groupby3_noskew_multi_distinct.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby3_noskew_multi_distinct.q (revision 0) @@ -0,0 +1,38 @@ +set hive.map.aggr=false; + +set hive.groupby.skewindata=false; +set mapred.reduce.tasks=31; + +CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE, c10 DOUBLE, c11 DOUBLE) STORED AS TEXTFILE; + +EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)); + +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)); + +SELECT dest1.* FROM dest1; + Index: ql/src/test/queries/clientpositive/groupby_map_ppr_multi_distinct.q =================================================================== --- ql/src/test/queries/clientpositive/groupby_map_ppr_multi_distinct.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby_map_ppr_multi_distinct.q (revision 0) @@ -0,0 +1,20 @@ +set hive.map.aggr=true; +set hive.groupby.skewindata=false; +set mapred.reduce.tasks=31; + +CREATE TABLE dest1(key STRING, c1 INT, c2 STRING, C3 INT, c4 INT) STORED AS TEXTFILE; + +EXPLAIN EXTENDED +FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1); + +FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1); + +SELECT dest1.* FROM dest1; Index: ql/src/test/queries/clientpositive/groupby_ppr_multi_distinct.q =================================================================== --- ql/src/test/queries/clientpositive/groupby_ppr_multi_distinct.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby_ppr_multi_distinct.q (revision 0) @@ -0,0 +1,19 @@ +set hive.map.aggr=false; +set hive.groupby.skewindata=false; + +CREATE TABLE dest1(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE; + +EXPLAIN EXTENDED +FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1); + +FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1); + +SELECT dest1.* FROM dest1; Index: 
ql/src/test/queries/clientpositive/join18_multi_distinct.q =================================================================== --- ql/src/test/queries/clientpositive/join18_multi_distinct.q (revision 0) +++ ql/src/test/queries/clientpositive/join18_multi_distinct.q (revision 0) @@ -0,0 +1,26 @@ +EXPLAIN + SELECT a.key, a.value, b.key, b.value1, b.value2 + FROM + ( + SELECT src1.key as key, count(src1.value) AS value FROM src src1 group by src1.key + ) a + FULL OUTER JOIN + ( + SELECT src2.key as key, count(distinct(src2.value)) AS value1, + count(distinct(src2.key)) AS value2 + FROM src1 src2 group by src2.key + ) b + ON (a.key = b.key); + + SELECT a.key, a.value, b.key, b.value1, b.value2 + FROM + ( + SELECT src1.key as key, count(src1.value) AS value FROM src src1 group by src1.key + ) a + FULL OUTER JOIN + ( + SELECT src2.key as key, count(distinct(src2.value)) AS value1, + count(distinct(src2.key)) AS value2 + FROM src1 src2 group by src2.key + ) b + ON (a.key = b.key); Index: ql/src/test/queries/clientpositive/nullgroup4_multi_distinct.q =================================================================== --- ql/src/test/queries/clientpositive/nullgroup4_multi_distinct.q (revision 0) +++ ql/src/test/queries/clientpositive/nullgroup4_multi_distinct.q (revision 0) @@ -0,0 +1,31 @@ +set hive.map.aggr=true; +set hive.groupby.skewindata=true; + +explain +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999; + +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999; + +set hive.map.aggr=true; +set hive.groupby.skewindata=false; + +explain +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999; + +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999; + +set hive.map.aggr=false; +set hive.groupby.skewindata=true; + +explain +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999; + +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999; + +set hive.map.aggr=false; +set hive.groupby.skewindata=false; + +explain +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999; + +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999; Index: ql/src/test/results/clientpositive/count.q.out =================================================================== --- ql/src/test/results/clientpositive/count.q.out (revision 0) +++ ql/src/test/results/clientpositive/count.q.out (revision 0) @@ -0,0 +1,577 @@ +PREHOOK: query: create table abcd (a int, b int, c int, d int) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table abcd (a int, b int, c int, d int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@abcd +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/in4.txt' INTO TABLE abcd +PREHOOK: type: LOAD +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/in4.txt' INTO TABLE abcd +POSTHOOK: type: LOAD +POSTHOOK: Output: default@abcd +PREHOOK: query: select * from abcd +PREHOOK: type: QUERY +PREHOOK: Input: default@abcd +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-19_03-08-12_523_248548306762990690/-mr-10000 +POSTHOOK: query: select * from abcd +POSTHOOK: type: QUERY +POSTHOOK: Input: default@abcd +POSTHOOK: Output: 
file:/tmp/amarsri/hive_2010-10-19_03-08-12_523_248548306762990690/-mr-10000 +NULL 35 23 6 +10 1000 50 1 +100 100 10 3 +12 NULL 80 2 +10 100 NULL 5 +10 100 45 4 +12 100 75 7 +PREHOOK: query: explain select a, count(distinct b), count(distinct c), sum(d) from abcd group by a +PREHOOK: type: QUERY +POSTHOOK: query: explain select a, count(distinct b), count(distinct c), sum(d) from abcd group by a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF abcd)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL b))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL d)))) (TOK_GROUPBY (TOK_TABLE_OR_COL a)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + abcd + TableScan + alias: abcd + Select Operator + expressions: + expr: a + type: int + expr: b + type: int + expr: c + type: int + expr: d + type: int + outputColumnNames: a, b, c, d + Group By Operator + aggregations: + expr: count(DISTINCT b) + expr: count(DISTINCT c) + expr: sum(d) + bucketGroup: false + keys: + expr: a + type: int + expr: b + type: int + expr: c + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: int + sort order: +++ + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col3 + type: bigint + expr: _col4 + type: bigint + expr: _col5 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(DISTINCT KEY._col1:0._col0) + expr: count(DISTINCT KEY._col1:1._col0) + expr: sum(VALUE._col2) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + expr: _col2 + type: bigint + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select a, count(distinct b), count(distinct c), sum(d) from abcd group by a +PREHOOK: type: QUERY +PREHOOK: Input: default@abcd +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-19_03-08-12_925_5142539455043997266/-mr-10000 +POSTHOOK: query: select a, count(distinct b), count(distinct c), sum(d) from abcd group by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@abcd +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-19_03-08-12_925_5142539455043997266/-mr-10000 +NULL 1 1 6 +10 2 2 10 +12 1 2 9 +100 1 1 3 +PREHOOK: query: explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), 
count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF abcd)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL a))) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL b))) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL b))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL b) (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL c) (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL b) (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b) (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL b) (TOK_TABLE_OR_COL c) (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL c) (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b) (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b) (TOK_TABLE_OR_COL c) (TOK_TABLE_OR_COL d)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + abcd + TableScan + alias: abcd + Select Operator + expressions: + expr: a + type: int + expr: b + type: int + expr: c + type: int + expr: d + type: int + outputColumnNames: a, b, c, d + Group By Operator + aggregations: + expr: count(1) + expr: count() + expr: count(a) + expr: count(b) + expr: count(c) + expr: count(d) + expr: count(DISTINCT a) + expr: count(DISTINCT b) + expr: count(DISTINCT c) + expr: count(DISTINCT d) + expr: count(DISTINCT a, b) + expr: count(DISTINCT b, c) + expr: count(DISTINCT c, d) + expr: count(DISTINCT a, d) + expr: count(DISTINCT a, c) + expr: count(DISTINCT b, d) + expr: count(DISTINCT a, b, c) + expr: count(DISTINCT b, c, d) + expr: count(DISTINCT a, c, d) + expr: count(DISTINCT a, b, d) + expr: count(DISTINCT a, b, c, d) + bucketGroup: false + keys: + expr: a + type: int + expr: b + type: int + expr: c + type: int + expr: d + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23, _col24 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: int + expr: _col3 + type: int + sort order: ++++ + tag: -1 + value expressions: + expr: _col4 + type: bigint + expr: _col5 + type: bigint + expr: _col6 + type: bigint + expr: _col7 + type: 
bigint + expr: _col8 + type: bigint + expr: _col9 + type: bigint + expr: _col10 + type: bigint + expr: _col11 + type: bigint + expr: _col12 + type: bigint + expr: _col13 + type: bigint + expr: _col14 + type: bigint + expr: _col15 + type: bigint + expr: _col16 + type: bigint + expr: _col17 + type: bigint + expr: _col18 + type: bigint + expr: _col19 + type: bigint + expr: _col20 + type: bigint + expr: _col21 + type: bigint + expr: _col22 + type: bigint + expr: _col23 + type: bigint + expr: _col24 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + expr: count(VALUE._col1) + expr: count(VALUE._col2) + expr: count(VALUE._col3) + expr: count(VALUE._col4) + expr: count(VALUE._col5) + expr: count(DISTINCT KEY._col0:0._col0) + expr: count(DISTINCT KEY._col0:1._col0) + expr: count(DISTINCT KEY._col0:2._col0) + expr: count(DISTINCT KEY._col0:3._col0) + expr: count(DISTINCT KEY._col0:4._col0, KEY._col0:4._col1) + expr: count(DISTINCT KEY._col0:5._col0, KEY._col0:5._col1) + expr: count(DISTINCT KEY._col0:6._col0, KEY._col0:6._col1) + expr: count(DISTINCT KEY._col0:7._col0, KEY._col0:7._col1) + expr: count(DISTINCT KEY._col0:8._col0, KEY._col0:8._col1) + expr: count(DISTINCT KEY._col0:9._col0, KEY._col0:9._col1) + expr: count(DISTINCT KEY._col0:10._col0, KEY._col0:10._col1, KEY._col0:10._col2) + expr: count(DISTINCT KEY._col0:11._col0, KEY._col0:11._col1, KEY._col0:11._col2) + expr: count(DISTINCT KEY._col0:12._col0, KEY._col0:12._col1, KEY._col0:12._col2) + expr: count(DISTINCT KEY._col0:13._col0, KEY._col0:13._col1, KEY._col0:13._col2) + expr: count(DISTINCT KEY._col0:14._col0, KEY._col0:14._col1, KEY._col0:14._col2, KEY._col0:14._col3) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 + Select Operator + expressions: + expr: _col0 + type: bigint + expr: _col1 + type: bigint + expr: _col2 + type: bigint + expr: _col3 + type: bigint + expr: _col4 + type: bigint + expr: _col5 + type: bigint + expr: _col6 + type: bigint + expr: _col7 + type: bigint + expr: _col8 + type: bigint + expr: _col9 + type: bigint + expr: _col10 + type: bigint + expr: _col11 + type: bigint + expr: _col12 + type: bigint + expr: _col13 + type: bigint + expr: _col14 + type: bigint + expr: _col15 + type: bigint + expr: _col16 + type: bigint + expr: _col17 + type: bigint + expr: _col18 + type: bigint + expr: _col19 + type: bigint + expr: _col20 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +PREHOOK: type: QUERY +PREHOOK: Input: default@abcd +PREHOOK: Output: 
file:/tmp/amarsri/hive_2010-10-19_03-08-15_550_8269692453214848200/-mr-10000 +POSTHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +POSTHOOK: type: QUERY +POSTHOOK: Input: default@abcd +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-19_03-08-15_550_8269692453214848200/-mr-10000 +7 7 6 6 6 7 3 3 6 7 4 5 6 6 5 6 4 5 5 5 4 +PREHOOK: query: explain select a, count(distinct b), count(distinct c), sum(d) from abcd group by a +PREHOOK: type: QUERY +POSTHOOK: query: explain select a, count(distinct b), count(distinct c), sum(d) from abcd group by a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF abcd)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL b))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL d)))) (TOK_GROUPBY (TOK_TABLE_OR_COL a)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + abcd + TableScan + alias: abcd + Select Operator + expressions: + expr: a + type: int + expr: b + type: int + expr: c + type: int + expr: d + type: int + outputColumnNames: a, b, c, d + Reduce Output Operator + key expressions: + expr: a + type: int + expr: b + type: int + expr: c + type: int + sort order: +++ + Map-reduce partition columns: + expr: a + type: int + tag: -1 + value expressions: + expr: d + type: int + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(DISTINCT KEY._col1:0._col0) + expr: count(DISTINCT KEY._col1:1._col0) + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: complete + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + expr: _col2 + type: bigint + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select a, count(distinct b), count(distinct c), sum(d) from abcd group by a +PREHOOK: type: QUERY +PREHOOK: Input: default@abcd +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-19_03-08-18_476_7776659377634244729/-mr-10000 +POSTHOOK: query: select a, count(distinct b), count(distinct c), sum(d) from abcd group by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@abcd +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-19_03-08-18_476_7776659377634244729/-mr-10000 +NULL 1 1 6 +10 2 2 10 +12 1 2 9 +100 1 1 3 +PREHOOK: query: explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct 
a,b,c,d) from abcd +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF abcd)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL a))) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL b))) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL b))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL b) (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL c) (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL b) (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b) (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL b) (TOK_TABLE_OR_COL c) (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL c) (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b) (TOK_TABLE_OR_COL d))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b) (TOK_TABLE_OR_COL c) (TOK_TABLE_OR_COL d)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + abcd + TableScan + alias: abcd + Select Operator + expressions: + expr: a + type: int + expr: b + type: int + expr: c + type: int + expr: d + type: int + outputColumnNames: a, b, c, d + Reduce Output Operator + key expressions: + expr: a + type: int + expr: b + type: int + expr: c + type: int + expr: d + type: int + sort order: ++++ + tag: -1 + value expressions: + expr: 1 + type: int + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + expr: count() + expr: count(KEY._col0:14._col0) + expr: count(KEY._col0:14._col1) + expr: count(KEY._col0:14._col2) + expr: count(KEY._col0:14._col3) + expr: count(DISTINCT KEY._col0:0._col0) + expr: count(DISTINCT KEY._col0:1._col0) + expr: count(DISTINCT KEY._col0:2._col0) + expr: count(DISTINCT KEY._col0:3._col0) + expr: count(DISTINCT KEY._col0:4._col0, KEY._col0:4._col1) + expr: count(DISTINCT KEY._col0:5._col0, KEY._col0:5._col1) + expr: count(DISTINCT KEY._col0:6._col0, KEY._col0:6._col1) + expr: count(DISTINCT KEY._col0:7._col0, KEY._col0:7._col1) + expr: count(DISTINCT KEY._col0:8._col0, KEY._col0:8._col1) + expr: count(DISTINCT KEY._col0:9._col0, KEY._col0:9._col1) + expr: count(DISTINCT KEY._col0:10._col0, KEY._col0:10._col1, KEY._col0:10._col2) + expr: 
count(DISTINCT KEY._col0:11._col0, KEY._col0:11._col1, KEY._col0:11._col2) + expr: count(DISTINCT KEY._col0:12._col0, KEY._col0:12._col1, KEY._col0:12._col2) + expr: count(DISTINCT KEY._col0:13._col0, KEY._col0:13._col1, KEY._col0:13._col2) + expr: count(DISTINCT KEY._col0:14._col0, KEY._col0:14._col1, KEY._col0:14._col2, KEY._col0:14._col3) + bucketGroup: false + mode: complete + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 + Select Operator + expressions: + expr: _col0 + type: bigint + expr: _col1 + type: bigint + expr: _col2 + type: bigint + expr: _col3 + type: bigint + expr: _col4 + type: bigint + expr: _col5 + type: bigint + expr: _col6 + type: bigint + expr: _col7 + type: bigint + expr: _col8 + type: bigint + expr: _col9 + type: bigint + expr: _col10 + type: bigint + expr: _col11 + type: bigint + expr: _col12 + type: bigint + expr: _col13 + type: bigint + expr: _col14 + type: bigint + expr: _col15 + type: bigint + expr: _col16 + type: bigint + expr: _col17 + type: bigint + expr: _col18 + type: bigint + expr: _col19 + type: bigint + expr: _col20 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +PREHOOK: type: QUERY +PREHOOK: Input: default@abcd +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-19_03-08-20_818_3596074897102517680/-mr-10000 +POSTHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +POSTHOOK: type: QUERY +POSTHOOK: Input: default@abcd +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-19_03-08-20_818_3596074897102517680/-mr-10000 +7 7 6 6 6 7 3 3 6 7 4 5 6 6 5 6 4 5 5 5 4 Index: ql/src/test/results/clientpositive/groupby2.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby2.q.out (revision 1026947) +++ ql/src/test/results/clientpositive/groupby2.q.out (working copy) @@ -50,8 +50,8 @@ Reduce Operator Tree: Group By Operator aggregations: - expr: count(DISTINCT KEY._col1) - expr: sum(KEY._col1) + expr: count(DISTINCT KEY._col1:0._col0) + expr: sum(KEY._col1:0._col0) bucketGroup: false keys: expr: KEY._col0 Index: ql/src/test/results/clientpositive/groupby2_map.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby2_map.q.out (revision 1026947) +++ 
ql/src/test/results/clientpositive/groupby2_map.q.out (working copy) @@ -64,7 +64,7 @@ Reduce Operator Tree: Group By Operator aggregations: - expr: count(DISTINCT KEY._col1) + expr: count(DISTINCT KEY._col1:0._col0) expr: sum(VALUE._col1) bucketGroup: false keys: Index: ql/src/test/results/clientpositive/groupby2_map_multi_distinct.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby2_map_multi_distinct.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby2_map_multi_distinct.q.out (revision 0) @@ -0,0 +1,169 @@ +PREHOOK: query: CREATE TABLE dest1(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest1(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest1 +PREHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION concat (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1) (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5)))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION count (. (TOK_TABLE_OR_COL src) value)))) (TOK_GROUPBY (TOK_FUNCTION substr (. 
(TOK_TABLE_OR_COL src) key) 1 1)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + Select Operator + expressions: + expr: key + type: string + expr: value + type: string + outputColumnNames: key, value + Group By Operator + aggregations: + expr: count(DISTINCT substr(value, 5)) + expr: sum(substr(value, 5)) + expr: sum(DISTINCT substr(value, 5)) + expr: count(value) + bucketGroup: false + keys: + expr: substr(key, 1, 1) + type: string + expr: substr(value, 5) + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col2 + type: bigint + expr: _col3 + type: double + expr: _col4 + type: double + expr: _col5 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(DISTINCT KEY._col1:0._col0) + expr: sum(VALUE._col1) + expr: sum(DISTINCT KEY._col1:1._col0) + expr: count(VALUE._col3) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: concat(_col0, _col2) + type: string + expr: _col3 + type: double + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToInteger(_col1) + type: int + expr: _col2 + type: string + expr: UDFToInteger(_col3) + type: int + expr: UDFToInteger(_col4) + type: int + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE dest1 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@dest1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE dest1 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@dest1 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(src)src.FieldSchema(name:value, 
type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: SELECT dest1.* FROM dest1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-03-34_053_4042312084657375262/-mr-10000 +POSTHOOK: query: SELECT dest1.* FROM dest1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-03-34_053_4042312084657375262/-mr-10000 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +0 1 00.0 0 3 +1 71 116414.0 10044 115 +2 69 225571.0 15780 111 +3 62 332004.0 20119 99 +4 74 452763.0 30965 124 +5 6 5397.0 278 10 +6 5 6398.0 331 6 +7 6 7735.0 447 10 +8 8 8762.0 595 10 +9 7 91047.0 577 12 Index: ql/src/test/results/clientpositive/groupby2_map_skew.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby2_map_skew.q.out (revision 1026947) +++ ql/src/test/results/clientpositive/groupby2_map_skew.q.out (working copy) @@ -67,7 +67,7 @@ Reduce Operator Tree: Group By Operator aggregations: - expr: count(DISTINCT KEY._col1) + expr: count(DISTINCT KEY._col1:0._col0) expr: sum(VALUE._col1) bucketGroup: false keys: Index: ql/src/test/results/clientpositive/groupby2_map_skew_multi_distinct.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby2_map_skew_multi_distinct.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby2_map_skew_multi_distinct.q.out (revision 0) @@ -0,0 +1,169 @@ +PREHOOK: query: CREATE TABLE dest1(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest1(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest1 +PREHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION concat (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1) (TOK_FUNCTION sum (TOK_FUNCTION substr (. 
(TOK_TABLE_OR_COL src) value) 5)))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION count (. (TOK_TABLE_OR_COL src) value)))) (TOK_GROUPBY (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + Select Operator + expressions: + expr: key + type: string + expr: value + type: string + outputColumnNames: key, value + Group By Operator + aggregations: + expr: count(DISTINCT substr(value, 5)) + expr: sum(substr(value, 5)) + expr: sum(DISTINCT substr(value, 5)) + expr: count(value) + bucketGroup: false + keys: + expr: substr(key, 1, 1) + type: string + expr: substr(value, 5) + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col2 + type: bigint + expr: _col3 + type: double + expr: _col4 + type: double + expr: _col5 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(DISTINCT KEY._col1:0._col0) + expr: sum(VALUE._col1) + expr: sum(DISTINCT KEY._col1:1._col0) + expr: count(VALUE._col3) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: concat(_col0, _col2) + type: string + expr: _col3 + type: double + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToInteger(_col1) + type: int + expr: _col2 + type: string + expr: UDFToInteger(_col3) + type: int + expr: UDFToInteger(_col4) + type: int + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE dest1 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@dest1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE dest1 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@dest1 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: 
Lineage: dest1.c2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: SELECT dest1.* FROM dest1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-02-53_328_1602391311904578639/-mr-10000 +POSTHOOK: query: SELECT dest1.* FROM dest1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-02-53_328_1602391311904578639/-mr-10000 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +0 1 00.0 0 3 +1 71 116414.0 10044 115 +2 69 225571.0 15780 111 +3 62 332004.0 20119 99 +4 74 452763.0 30965 124 +5 6 5397.0 278 10 +6 5 6398.0 331 6 +7 6 7735.0 447 10 +8 8 8762.0 595 10 +9 7 91047.0 577 12 Index: ql/src/test/results/clientpositive/groupby2_multi_distinct.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby2_multi_distinct.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby2_multi_distinct.q.out (revision 0) @@ -0,0 +1,149 @@ +PREHOOK: query: CREATE TABLE dest_g2(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest_g2(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest_g2 +PREHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest_g2 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest_g2 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest_g2)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION concat (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1) (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5)))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION count (. (TOK_TABLE_OR_COL src) value)))) (TOK_GROUPBY (TOK_FUNCTION substr (. 
(TOK_TABLE_OR_COL src) key) 1 1)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + Select Operator + expressions: + expr: key + type: string + expr: value + type: string + outputColumnNames: key, value + Reduce Output Operator + key expressions: + expr: substr(key, 1, 1) + type: string + expr: substr(value, 5) + type: string + sort order: ++ + Map-reduce partition columns: + expr: substr(key, 1, 1) + type: string + tag: -1 + value expressions: + expr: value + type: string + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(DISTINCT KEY._col1:0._col0) + expr: sum(KEY._col1:1._col0) + expr: sum(DISTINCT KEY._col1:1._col0) + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: complete + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: concat(_col0, _col2) + type: string + expr: _col3 + type: double + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToInteger(_col1) + type: int + expr: _col2 + type: string + expr: UDFToInteger(_col3) + type: int + expr: UDFToInteger(_col4) + type: int + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest_g2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest_g2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE dest_g2 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@dest_g2 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE dest_g2 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@dest_g2 +POSTHOOK: Lineage: dest_g2.c1 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_g2.c2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_g2.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_g2.c4 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest_g2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: SELECT dest_g2.* FROM dest_g2 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_g2 +PREHOOK: Output: 
file:/tmp/amarsri/hive_2010-10-20_03-01-20_611_7544013891892050730/-mr-10000 +POSTHOOK: query: SELECT dest_g2.* FROM dest_g2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_g2 +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-01-20_611_7544013891892050730/-mr-10000 +POSTHOOK: Lineage: dest_g2.c1 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_g2.c2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_g2.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_g2.c4 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest_g2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +0 1 00.0 0 3 +1 71 116414.0 10044 115 +2 69 225571.0 15780 111 +3 62 332004.0 20119 99 +4 74 452763.0 30965 124 +5 6 5397.0 278 10 +6 5 6398.0 331 6 +7 6 7735.0 447 10 +8 8 8762.0 595 10 +9 7 91047.0 577 12 Index: ql/src/test/results/clientpositive/groupby2_noskew.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby2_noskew.q.out (revision 1026947) +++ ql/src/test/results/clientpositive/groupby2_noskew.q.out (working copy) @@ -47,8 +47,8 @@ Reduce Operator Tree: Group By Operator aggregations: - expr: count(DISTINCT KEY._col1) - expr: sum(KEY._col1) + expr: count(DISTINCT KEY._col1:0._col0) + expr: sum(KEY._col1:0._col0) bucketGroup: false keys: expr: KEY._col0 Index: ql/src/test/results/clientpositive/groupby2_noskew_multi_distinct.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby2_noskew_multi_distinct.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby2_noskew_multi_distinct.q.out (revision 0) @@ -0,0 +1,149 @@ +PREHOOK: query: CREATE TABLE dest_g2(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest_g2(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest_g2 +PREHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest_g2 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest_g2 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest_g2)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION concat (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1) (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5)))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION count (. (TOK_TABLE_OR_COL src) value)))) (TOK_GROUPBY (TOK_FUNCTION substr (. 
(TOK_TABLE_OR_COL src) key) 1 1)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + Select Operator + expressions: + expr: key + type: string + expr: value + type: string + outputColumnNames: key, value + Reduce Output Operator + key expressions: + expr: substr(key, 1, 1) + type: string + expr: substr(value, 5) + type: string + sort order: ++ + Map-reduce partition columns: + expr: substr(key, 1, 1) + type: string + tag: -1 + value expressions: + expr: value + type: string + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(DISTINCT KEY._col1:0._col0) + expr: sum(KEY._col1:1._col0) + expr: sum(DISTINCT KEY._col1:1._col0) + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: complete + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: concat(_col0, _col2) + type: string + expr: _col3 + type: double + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToInteger(_col1) + type: int + expr: _col2 + type: string + expr: UDFToInteger(_col3) + type: int + expr: UDFToInteger(_col4) + type: int + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest_g2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest_g2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE dest_g2 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@dest_g2 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE dest_g2 SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(src.value) GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@dest_g2 +POSTHOOK: Lineage: dest_g2.c1 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_g2.c2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_g2.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_g2.c4 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest_g2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: SELECT dest_g2.* FROM dest_g2 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_g2 +PREHOOK: Output: 
file:/tmp/amarsri/hive_2010-10-20_03-04-17_707_2990757298865083943/-mr-10000 +POSTHOOK: query: SELECT dest_g2.* FROM dest_g2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_g2 +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-04-17_707_2990757298865083943/-mr-10000 +POSTHOOK: Lineage: dest_g2.c1 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_g2.c2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_g2.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest_g2.c4 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest_g2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +0 1 00.0 0 3 +1 71 116414.0 10044 115 +2 69 225571.0 15780 111 +3 62 332004.0 20119 99 +4 74 452763.0 30965 124 +5 6 5397.0 278 10 +6 5 6398.0 331 6 +7 6 7735.0 447 10 +8 8 8762.0 595 10 +9 7 91047.0 577 12 Index: ql/src/test/results/clientpositive/groupby3.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby3.q.out (revision 1026947) +++ ql/src/test/results/clientpositive/groupby3.q.out (working copy) @@ -62,15 +62,15 @@ Reduce Operator Tree: Group By Operator aggregations: - expr: sum(KEY._col0) - expr: avg(KEY._col0) - expr: avg(DISTINCT KEY._col0) - expr: max(KEY._col0) - expr: min(KEY._col0) - expr: std(KEY._col0) - expr: stddev_samp(KEY._col0) - expr: variance(KEY._col0) - expr: var_samp(KEY._col0) + expr: sum(KEY._col0:0._col0) + expr: avg(KEY._col0:0._col0) + expr: avg(DISTINCT KEY._col0:0._col0) + expr: max(KEY._col0:0._col0) + expr: min(KEY._col0:0._col0) + expr: std(KEY._col0:0._col0) + expr: stddev_samp(KEY._col0:0._col0) + expr: variance(KEY._col0:0._col0) + expr: var_samp(KEY._col0:0._col0) bucketGroup: false mode: partial1 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 Index: ql/src/test/results/clientpositive/groupby3_map.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby3_map.q.out (revision 1026947) +++ ql/src/test/results/clientpositive/groupby3_map.q.out (working copy) @@ -96,7 +96,7 @@ aggregations: expr: sum(VALUE._col0) expr: avg(VALUE._col1) - expr: avg(DISTINCT KEY._col0) + expr: avg(DISTINCT KEY._col0:0._col0) expr: max(VALUE._col3) expr: min(VALUE._col4) expr: std(VALUE._col5) Index: ql/src/test/results/clientpositive/groupby3_map_multi_distinct.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby3_map_multi_distinct.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby3_map_multi_distinct.q.out (revision 0) @@ -0,0 +1,256 @@ +PREHOOK: query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE, c10 DOUBLE, c11 DOUBLE) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE, c10 DOUBLE, c11 DOUBLE) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest1 +PREHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + 
std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION max (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION min (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION std (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION stddev_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION variance (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION var_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. 
(TOK_TABLE_OR_COL src) value) 5)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + Select Operator + expressions: + expr: value + type: string + outputColumnNames: value + Group By Operator + aggregations: + expr: sum(substr(value, 5)) + expr: avg(substr(value, 5)) + expr: avg(DISTINCT substr(value, 5)) + expr: max(substr(value, 5)) + expr: min(substr(value, 5)) + expr: std(substr(value, 5)) + expr: stddev_samp(substr(value, 5)) + expr: variance(substr(value, 5)) + expr: var_samp(substr(value, 5)) + expr: sum(DISTINCT substr(value, 5)) + expr: count(DISTINCT substr(value, 5)) + bucketGroup: false + keys: + expr: substr(value, 5) + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + tag: -1 + value expressions: + expr: _col1 + type: double + expr: _col2 + type: struct + expr: _col3 + type: struct + expr: _col4 + type: string + expr: _col5 + type: string + expr: _col6 + type: struct + expr: _col7 + type: struct + expr: _col8 + type: struct + expr: _col9 + type: struct + expr: _col10 + type: double + expr: _col11 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + expr: avg(VALUE._col1) + expr: avg(DISTINCT KEY._col0:0._col0) + expr: max(VALUE._col3) + expr: min(VALUE._col4) + expr: std(VALUE._col5) + expr: stddev_samp(VALUE._col6) + expr: variance(VALUE._col7) + expr: var_samp(VALUE._col8) + expr: sum(DISTINCT KEY._col0:1._col0) + expr: count(DISTINCT KEY._col0:2._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: double + expr: _col2 + type: double + expr: _col3 + type: string + expr: _col4 + type: string + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + expr: _col9 + type: double + expr: _col10 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: double + expr: _col2 + type: double + expr: UDFToDouble(_col3) + type: double + expr: UDFToDouble(_col4) + type: double + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + expr: _col9 + type: double + expr: UDFToDouble(_col10) + type: double + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE dest1 SELECT + 
sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@dest1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@dest1 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c10 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c11 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c5 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c6 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c7 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c8 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c9 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT dest1.* FROM dest1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_02-58-52_499_6413045004889165690/-mr-10000 +POSTHOOK: query: SELECT dest1.* FROM dest1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_02-58-52_499_6413045004889165690/-mr-10000 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c10 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c11 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c5 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c6 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c7 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c8 EXPRESSION [(src)src.FieldSchema(name:value, 
type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c9 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +130091.0 260.182 256.10355987055016 98.0 0.0 142.9268095075238 143.06995106518906 20428.072876 20469.01089779559 79136.0 309.0 Index: ql/src/test/results/clientpositive/groupby3_map_skew.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby3_map_skew.q.out (revision 1026947) +++ ql/src/test/results/clientpositive/groupby3_map_skew.q.out (working copy) @@ -100,7 +100,7 @@ aggregations: expr: sum(VALUE._col0) expr: avg(VALUE._col1) - expr: avg(DISTINCT KEY._col0) + expr: avg(DISTINCT KEY._col0:0._col0) expr: max(VALUE._col3) expr: min(VALUE._col4) expr: std(VALUE._col5) Index: ql/src/test/results/clientpositive/groupby3_map_skew_multi_distinct.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby3_map_skew_multi_distinct.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby3_map_skew_multi_distinct.q.out (revision 0) @@ -0,0 +1,256 @@ +PREHOOK: query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE, c10 DOUBLE, c11 DOUBLE) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE, c10 DOUBLE, c11 DOUBLE) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest1 +PREHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION max (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION min (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION std (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION stddev_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION variance (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION var_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_FUNCTION substr (. 
(TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + Select Operator + expressions: + expr: value + type: string + outputColumnNames: value + Group By Operator + aggregations: + expr: sum(substr(value, 5)) + expr: avg(substr(value, 5)) + expr: avg(DISTINCT substr(value, 5)) + expr: max(substr(value, 5)) + expr: min(substr(value, 5)) + expr: std(substr(value, 5)) + expr: stddev_samp(substr(value, 5)) + expr: variance(substr(value, 5)) + expr: var_samp(substr(value, 5)) + expr: sum(DISTINCT substr(value, 5)) + expr: count(DISTINCT substr(value, 5)) + bucketGroup: false + keys: + expr: substr(value, 5) + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + tag: -1 + value expressions: + expr: _col1 + type: double + expr: _col2 + type: struct + expr: _col3 + type: struct + expr: _col4 + type: string + expr: _col5 + type: string + expr: _col6 + type: struct + expr: _col7 + type: struct + expr: _col8 + type: struct + expr: _col9 + type: struct + expr: _col10 + type: double + expr: _col11 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + expr: avg(VALUE._col1) + expr: avg(DISTINCT KEY._col0:0._col0) + expr: max(VALUE._col3) + expr: min(VALUE._col4) + expr: std(VALUE._col5) + expr: stddev_samp(VALUE._col6) + expr: variance(VALUE._col7) + expr: var_samp(VALUE._col8) + expr: sum(DISTINCT KEY._col0:1._col0) + expr: count(DISTINCT KEY._col0:2._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: double + expr: _col2 + type: double + expr: _col3 + type: string + expr: _col4 + type: string + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + expr: _col9 + type: double + expr: _col10 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: double + expr: _col2 + type: double + expr: UDFToDouble(_col3) + type: double + expr: UDFToDouble(_col4) + type: double + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + expr: _col9 + type: double + expr: UDFToDouble(_col10) + type: double + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + + Stage: Stage-2 + 
Stats-Aggr Operator + + +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@dest1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@dest1 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c10 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c11 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c5 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c6 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c7 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c8 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c9 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT dest1.* FROM dest1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_02-59-36_097_8563942409739966059/-mr-10000 +POSTHOOK: query: SELECT dest1.* FROM dest1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_02-59-36_097_8563942409739966059/-mr-10000 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c10 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c11 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c5 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c6 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c7 EXPRESSION [(src)src.FieldSchema(name:value, type:string, 
comment:default), ] +POSTHOOK: Lineage: dest1.c8 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c9 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +130091.0 260.182 256.10355987055016 98.0 0.0 142.9268095075238 143.06995106518906 20428.072876 20469.01089779559 79136.0 309.0 Index: ql/src/test/results/clientpositive/groupby3_multi_distinct.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby3_multi_distinct.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby3_multi_distinct.q.out (revision 0) @@ -0,0 +1,214 @@ +PREHOOK: query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE, c10 DOUBLE, c11 DOUBLE) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE, c10 DOUBLE, c11 DOUBLE) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest1 +PREHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION max (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION min (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION std (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION stddev_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION variance (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION var_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. 
(TOK_TABLE_OR_COL src) value) 5)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + Select Operator + expressions: + expr: value + type: string + outputColumnNames: value + Reduce Output Operator + key expressions: + expr: substr(value, 5) + type: string + sort order: + + tag: -1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(KEY._col0:2._col0) + expr: avg(KEY._col0:2._col0) + expr: avg(DISTINCT KEY._col0:0._col0) + expr: max(KEY._col0:2._col0) + expr: min(KEY._col0:2._col0) + expr: std(KEY._col0:2._col0) + expr: stddev_samp(KEY._col0:2._col0) + expr: variance(KEY._col0:2._col0) + expr: var_samp(KEY._col0:2._col0) + expr: sum(DISTINCT KEY._col0:1._col0) + expr: count(DISTINCT KEY._col0:2._col0) + bucketGroup: false + mode: complete + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: double + expr: _col2 + type: double + expr: _col3 + type: string + expr: _col4 + type: string + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + expr: _col9 + type: double + expr: _col10 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: double + expr: _col2 + type: double + expr: UDFToDouble(_col3) + type: double + expr: UDFToDouble(_col4) + type: double + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + expr: _col9 + type: double + expr: UDFToDouble(_col10) + type: double + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@dest1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +POSTHOOK: type: QUERY +POSTHOOK: Input: 
default@src +POSTHOOK: Output: default@dest1 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c10 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c11 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c5 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c6 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c7 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c8 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c9 EXPRESSION [(src)src.null, ] +PREHOOK: query: SELECT dest1.* FROM dest1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-00-24_510_2707611074709398280/-mr-10000 +POSTHOOK: query: SELECT dest1.* FROM dest1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-00-24_510_2707611074709398280/-mr-10000 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c10 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c11 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c5 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c6 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c7 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c8 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c9 EXPRESSION [(src)src.null, ] +130091.0 260.182 256.10355987055016 98.0 0.0 142.92680950752379 143.06995106518903 20428.07287599999 20469.010897795582 79136.0 309.0 Index: ql/src/test/results/clientpositive/groupby3_noskew.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby3_noskew.q.out (revision 1026947) +++ ql/src/test/results/clientpositive/groupby3_noskew.q.out (working copy) @@ -58,15 +58,15 @@ Reduce Operator Tree: Group By Operator aggregations: - expr: sum(KEY._col0) - expr: avg(KEY._col0) - expr: avg(DISTINCT KEY._col0) - expr: max(KEY._col0) - expr: min(KEY._col0) - expr: std(KEY._col0) - expr: stddev_samp(KEY._col0) - expr: variance(KEY._col0) - expr: var_samp(KEY._col0) + expr: sum(KEY._col0:0._col0) + expr: avg(KEY._col0:0._col0) + expr: avg(DISTINCT KEY._col0:0._col0) + expr: max(KEY._col0:0._col0) + expr: min(KEY._col0:0._col0) + expr: std(KEY._col0:0._col0) + expr: stddev_samp(KEY._col0:0._col0) + expr: variance(KEY._col0:0._col0) + expr: var_samp(KEY._col0:0._col0) bucketGroup: false mode: complete outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 Index: ql/src/test/results/clientpositive/groupby3_noskew_multi_distinct.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby3_noskew_multi_distinct.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby3_noskew_multi_distinct.q.out (revision 0) @@ -0,0 +1,214 @@ +PREHOOK: query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE, c10 DOUBLE, c11 DOUBLE) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest1(c1 
DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE, c10 DOUBLE, c11 DOUBLE) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest1 +PREHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION max (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION min (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION std (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION stddev_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION variance (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION var_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. 
(TOK_TABLE_OR_COL src) value) 5)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + Select Operator + expressions: + expr: value + type: string + outputColumnNames: value + Reduce Output Operator + key expressions: + expr: substr(value, 5) + type: string + sort order: + + tag: -1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(KEY._col0:2._col0) + expr: avg(KEY._col0:2._col0) + expr: avg(DISTINCT KEY._col0:0._col0) + expr: max(KEY._col0:2._col0) + expr: min(KEY._col0:2._col0) + expr: std(KEY._col0:2._col0) + expr: stddev_samp(KEY._col0:2._col0) + expr: variance(KEY._col0:2._col0) + expr: var_samp(KEY._col0:2._col0) + expr: sum(DISTINCT KEY._col0:1._col0) + expr: count(DISTINCT KEY._col0:2._col0) + bucketGroup: false + mode: complete + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: double + expr: _col2 + type: double + expr: _col3 + type: string + expr: _col4 + type: string + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + expr: _col9 + type: double + expr: _col10 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: double + expr: _col2 + type: double + expr: UDFToDouble(_col3) + type: double + expr: UDFToDouble(_col4) + type: double + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + expr: _col9 + type: double + expr: UDFToDouble(_col10) + type: double + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@dest1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)), + sum(DISTINCT substr(src.value, 5)), + count(DISTINCT substr(src.value, 5)) +POSTHOOK: type: QUERY +POSTHOOK: Input: 
default@src +POSTHOOK: Output: default@dest1 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c10 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c11 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c5 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c6 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c7 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c8 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c9 EXPRESSION [(src)src.null, ] +PREHOOK: query: SELECT dest1.* FROM dest1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_02-58-11_371_9192473715365716502/-mr-10000 +POSTHOOK: query: SELECT dest1.* FROM dest1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_02-58-11_371_9192473715365716502/-mr-10000 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c10 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c11 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c5 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c6 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c7 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c8 EXPRESSION [(src)src.null, ] +POSTHOOK: Lineage: dest1.c9 EXPRESSION [(src)src.null, ] +130091.0 260.182 256.10355987055016 98.0 0.0 142.92680950752379 143.06995106518903 20428.07287599999 20469.010897795582 79136.0 309.0 Index: ql/src/test/results/clientpositive/groupby_map_ppr.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby_map_ppr.q.out (revision 1026947) +++ ql/src/test/results/clientpositive/groupby_map_ppr.q.out (working copy) @@ -168,7 +168,7 @@ Reduce Operator Tree: Group By Operator aggregations: - expr: count(DISTINCT KEY._col1) + expr: count(DISTINCT KEY._col1:0._col0) expr: sum(VALUE._col1) bucketGroup: false keys: Index: ql/src/test/results/clientpositive/groupby_map_ppr_multi_distinct.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby_map_ppr_multi_distinct.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby_map_ppr_multi_distinct.q.out (revision 0) @@ -0,0 +1,318 @@ +PREHOOK: query: CREATE TABLE dest1(key STRING, c1 INT, c2 STRING, C3 INT, c4 INT) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest1(key STRING, c1 INT, c2 STRING, C3 INT, c4 INT) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest1 +PREHOOK: query: EXPLAIN EXTENDED +FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN EXTENDED +FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT 
substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF srcpart src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION concat (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1) (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5)))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI count (. (TOK_TABLE_OR_COL src) value)))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL src) ds) '2008-04-08')) (TOK_GROUPBY (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: + expr: (ds = '2008-04-08') + type: boolean + Filter Operator + isSamplingPred: false + predicate: + expr: (ds = '2008-04-08') + type: boolean + Select Operator + expressions: + expr: key + type: string + expr: value + type: string + outputColumnNames: key, value + Group By Operator + aggregations: + expr: count(DISTINCT substr(value, 5)) + expr: sum(substr(value, 5)) + expr: sum(DISTINCT substr(value, 5)) + expr: count(DISTINCT value) + bucketGroup: false + keys: + expr: substr(key, 1, 1) + type: string + expr: substr(value, 5) + type: string + expr: value + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: string + sort order: +++ + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col3 + type: bigint + expr: _col4 + type: double + expr: _col5 + type: double + expr: _col6 + type: bigint + Needs Tagging: false + Path -> Alias: + pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=11 [src] + pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=12 [src] + Path -> Partition: + pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=11 + Partition + base file name: hr=11 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + hr 11 + properties: + bucket_count -1 + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart + name srcpart + partition_columns ds/hr + serialization.ddl struct srcpart { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1287568503 + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart + name srcpart + partition_columns ds/hr + serialization.ddl struct srcpart { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1287568503 + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: srcpart + name: srcpart + pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=12 + Partition + base file name: hr=12 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + hr 12 + properties: + bucket_count -1 + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart + name srcpart + partition_columns ds/hr + serialization.ddl struct srcpart { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1287568503 + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart + name srcpart + partition_columns ds/hr + serialization.ddl struct srcpart { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1287568503 + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: srcpart + name: srcpart + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(DISTINCT KEY._col1:0._col0) + expr: sum(VALUE._col1) + expr: sum(DISTINCT KEY._col1:1._col0) + expr: count(DISTINCT KEY._col1:2._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: concat(_col0, _col2) + type: string + expr: _col3 + type: double + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToInteger(_col1) + type: int + expr: _col2 + type: string + expr: UDFToInteger(_col3) + type: int + expr: UDFToInteger(_col4) + type: int + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 1 + directory: pfile:/home/amarsri/workspace/hive/build/ql/scratchdir/hive_2010-10-20_02-55-09_918_9018058416994975476/-ext-10000 + NumFilesPerFileSink: 1 + Stats Publishing Key Prefix: 
pfile:/home/amarsri/workspace/hive/build/ql/scratchdir/hive_2010-10-20_02-55-09_918_9018058416994975476/-ext-10000/ + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,c1,c2,c3,c4 + columns.types string:int:string:int:int + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/dest1 + name dest1 + serialization.ddl struct dest1 { string key, i32 c1, string c2, i32 c3, i32 c4} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1287568509 + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + + Stage: Stage-0 + Move Operator + tables: + replace: true + source: pfile:/home/amarsri/workspace/hive/build/ql/scratchdir/hive_2010-10-20_02-55-09_918_9018058416994975476/-ext-10000 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,c1,c2,c3,c4 + columns.types string:int:string:int:int + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/dest1 + name dest1 + serialization.ddl struct dest1 { string key, i32 c1, string c2, i32 c3, i32 c4} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1287568509 + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + tmp directory: pfile:/home/amarsri/workspace/hive/build/ql/scratchdir/hive_2010-10-20_02-55-09_918_9018058416994975476/-ext-10001 + + Stage: Stage-2 + Stats-Aggr Operator + Stats Aggregation Key Prefix: pfile:/home/amarsri/workspace/hive/build/ql/scratchdir/hive_2010-10-20_02-55-09_918_9018058416994975476/-ext-10000/ + + +PREHOOK: query: FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +PREHOOK: Output: default@dest1 +POSTHOOK: query: FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +POSTHOOK: Output: default@dest1 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), (srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ] 
+POSTHOOK: Lineage: dest1.c4 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: SELECT dest1.* FROM dest1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_02-55-17_568_1009308907147328020/-mr-10000 +POSTHOOK: query: SELECT dest1.* FROM dest1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_02-55-17_568_1009308907147328020/-mr-10000 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), (srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), ] +0 1 00.0 0 1 +1 71 132828.0 10044 71 +2 69 251142.0 15780 69 +3 62 364008.0 20119 62 +4 74 4105526.0 30965 74 +5 6 5794.0 278 6 +6 5 6796.0 331 5 +7 6 71470.0 447 6 +8 8 81524.0 595 8 +9 7 92094.0 577 7 Index: ql/src/test/results/clientpositive/groupby_ppr.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby_ppr.q.out (revision 1026947) +++ ql/src/test/results/clientpositive/groupby_ppr.q.out (working copy) @@ -151,8 +151,8 @@ Reduce Operator Tree: Group By Operator aggregations: - expr: count(DISTINCT KEY._col1) - expr: sum(KEY._col1) + expr: count(DISTINCT KEY._col1:0._col0) + expr: sum(KEY._col1:0._col0) bucketGroup: false keys: expr: KEY._col0 Index: ql/src/test/results/clientpositive/groupby_ppr_multi_distinct.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby_ppr_multi_distinct.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby_ppr_multi_distinct.q.out (revision 0) @@ -0,0 +1,293 @@ +PREHOOK: query: CREATE TABLE dest1(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest1(key STRING, c1 INT, c2 STRING, c3 INT, c4 INT) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest1 +PREHOOK: query: EXPLAIN EXTENDED +FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN EXTENDED +FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF srcpart src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. 
(TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION concat (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1) (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5)))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI count (. (TOK_TABLE_OR_COL src) value)))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL src) ds) '2008-04-08')) (TOK_GROUPBY (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: + expr: (ds = '2008-04-08') + type: boolean + Filter Operator + isSamplingPred: false + predicate: + expr: (ds = '2008-04-08') + type: boolean + Select Operator + expressions: + expr: key + type: string + expr: value + type: string + outputColumnNames: key, value + Reduce Output Operator + key expressions: + expr: substr(key, 1, 1) + type: string + expr: substr(value, 5) + type: string + expr: value + type: string + sort order: +++ + Map-reduce partition columns: + expr: substr(key, 1, 1) + type: string + tag: -1 + Needs Tagging: false + Path -> Alias: + pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=11 [src] + pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=12 [src] + Path -> Partition: + pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=11 + Partition + base file name: hr=11 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + hr 11 + properties: + bucket_count -1 + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart + name srcpart + partition_columns ds/hr + serialization.ddl struct srcpart { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1287568918 + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart + name srcpart + partition_columns ds/hr + serialization.ddl struct srcpart { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1287568918 + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: srcpart + name: srcpart + pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=12 + Partition + base file name: hr=12 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 
2008-04-08 + hr 12 + properties: + bucket_count -1 + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart + name srcpart + partition_columns ds/hr + serialization.ddl struct srcpart { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1287568918 + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/srcpart + name srcpart + partition_columns ds/hr + serialization.ddl struct srcpart { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1287568918 + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: srcpart + name: srcpart + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(DISTINCT KEY._col1:0._col0) + expr: sum(KEY._col1:1._col0) + expr: sum(DISTINCT KEY._col1:1._col0) + expr: count(DISTINCT KEY._col1:2._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: complete + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: concat(_col0, _col2) + type: string + expr: _col3 + type: double + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToInteger(_col1) + type: int + expr: _col2 + type: string + expr: UDFToInteger(_col3) + type: int + expr: UDFToInteger(_col4) + type: int + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 1 + directory: pfile:/home/amarsri/workspace/hive/build/ql/scratchdir/hive_2010-10-20_03-02-02_146_1431549693668387725/-ext-10000 + NumFilesPerFileSink: 1 + Stats Publishing Key Prefix: pfile:/home/amarsri/workspace/hive/build/ql/scratchdir/hive_2010-10-20_03-02-02_146_1431549693668387725/-ext-10000/ + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,c1,c2,c3,c4 + columns.types string:int:string:int:int + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/dest1 + name dest1 + serialization.ddl struct dest1 { string key, i32 c1, string c2, i32 c3, i32 c4} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1287568922 + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + + Stage: Stage-0 + Move Operator + tables: + replace: true + source: 
pfile:/home/amarsri/workspace/hive/build/ql/scratchdir/hive_2010-10-20_03-02-02_146_1431549693668387725/-ext-10000 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,c1,c2,c3,c4 + columns.types string:int:string:int:int + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/home/amarsri/workspace/hive/build/ql/test/data/warehouse/dest1 + name dest1 + serialization.ddl struct dest1 { string key, i32 c1, string c2, i32 c3, i32 c4} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1287568922 + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: dest1 + tmp directory: pfile:/home/amarsri/workspace/hive/build/ql/scratchdir/hive_2010-10-20_03-02-02_146_1431549693668387725/-ext-10001 + + Stage: Stage-2 + Stats-Aggr Operator + Stats Aggregation Key Prefix: pfile:/home/amarsri/workspace/hive/build/ql/scratchdir/hive_2010-10-20_03-02-02_146_1431549693668387725/-ext-10000/ + + +PREHOOK: query: FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +PREHOOK: Output: default@dest1 +POSTHOOK: query: FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +POSTHOOK: Output: default@dest1 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), (srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(srcpart)src.null, ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: SELECT dest1.* FROM dest1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-02-06_602_914264234722067634/-mr-10000 +POSTHOOK: query: SELECT dest1.* FROM dest1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-02-06_602_914264234722067634/-mr-10000 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), (srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(srcpart)src.null, ] 
+POSTHOOK: Lineage: dest1.key EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), ] +0 1 00.0 0 1 +1 71 132828.0 10044 71 +2 69 251142.0 15780 69 +3 62 364008.0 20119 62 +4 74 4105526.0 30965 74 +5 6 5794.0 278 6 +6 5 6796.0 331 5 +7 6 71470.0 447 6 +8 8 81524.0 595 8 +9 7 92094.0 577 7 Index: ql/src/test/results/clientpositive/join18.q.out =================================================================== --- ql/src/test/results/clientpositive/join18.q.out (revision 1026947) +++ ql/src/test/results/clientpositive/join18.q.out (working copy) @@ -75,7 +75,7 @@ Reduce Operator Tree: Group By Operator aggregations: - expr: count(DISTINCT KEY._col1) + expr: count(DISTINCT KEY._col1:0._col0) bucketGroup: false keys: expr: KEY._col0 Index: ql/src/test/results/clientpositive/join18_multi_distinct.q.out =================================================================== --- ql/src/test/results/clientpositive/join18_multi_distinct.q.out (revision 0) +++ ql/src/test/results/clientpositive/join18_multi_distinct.q.out (revision 0) @@ -0,0 +1,596 @@ +PREHOOK: query: EXPLAIN + SELECT a.key, a.value, b.key, b.value1, b.value2 + FROM + ( + SELECT src1.key as key, count(src1.value) AS value FROM src src1 group by src1.key + ) a + FULL OUTER JOIN + ( + SELECT src2.key as key, count(distinct(src2.value)) AS value1, + count(distinct(src2.key)) AS value2 + FROM src1 src2 group by src2.key + ) b + ON (a.key = b.key) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN + SELECT a.key, a.value, b.key, b.value1, b.value2 + FROM + ( + SELECT src1.key as key, count(src1.value) AS value FROM src src1 group by src1.key + ) a + FULL OUTER JOIN + ( + SELECT src2.key as key, count(distinct(src2.value)) AS value1, + count(distinct(src2.key)) AS value2 + FROM src1 src2 group by src2.key + ) b + ON (a.key = b.key) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_FULLOUTERJOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF src src1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL src1) key) key) (TOK_SELEXPR (TOK_FUNCTION count (. (TOK_TABLE_OR_COL src1) value)) value)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL src1) key)))) a) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF src1 src2)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL src2) key) key) (TOK_SELEXPR (TOK_FUNCTIONDI count (. (TOK_TABLE_OR_COL src2) value)) value1) (TOK_SELEXPR (TOK_FUNCTIONDI count (. (TOK_TABLE_OR_COL src2) key)) value2)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL src2) key)))) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value1)) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL b) value2))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-3 + Stage-3 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b:src2 + TableScan + alias: src2 + Select Operator + expressions: + expr: key + type: string + expr: value + type: string + outputColumnNames: key, value + Group By Operator + aggregations: + expr: count(DISTINCT value) + expr: count(DISTINCT key) + bucketGroup: false + keys: + expr: key + type: string + expr: value + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col2 + type: bigint + expr: _col3 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(DISTINCT KEY._col1:0._col0) + expr: count(DISTINCT KEY._col1:1._col0) + bucketGroup: false + keys: + expr: KEY._col1:1._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: bigint + $INTNAME1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + Reduce Operator Tree: + Join Operator + condition map: + Outer Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: string + expr: _col3 + type: bigint + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + a:src1 + TableScan + alias: src1 + Select Operator + expressions: + expr: key + type: string + expr: value + type: string + outputColumnNames: key, value + Group By Operator + aggregations: + expr: count(value) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + 
keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.key, a.value, b.key, b.value1, b.value2 + FROM + ( + SELECT src1.key as key, count(src1.value) AS value FROM src src1 group by src1.key + ) a + FULL OUTER JOIN + ( + SELECT src2.key as key, count(distinct(src2.value)) AS value1, + count(distinct(src2.key)) AS value2 + FROM src1 src2 group by src2.key + ) b + ON (a.key = b.key) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-06-44_648_893234056742394843/-mr-10000 +POSTHOOK: query: SELECT a.key, a.value, b.key, b.value1, b.value2 + FROM + ( + SELECT src1.key as key, count(src1.value) AS value FROM src src1 group by src1.key + ) a + FULL OUTER JOIN + ( + SELECT src2.key as key, count(distinct(src2.value)) AS value1, + count(distinct(src2.key)) AS value2 + FROM src1 src2 group by src2.key + ) b + ON (a.key = b.key) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-06-44_648_893234056742394843/-mr-10000 +NULL NULL 1 0 +NULL NULL 1 0 +NULL NULL 1 0 +NULL NULL 1 1 +0 3 NULL NULL NULL +10 1 NULL NULL NULL +100 2 NULL NULL NULL +103 2 NULL NULL NULL +104 2 NULL NULL NULL +105 1 NULL NULL NULL +11 1 NULL NULL NULL +111 1 NULL NULL NULL +113 2 NULL NULL NULL +114 1 NULL NULL NULL +116 1 NULL NULL NULL +118 2 NULL NULL NULL +119 3 NULL NULL NULL +12 2 NULL NULL NULL +120 2 NULL NULL NULL +125 2 NULL NULL NULL +126 1 NULL NULL NULL +128 3 128 0 1 +129 2 NULL NULL NULL +131 1 NULL NULL NULL +133 1 NULL NULL NULL +134 2 NULL NULL NULL +136 1 NULL NULL NULL +137 2 NULL NULL NULL +138 4 NULL NULL NULL +143 1 NULL NULL NULL +145 1 NULL NULL NULL +146 2 146 0 1 +149 2 NULL NULL NULL +15 2 NULL NULL NULL +150 1 150 0 1 +152 2 NULL NULL NULL +153 1 NULL NULL NULL +155 1 NULL NULL NULL +156 1 NULL NULL NULL +157 1 NULL NULL NULL +158 1 NULL NULL NULL +160 1 NULL NULL NULL +162 1 NULL NULL NULL +163 1 NULL NULL NULL +164 2 NULL NULL NULL +165 2 NULL NULL NULL +166 1 NULL NULL NULL +167 3 NULL NULL NULL +168 1 NULL NULL NULL +169 4 NULL NULL NULL +17 1 NULL NULL NULL +170 1 NULL NULL NULL +172 2 NULL NULL NULL +174 2 NULL NULL NULL +175 2 NULL NULL NULL +176 2 NULL NULL NULL +177 1 NULL NULL NULL +178 1 NULL NULL NULL +179 2 NULL NULL NULL +18 2 NULL NULL NULL +180 1 NULL NULL NULL +181 1 NULL NULL NULL +183 1 NULL NULL NULL +186 1 NULL NULL NULL +187 3 NULL NULL NULL +189 1 NULL NULL NULL +19 1 NULL NULL NULL +190 1 NULL NULL NULL +191 2 NULL NULL NULL +192 1 NULL NULL NULL +193 3 NULL NULL NULL +194 1 NULL NULL NULL +195 2 NULL NULL NULL +196 1 NULL NULL NULL +197 2 NULL NULL NULL +199 3 NULL NULL NULL +2 1 NULL NULL NULL +20 1 NULL NULL NULL +200 2 NULL NULL NULL +201 1 NULL NULL NULL +202 1 NULL NULL NULL +203 2 NULL NULL NULL +205 2 NULL NULL NULL +207 2 NULL NULL NULL +208 3 NULL NULL NULL +209 2 NULL NULL NULL +213 2 213 0 1 +214 1 NULL NULL NULL +216 2 NULL NULL NULL +217 2 NULL NULL NULL +218 1 NULL NULL NULL +219 2 NULL NULL NULL +221 2 NULL NULL NULL +222 1 NULL NULL NULL +223 2 
NULL NULL NULL +224 2 224 0 1 +226 1 NULL NULL NULL +228 1 NULL NULL NULL +229 2 NULL NULL NULL +230 5 NULL NULL NULL +233 2 NULL NULL NULL +235 1 NULL NULL NULL +237 2 NULL NULL NULL +238 2 238 0 1 +239 2 NULL NULL NULL +24 2 NULL NULL NULL +241 1 NULL NULL NULL +242 2 NULL NULL NULL +244 1 NULL NULL NULL +247 1 NULL NULL NULL +248 1 NULL NULL NULL +249 1 NULL NULL NULL +252 1 NULL NULL NULL +255 2 255 0 1 +256 2 NULL NULL NULL +257 1 NULL NULL NULL +258 1 NULL NULL NULL +26 2 NULL NULL NULL +260 1 NULL NULL NULL +262 1 NULL NULL NULL +263 1 NULL NULL NULL +265 2 NULL NULL NULL +266 1 NULL NULL NULL +27 1 NULL NULL NULL +272 2 NULL NULL NULL +273 3 273 0 1 +274 1 NULL NULL NULL +275 1 NULL NULL NULL +277 4 NULL NULL NULL +278 2 278 0 1 +28 1 NULL NULL NULL +280 2 NULL NULL NULL +281 2 NULL NULL NULL +282 2 NULL NULL NULL +283 1 NULL NULL NULL +284 1 NULL NULL NULL +285 1 NULL NULL NULL +286 1 NULL NULL NULL +287 1 NULL NULL NULL +288 2 NULL NULL NULL +289 1 NULL NULL NULL +291 1 NULL NULL NULL +292 1 NULL NULL NULL +296 1 NULL NULL NULL +298 3 NULL NULL NULL +30 1 NULL NULL NULL +302 1 NULL NULL NULL +305 1 NULL NULL NULL +306 1 NULL NULL NULL +307 2 NULL NULL NULL +308 1 NULL NULL NULL +309 2 NULL NULL NULL +310 1 NULL NULL NULL +311 3 311 0 1 +315 1 NULL NULL NULL +316 3 NULL NULL NULL +317 2 NULL NULL NULL +318 3 NULL NULL NULL +321 2 NULL NULL NULL +322 2 NULL NULL NULL +323 1 NULL NULL NULL +325 2 NULL NULL NULL +327 3 NULL NULL NULL +33 1 NULL NULL NULL +331 2 NULL NULL NULL +332 1 NULL NULL NULL +333 2 NULL NULL NULL +335 1 NULL NULL NULL +336 1 NULL NULL NULL +338 1 NULL NULL NULL +339 1 NULL NULL NULL +34 1 NULL NULL NULL +341 1 NULL NULL NULL +342 2 NULL NULL NULL +344 2 NULL NULL NULL +345 1 NULL NULL NULL +348 5 NULL NULL NULL +35 3 NULL NULL NULL +351 1 NULL NULL NULL +353 2 NULL NULL NULL +356 1 NULL NULL NULL +360 1 NULL NULL NULL +362 1 NULL NULL NULL +364 1 NULL NULL NULL +365 1 NULL NULL NULL +366 1 NULL NULL NULL +367 2 NULL NULL NULL +368 1 NULL NULL NULL +369 3 369 0 1 +37 2 NULL NULL NULL +373 1 NULL NULL NULL +374 1 NULL NULL NULL +375 1 NULL NULL NULL +377 1 NULL NULL NULL +378 1 NULL NULL NULL +379 1 NULL NULL NULL +382 2 NULL NULL NULL +384 3 NULL NULL NULL +386 1 NULL NULL NULL +389 1 NULL NULL NULL +392 1 NULL NULL NULL +393 1 NULL NULL NULL +394 1 NULL NULL NULL +395 2 NULL NULL NULL +396 3 NULL NULL NULL +397 2 NULL NULL NULL +399 2 NULL NULL NULL +4 1 NULL NULL NULL +400 1 NULL NULL NULL +401 5 401 0 1 +402 1 NULL NULL NULL +403 3 NULL NULL NULL +404 2 NULL NULL NULL +406 4 406 0 1 +407 1 NULL NULL NULL +409 3 NULL NULL NULL +41 1 NULL NULL NULL +411 1 NULL NULL NULL +413 2 NULL NULL NULL +414 2 NULL NULL NULL +417 3 NULL NULL NULL +418 1 NULL NULL NULL +419 1 NULL NULL NULL +42 2 NULL NULL NULL +421 1 NULL NULL NULL +424 2 NULL NULL NULL +427 1 NULL NULL NULL +429 2 NULL NULL NULL +43 1 NULL NULL NULL +430 3 NULL NULL NULL +431 3 NULL NULL NULL +432 1 NULL NULL NULL +435 1 NULL NULL NULL +436 1 NULL NULL NULL +437 1 NULL NULL NULL +438 3 NULL NULL NULL +439 2 NULL NULL NULL +44 1 NULL NULL NULL +443 1 NULL NULL NULL +444 1 NULL NULL NULL +446 1 NULL NULL NULL +448 1 NULL NULL NULL +449 1 NULL NULL NULL +452 1 NULL NULL NULL +453 1 NULL NULL NULL +454 3 NULL NULL NULL +455 1 NULL NULL NULL +457 1 NULL NULL NULL +458 2 NULL NULL NULL +459 2 NULL NULL NULL +460 1 NULL NULL NULL +462 2 NULL NULL NULL +463 2 NULL NULL NULL +466 3 NULL NULL NULL +467 1 NULL NULL NULL +468 4 NULL NULL NULL +469 5 NULL NULL NULL +47 1 NULL NULL NULL +470 1 NULL NULL NULL +472 1 
NULL NULL NULL +475 1 NULL NULL NULL +477 1 NULL NULL NULL +478 2 NULL NULL NULL +479 1 NULL NULL NULL +480 3 NULL NULL NULL +481 1 NULL NULL NULL +482 1 NULL NULL NULL +483 1 NULL NULL NULL +484 1 NULL NULL NULL +485 1 NULL NULL NULL +487 1 NULL NULL NULL +489 4 NULL NULL NULL +490 1 NULL NULL NULL +491 1 NULL NULL NULL +492 2 NULL NULL NULL +493 1 NULL NULL NULL +494 1 NULL NULL NULL +495 1 NULL NULL NULL +496 1 NULL NULL NULL +497 1 NULL NULL NULL +498 3 NULL NULL NULL +5 3 NULL NULL NULL +51 2 NULL NULL NULL +53 1 NULL NULL NULL +54 1 NULL NULL NULL +57 1 NULL NULL NULL +58 2 NULL NULL NULL +64 1 NULL NULL NULL +65 1 NULL NULL NULL +66 1 66 0 1 +67 2 NULL NULL NULL +69 1 NULL NULL NULL +70 3 NULL NULL NULL +72 2 NULL NULL NULL +74 1 NULL NULL NULL +76 2 NULL NULL NULL +77 1 NULL NULL NULL +78 1 NULL NULL NULL +8 1 NULL NULL NULL +80 1 NULL NULL NULL +82 1 NULL NULL NULL +83 2 NULL NULL NULL +84 2 NULL NULL NULL +85 1 NULL NULL NULL +86 1 NULL NULL NULL +87 1 NULL NULL NULL +9 1 NULL NULL NULL +90 3 NULL NULL NULL +92 1 NULL NULL NULL +95 2 NULL NULL NULL +96 1 NULL NULL NULL +97 2 NULL NULL NULL +98 2 98 0 1 +NULL NULL val_146 1 0 +NULL NULL val_150 1 0 +NULL NULL val_165 1 0 +NULL NULL val_193 1 0 +NULL NULL val_213 1 0 +NULL NULL val_238 1 0 +NULL NULL val_255 1 0 +NULL NULL val_265 1 0 +NULL NULL val_27 1 0 +NULL NULL val_273 1 0 +NULL NULL val_278 1 0 +NULL NULL val_311 1 0 +NULL NULL val_401 1 0 +NULL NULL val_406 1 0 +NULL NULL val_409 1 0 +NULL NULL val_484 1 0 +NULL NULL val_66 1 0 +NULL NULL val_98 1 0 Index: ql/src/test/results/clientpositive/nullgroup4.q.out =================================================================== --- ql/src/test/results/clientpositive/nullgroup4.q.out (revision 1026947) +++ ql/src/test/results/clientpositive/nullgroup4.q.out (working copy) @@ -60,7 +60,7 @@ Group By Operator aggregations: expr: count(VALUE._col0) - expr: count(DISTINCT KEY._col0) + expr: count(DISTINCT KEY._col0:0._col0) bucketGroup: false mode: partials outputColumnNames: _col0, _col1 @@ -177,7 +177,7 @@ Group By Operator aggregations: expr: count(VALUE._col0) - expr: count(DISTINCT KEY._col0) + expr: count(DISTINCT KEY._col0:0._col0) bucketGroup: false mode: mergepartial outputColumnNames: _col0, _col1 @@ -259,7 +259,7 @@ Group By Operator aggregations: expr: count(VALUE._col0) - expr: count(DISTINCT KEY._col0) + expr: count(DISTINCT KEY._col0:0._col0) bucketGroup: false mode: partial1 outputColumnNames: _col0, _col1 @@ -364,7 +364,7 @@ Group By Operator aggregations: expr: count(VALUE._col0) - expr: count(DISTINCT KEY._col0) + expr: count(DISTINCT KEY._col0:0._col0) bucketGroup: false mode: complete outputColumnNames: _col0, _col1 Index: ql/src/test/results/clientpositive/nullgroup4_multi_distinct.q.out =================================================================== --- ql/src/test/results/clientpositive/nullgroup4_multi_distinct.q.out (revision 0) +++ ql/src/test/results/clientpositive/nullgroup4_multi_distinct.q.out (revision 0) @@ -0,0 +1,366 @@ +PREHOOK: query: explain +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src x)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1)) (TOK_SELEXPR (TOK_FUNCTIONDI count (. 
(TOK_TABLE_OR_COL x) value))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL x) value) 5)))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL x) key) 9999)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Filter Operator + predicate: + expr: (key = 9999) + type: boolean + Filter Operator + predicate: + expr: (key = 9999) + type: boolean + Select Operator + expressions: + expr: value + type: string + outputColumnNames: value + Group By Operator + aggregations: + expr: count(1) + expr: count(DISTINCT value) + expr: count(DISTINCT substr(value, 5)) + bucketGroup: false + keys: + expr: value + type: string + expr: substr(value, 5) + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + tag: -1 + value expressions: + expr: _col2 + type: bigint + expr: _col3 + type: bigint + expr: _col4 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + expr: count(DISTINCT KEY._col0:0._col0) + expr: count(DISTINCT KEY._col0:1._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: bigint + expr: _col1 + type: bigint + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-05-42_907_6792276248599421191/-mr-10000 +POSTHOOK: query: select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-05-42_907_6792276248599421191/-mr-10000 +0 0 0 +PREHOOK: query: explain +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src x)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1)) (TOK_SELEXPR (TOK_FUNCTIONDI count (. (TOK_TABLE_OR_COL x) value))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL x) value) 5)))) (TOK_WHERE (= (. 
(TOK_TABLE_OR_COL x) key) 9999)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Filter Operator + predicate: + expr: (key = 9999) + type: boolean + Filter Operator + predicate: + expr: (key = 9999) + type: boolean + Select Operator + expressions: + expr: value + type: string + outputColumnNames: value + Group By Operator + aggregations: + expr: count(1) + expr: count(DISTINCT value) + expr: count(DISTINCT substr(value, 5)) + bucketGroup: false + keys: + expr: value + type: string + expr: substr(value, 5) + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + tag: -1 + value expressions: + expr: _col2 + type: bigint + expr: _col3 + type: bigint + expr: _col4 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + expr: count(DISTINCT KEY._col0:0._col0) + expr: count(DISTINCT KEY._col0:1._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: bigint + expr: _col1 + type: bigint + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-05-45_595_476394043954943052/-mr-10000 +POSTHOOK: query: select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-05-45_595_476394043954943052/-mr-10000 +0 0 0 +PREHOOK: query: explain +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src x)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1)) (TOK_SELEXPR (TOK_FUNCTIONDI count (. (TOK_TABLE_OR_COL x) value))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL x) value) 5)))) (TOK_WHERE (= (. 
(TOK_TABLE_OR_COL x) key) 9999)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Filter Operator + predicate: + expr: (key = 9999) + type: boolean + Filter Operator + predicate: + expr: (key = 9999) + type: boolean + Select Operator + expressions: + expr: value + type: string + outputColumnNames: value + Reduce Output Operator + key expressions: + expr: value + type: string + expr: substr(value, 5) + type: string + sort order: ++ + tag: -1 + value expressions: + expr: 1 + type: int + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + expr: count(DISTINCT KEY._col0:0._col0) + expr: count(DISTINCT KEY._col0:1._col0) + bucketGroup: false + mode: complete + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: bigint + expr: _col1 + type: bigint + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-05-48_033_2307493078145631854/-mr-10000 +POSTHOOK: query: select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-05-48_033_2307493078145631854/-mr-10000 +0 0 0 +PREHOOK: query: explain +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src x)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1)) (TOK_SELEXPR (TOK_FUNCTIONDI count (. (TOK_TABLE_OR_COL x) value))) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL x) value) 5)))) (TOK_WHERE (= (. 
(TOK_TABLE_OR_COL x) key) 9999)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x + TableScan + alias: x + Filter Operator + predicate: + expr: (key = 9999) + type: boolean + Filter Operator + predicate: + expr: (key = 9999) + type: boolean + Select Operator + expressions: + expr: value + type: string + outputColumnNames: value + Reduce Output Operator + key expressions: + expr: value + type: string + expr: substr(value, 5) + type: string + sort order: ++ + tag: -1 + value expressions: + expr: 1 + type: int + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + expr: count(DISTINCT KEY._col0:0._col0) + expr: count(DISTINCT KEY._col0:1._col0) + bucketGroup: false + mode: complete + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: bigint + expr: _col1 + type: bigint + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-05-50_428_9039696963657316865/-mr-10000 +POSTHOOK: query: select count(1), count(distinct x.value), count(distinct substr(x.value, 5)) from src x where x.key = 9999 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/tmp/amarsri/hive_2010-10-20_03-05-50_428_9039696963657316865/-mr-10000 +0 0 0 Index: ql/src/test/results/clientpositive/udf_count.q.out =================================================================== --- ql/src/test/results/clientpositive/udf_count.q.out (revision 1026947) +++ ql/src/test/results/clientpositive/udf_count.q.out (working copy) @@ -124,7 +124,7 @@ Reduce Operator Tree: Group By Operator aggregations: - expr: count(DISTINCT KEY._col0) + expr: count(DISTINCT KEY._col0:0._col0) bucketGroup: false mode: mergepartial outputColumnNames: _col0 @@ -204,7 +204,7 @@ Reduce Operator Tree: Group By Operator aggregations: - expr: count(DISTINCT KEY._col0, KEY._col1) + expr: count(DISTINCT KEY._col0:0._col0, KEY._col0:0._col1) bucketGroup: false mode: mergepartial outputColumnNames: _col0 Index: ql/src/test/results/compiler/plan/groupby1.q.xml =================================================================== --- ql/src/test/results/compiler/plan/groupby1.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/groupby1.q.xml (working copy) @@ -270,6 +270,9 @@ + + + @@ -306,6 +309,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/groupby2.q.xml =================================================================== --- ql/src/test/results/compiler/plan/groupby2.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/groupby2.q.xml (working copy) @@ -132,31 +132,38 @@ - - KEY._col1 - - - _col1 - - - - - - - - - + + + + + + 1 + + + + + - + + + _col1 + + + + + + + + @@ -183,12 +190,15 @@ columns.types - string,string + string,uniontype<struct<_col0:string>> + + 1 + -1 @@ -334,7 +344,7 @@ - KEY._col1 + KEY._col1:0._col0 @@ -1190,7 +1200,7 @@ _col1 - + _col1 @@ -1204,7 +1214,7 @@ _col0 - + _col0 @@ -1223,10 +1233,10 @@ - + - + @@ -1329,7 +1339,7 
@@ _col0 - + KEY._col0 @@ -1365,7 +1375,7 @@ - KEY._col1 + KEY._col1:0._col0 @@ -1410,7 +1420,7 @@ - + Index: ql/src/test/results/compiler/plan/groupby3.q.xml =================================================================== --- ql/src/test/results/compiler/plan/groupby3.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/groupby3.q.xml (working copy) @@ -113,33 +113,39 @@ - - - KEY._col0 - - - _col0 - - - - - - - - string + + + + + + + + + + 0 - - - - - + + + _col0 + + + + + + + + string + + + + @@ -166,7 +172,7 @@ columns.types - string + uniontype<struct<_col0:string>> @@ -369,7 +375,7 @@ - KEY._col0 + KEY._col0:0._col0 @@ -1356,7 +1362,7 @@ _col4 - + _col4 @@ -1370,7 +1376,7 @@ _col3 - + _col3 @@ -1384,7 +1390,7 @@ _col2 - + _col2 @@ -1398,7 +1404,7 @@ _col1 - + _col1 @@ -1412,7 +1418,7 @@ _col0 - + _col0 @@ -1431,9 +1437,6 @@ - - - @@ -1445,6 +1448,9 @@ + + + @@ -1655,7 +1661,7 @@ - KEY._col0 + KEY._col0:0._col0 Index: ql/src/test/results/compiler/plan/groupby4.q.xml =================================================================== --- ql/src/test/results/compiler/plan/groupby4.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/groupby4.q.xml (working copy) @@ -136,6 +136,9 @@ + + + @@ -172,6 +175,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/groupby5.q.xml =================================================================== --- ql/src/test/results/compiler/plan/groupby5.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/groupby5.q.xml (working copy) @@ -136,6 +136,9 @@ + + + @@ -172,6 +175,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/groupby6.q.xml =================================================================== --- ql/src/test/results/compiler/plan/groupby6.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/groupby6.q.xml (working copy) @@ -136,6 +136,9 @@ + + + @@ -172,6 +175,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/input20.q.xml =================================================================== --- ql/src/test/results/compiler/plan/input20.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/input20.q.xml (working copy) @@ -144,6 +144,9 @@ + + + @@ -187,6 +190,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/input4.q.xml =================================================================== --- ql/src/test/results/compiler/plan/input4.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/input4.q.xml (working copy) @@ -278,6 +278,9 @@ + + + @@ -321,6 +324,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/input5.q.xml =================================================================== --- ql/src/test/results/compiler/plan/input5.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/input5.q.xml (working copy) @@ -282,6 +282,9 @@ + + + @@ -325,6 +328,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/join1.q.xml =================================================================== --- ql/src/test/results/compiler/plan/join1.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/join1.q.xml (working copy) @@ -349,6 +349,9 @@ + + + @@ -395,6 +398,9 @@ + + 1 + -1 @@ -624,6 +630,9 @@ + + + @@ -670,6 +679,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/join2.q.xml =================================================================== --- ql/src/test/results/compiler/plan/join2.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/join2.q.xml (working copy) @@ -273,6 +273,9 @@ + + + @@ -360,6 +363,9 @@ + + 1 + -1 @@ -542,6 +548,9 
@@ + + + @@ -609,6 +618,9 @@ + + 1 + -1 @@ -1657,6 +1669,9 @@ + + + @@ -1703,6 +1718,9 @@ + + 1 + -1 @@ -1925,6 +1943,9 @@ + + + @@ -1971,6 +1992,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/join3.q.xml =================================================================== --- ql/src/test/results/compiler/plan/join3.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/join3.q.xml (working copy) @@ -417,6 +417,9 @@ + + + @@ -467,6 +470,9 @@ + + 1 + -1 @@ -674,6 +680,9 @@ + + + @@ -720,6 +729,9 @@ + + 1 + -1 @@ -945,6 +957,9 @@ + + + @@ -991,6 +1006,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/join4.q.xml =================================================================== --- ql/src/test/results/compiler/plan/join4.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/join4.q.xml (working copy) @@ -235,6 +235,9 @@ + + + @@ -278,6 +281,9 @@ + + 1 + -1 @@ -927,6 +933,9 @@ + + + @@ -970,6 +979,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/join5.q.xml =================================================================== --- ql/src/test/results/compiler/plan/join5.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/join5.q.xml (working copy) @@ -235,6 +235,9 @@ + + + @@ -278,6 +281,9 @@ + + 1 + -1 @@ -927,6 +933,9 @@ + + + @@ -970,6 +979,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/join6.q.xml =================================================================== --- ql/src/test/results/compiler/plan/join6.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/join6.q.xml (working copy) @@ -235,6 +235,9 @@ + + + @@ -278,6 +281,9 @@ + + 1 + -1 @@ -927,6 +933,9 @@ + + + @@ -970,6 +979,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/join7.q.xml =================================================================== --- ql/src/test/results/compiler/plan/join7.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/join7.q.xml (working copy) @@ -322,6 +322,9 @@ + + + @@ -365,6 +368,9 @@ + + 1 + -1 @@ -1014,6 +1020,9 @@ + + + @@ -1057,6 +1066,9 @@ + + 1 + -1 @@ -1697,6 +1709,9 @@ + + + @@ -1740,6 +1755,9 @@ + + 1 + -1 Index: ql/src/test/results/compiler/plan/join8.q.xml =================================================================== --- ql/src/test/results/compiler/plan/join8.q.xml (revision 1026947) +++ ql/src/test/results/compiler/plan/join8.q.xml (working copy) @@ -235,6 +235,9 @@ + + + @@ -278,6 +281,9 @@ + + 1 + -1 @@ -927,6 +933,9 @@ + + + @@ -970,6 +979,9 @@ + + 1 + -1
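
Note (illustration only, not part of the patch): the plan diffs above replace reduce-side references such as KEY._col1 with tag-qualified names like KEY._col1:0._col0, and the updated compiler-plan XML (e.g. groupby2.q.xml, groupby3.q.xml) records the distinct key column as uniontype<struct<_col0:string>>. The short Java sketch below uses Hive's standard object inspectors to show how such a union-typed key column can be modelled and how a tag-qualified value is read back. The class name, method name, and sample values are made up for the example; only the serde2 object-inspector APIs are real.

import java.util.Arrays;

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class DistinctKeyUnionSketch {

  // One struct per distinct-expression group; the union tag selects the group.
  // Here there is a single group with one string column, _col0.
  static UnionObjectInspector buildDistinctKeyOI() {
    ObjectInspector distinct0 = ObjectInspectorFactory.getStandardStructObjectInspector(
        Arrays.asList("_col0"),
        Arrays.<ObjectInspector>asList(
            PrimitiveObjectInspectorFactory.javaStringObjectInspector));
    return ObjectInspectorFactory.getStandardUnionObjectInspector(
        Arrays.<ObjectInspector>asList(distinct0));
  }

  public static void main(String[] args) {
    UnionObjectInspector keyOI = buildDistinctKeyOI();

    // Should print uniontype<struct<_col0:string>>, the columns.types value
    // that appears in the updated groupby2.q.xml hunk above.
    System.out.println(keyOI.getTypeName());

    // A key value carries the union tag (0 here) plus the struct data, which
    // is what a name like KEY._colX:0._col0 addresses on the reduce side.
    Object keyValue = new StandardUnion((byte) 0, Arrays.asList("val_0"));
    System.out.println(keyOI.getTag(keyValue) + " -> " + keyOI.getField(keyValue));
  }
}

Under these assumptions, the printed type name matches the plan XML diffs, and the tag/field pair shows the two pieces of information that the _colX:<tag>._colY notation in the new golden outputs refers to.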