diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java index 4bdd3c9..2a46b30 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java @@ -45,6 +45,7 @@ import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.hive.serde2.Deserializer; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeStats; @@ -178,7 +179,6 @@ private MapOpCtx initObjectInspector(Configuration hconf, MapOpCtx opCtx, SerDeUtils.createOverlayedProperties(td.getProperties(), pd.getProperties()); Map partSpec = pd.getPartSpec(); - opCtx.tableName = String.valueOf(overlayedProps.getProperty("name")); opCtx.partName = String.valueOf(partSpec); opCtx.deserializer = pd.getDeserializer(hconf); @@ -279,19 +279,20 @@ private MapOpCtx initObjectInspector(Configuration hconf, MapOpCtx opCtx, * and P1's schema is same as T, whereas P2's scheme is different from T, conversion * might be needed for both P1 and P2, since SettableOI might be needed for T */ - private Map getConvertedOI(Configuration hconf) + private Map getConvertedOI(Map tableToConf) throws HiveException { Map tableDescOI = new HashMap(); Set identityConverterTableDesc = new HashSet(); + try { Map oiSettableProperties = new HashMap(); for (Path onefile : conf.getPathToAliases().keySet()) { PartitionDesc pd = conf.getPathToPartitionInfo().get(onefile); TableDesc tableDesc = pd.getTableDesc(); + Configuration hconf = tableToConf.get(tableDesc.getTableName()); Deserializer partDeserializer = pd.getDeserializer(hconf); - StructObjectInspector partRawRowObjectInspector; boolean isAcid = AcidUtils.isTablePropertyTransactional(tableDesc.getProperties()); if (Utilities.isSchemaEvolutionEnabled(hconf, isAcid) && Utilities.isInputFileFormatSelfDescribing(pd)) { @@ -329,6 +330,58 @@ else if (partRawRowObjectInspector.equals(tblRawRowObjectInspector)) { return tableDescOI; } + /** + * For each source table, combine the nested column pruning information from all its + * table scan descriptors and set it in a configuration copy. This is necessary since + * the configuration property "READ_NESTED_COLUMN_PATH_CONF_STR" is set on a per-table + * basis, so we can't just use a single configuration for all the tables. 
+ */ + private Map cloneConfsForNestedColPruning(Configuration hconf) { + Map tableNameToConf = new HashMap<>(); + + for (Map.Entry> e : conf.getPathToAliases().entrySet()) { + List aliases = e.getValue(); + if (aliases == null || aliases.isEmpty()) { + continue; + } + + String tableName = conf.getPathToPartitionInfo().get(e.getKey()).getTableName(); + for (String alias: aliases) { + Operator rootOp = conf.getAliasToWork().get(alias); + if (!(rootOp instanceof TableScanOperator)) { + continue; + } + TableScanDesc tableScanDesc = ((TableScanOperator) rootOp).getConf(); + List nestedColumnPaths = tableScanDesc.getNeededNestedColumnPaths(); + if (nestedColumnPaths == null || nestedColumnPaths.isEmpty()) { + continue; + } + if (!tableNameToConf.containsKey(tableName)) { + Configuration clonedConf = new Configuration(hconf); + clonedConf.unset(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR); + tableNameToConf.put(tableName, clonedConf); + } + Configuration newConf = tableNameToConf.get(tableName); + ColumnProjectionUtils.appendNestedColumnPaths(newConf, nestedColumnPaths); + } + } + + // Assign tables without nested column pruning info to the default conf + for (PartitionDesc pd : conf.getPathToPartitionInfo().values()) { + if (!tableNameToConf.containsKey(pd.getTableName())) { + tableNameToConf.put(pd.getTableName(), hconf); + } + } + + for (PartitionDesc pd: conf.getAliasToPartnInfo().values()) { + if (!tableNameToConf.containsKey(pd.getTableName())) { + tableNameToConf.put(pd.getTableName(), hconf); + } + } + + return tableNameToConf; + } + /* * This is the same as the setChildren method below but for empty tables. * It takes care of the following: @@ -339,15 +392,19 @@ else if (partRawRowObjectInspector.equals(tblRawRowObjectInspector)) { public void initEmptyInputChildren(List> children, Configuration hconf) throws SerDeException, Exception { setChildOperators(children); + + Map tableNameToConf = cloneConfsForNestedColPruning(hconf); + for (Operator child : children) { TableScanOperator tsOp = (TableScanOperator) child; StructObjectInspector soi = null; PartitionDesc partDesc = conf.getAliasToPartnInfo().get(tsOp.getConf().getAlias()); + Configuration newConf = tableNameToConf.get(partDesc.getTableDesc().getTableName()); Deserializer serde = partDesc.getTableDesc().getDeserializer(); partDesc.setProperties(partDesc.getProperties()); MapOpCtx opCtx = new MapOpCtx(tsOp.getConf().getAlias(), child, partDesc); StructObjectInspector tableRowOI = (StructObjectInspector) serde.getObjectInspector(); - initObjectInspector(hconf, opCtx, tableRowOI); + initObjectInspector(newConf, opCtx, tableRowOI); soi = opCtx.rowObjectInspector; child.getParentOperators().add(this); childrenOpToOI.put(child, soi); @@ -359,12 +416,15 @@ public void setChildren(Configuration hconf) throws Exception { List> children = new ArrayList>(); - Map convertedOI = getConvertedOI(hconf); + Map tableNameToConf = cloneConfsForNestedColPruning(hconf); + Map convertedOI = getConvertedOI(tableNameToConf); for (Map.Entry> entry : conf.getPathToAliases().entrySet()) { Path onefile = entry.getKey(); List aliases = entry.getValue(); PartitionDesc partDesc = conf.getPathToPartitionInfo().get(onefile); + TableDesc tableDesc = partDesc.getTableDesc(); + Configuration newConf = tableNameToConf.get(tableDesc.getTableName()); for (String alias : aliases) { Operator op = conf.getAliasToWork().get(alias); @@ -381,7 +441,7 @@ public void setChildren(Configuration hconf) throws Exception { } MapOpCtx context = new MapOpCtx(alias, op, 
partDesc); StructObjectInspector tableRowOI = convertedOI.get(partDesc.getTableDesc()); - contexts.put(op, initObjectInspector(hconf, context, tableRowOI)); + contexts.put(op, initObjectInspector(newConf, context, tableRowOI)); if (children.contains(op) == false) { op.setParentOperators(new ArrayList>(1)); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java index 16064b2..efce46f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java @@ -228,7 +228,7 @@ public static MessageType getProjectedSchema( MessageType schema, List colNames, List colIndexes, - List nestedColumnPaths) { + Set nestedColumnPaths) { List schemaTypes = new ArrayList(); Map prunedCols = getPrunedNestedColumns(nestedColumnPaths); @@ -236,7 +236,8 @@ public static MessageType getProjectedSchema( if (i < colNames.size()) { if (i < schema.getFieldCount()) { Type t = schema.getType(i); - if (!prunedCols.containsKey(t.getName())) { + String tn = t.getName().toLowerCase(); + if (!prunedCols.containsKey(tn)) { schemaTypes.add(schema.getType(i)); } else { if (t.isPrimitive()) { @@ -245,7 +246,7 @@ public static MessageType getProjectedSchema( } else { // For group type, we need to build the projected group type with required leaves List g = - projectLeafTypes(Arrays.asList(t), Arrays.asList(prunedCols.get(t.getName()))); + projectLeafTypes(Arrays.asList(t), Arrays.asList(prunedCols.get(tn))); if (!g.isEmpty()) { schemaTypes.addAll(g); } @@ -264,20 +265,19 @@ public static MessageType getProjectedSchema( /** * Return the columns which contains required nested attribute level - * e.g. - * Given struct a and a is required while y is not, so the method will return a - * who contains the attribute x + * E.g., given struct a: while 'x' is required and 'y' is not, the method will return + * a pruned struct for 'a' which only contains the attribute 'x' * * @param nestedColPaths the paths for required nested attribute - * @return column list contains required nested attribute + * @return a map from the column to its selected nested column paths, of which the keys are all lower-cased. 
*/ - private static Map getPrunedNestedColumns(List nestedColPaths) { + private static Map getPrunedNestedColumns(Set nestedColPaths) { Map resMap = new HashMap<>(); if (nestedColPaths.isEmpty()) { return resMap; } for (String s : nestedColPaths) { - String c = StringUtils.split(s, '.')[0]; + String c = StringUtils.split(s, '.')[0].toLowerCase(); if (!resMap.containsKey(c)) { FieldNode f = NestedColumnFieldPruningUtils.addNodeByPath(null, s); resMap.put(c, f); @@ -309,7 +309,7 @@ private static GroupType buildProjectedGroupType( fieldMap.put(n.getFieldName(), n); } for (Type type : types) { - String tn = type.getName(); + String tn = type.getName().toLowerCase(); if (fieldMap.containsKey(tn)) { FieldNode f = fieldMap.get(tn); @@ -373,7 +373,7 @@ private static GroupType buildProjectedGroupType( contextMetadata.put(PARQUET_COLUMN_INDEX_ACCESS, String.valueOf(indexAccess)); this.hiveTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNamesList, columnTypesList); - List groupPaths = ColumnProjectionUtils.getNestedColumnPaths(configuration); + Set groupPaths = ColumnProjectionUtils.getNestedColumnPaths(configuration); List indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration); if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) { MessageType requestedSchemaByUser = getProjectedSchema(tableSchema, columnNamesList, diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java index 8df0cc1..9d85652 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java @@ -72,14 +72,20 @@ public ArrayWritableObjectInspector(boolean isRoot, final String name = fieldNames.get(i); final TypeInfo fieldInfo = fieldInfos.get(i); - StructFieldImpl field; - if (prunedTypeInfo != null && prunedTypeInfo.getAllStructFieldNames().indexOf(name) >= 0) { - int adjustedIndex = prunedTypeInfo.getAllStructFieldNames().indexOf(name); - TypeInfo prunedFieldInfo = prunedTypeInfo.getAllStructFieldTypeInfos().get(adjustedIndex); - field = new StructFieldImpl(name, getObjectInspector(fieldInfo, prunedFieldInfo), i, adjustedIndex); - } else { + StructFieldImpl field = null; + if (prunedTypeInfo != null) { + for (int idx = 0; idx < prunedTypeInfo.getAllStructFieldNames().size(); ++idx) { + if (prunedTypeInfo.getAllStructFieldNames().get(idx).equalsIgnoreCase(name)) { + TypeInfo prunedFieldInfo = prunedTypeInfo.getAllStructFieldTypeInfos().get(idx); + field = new StructFieldImpl(name, getObjectInspector(fieldInfo, prunedFieldInfo), i, idx); + break; + } + } + } + if (field == null) { field = new StructFieldImpl(name, getObjectInspector(fieldInfo, null), i, i); } + fields.add(field); fieldsByName.put(name.toLowerCase(), field); } @@ -222,29 +228,47 @@ public Object setStructFieldData(Object struct, StructField field, Object fieldV } @Override - public boolean equals(Object obj) { - if (obj == null) { + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + ArrayWritableObjectInspector that = (ArrayWritableObjectInspector) o; + + if (isRoot != that.isRoot) { + return false; + } + if (typeInfo != null ? 
!typeInfo.equals(that.typeInfo) : that.typeInfo != null) { return false; } - if (getClass() != obj.getClass()) { + if (fieldInfos != null ? !fieldInfos.equals(that.fieldInfos) : that.fieldInfos != null) { return false; } - final ArrayWritableObjectInspector other = (ArrayWritableObjectInspector) obj; - if (this.typeInfo != other.typeInfo && (this.typeInfo == null || !this.typeInfo.equals(other.typeInfo))) { + if (fieldNames != null ? !fieldNames.equals(that.fieldNames) : that.fieldNames != null) { return false; } - return true; + if (fields != null ? !fields.equals(that.fields) : that.fields != null) { + return false; + } + return fieldsByName != null ? fieldsByName.equals(that.fieldsByName) : that.fieldsByName == null; + } @Override public int hashCode() { - int hash = 5; - hash = 29 * hash + (this.typeInfo != null ? this.typeInfo.hashCode() : 0); - return hash; + int result = typeInfo != null ? typeInfo.hashCode() : 0; + result = 31 * result + (fieldInfos != null ? fieldInfos.hashCode() : 0); + result = 31 * result + (fieldNames != null ? fieldNames.hashCode() : 0); + result = 31 * result + (fields != null ? fields.hashCode() : 0); + result = 31 * result + (fieldsByName != null ? fieldsByName.hashCode() : 0); + result = 31 * result + (isRoot ? 1 : 0); + return result; } private class StructFieldImpl implements StructField { - private final String name; private final ObjectInspector inspector; private final int index; @@ -288,5 +312,37 @@ public ObjectInspector getFieldObjectInspector() { public int getFieldID() { return index; } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + StructFieldImpl that = (StructFieldImpl) o; + + if (index != that.index) { + return false; + } + if (adjustedIndex != that.adjustedIndex) { + return false; + } + if (name != null ? !name.equals(that.name) : that.name != null) { + return false; + } + return inspector != null ? inspector.equals(that.inspector) : that.inspector == null; + } + + @Override + public int hashCode() { + int result = name != null ? name.hashCode() : 0; + result = 31 * result + (inspector != null ? 
inspector.hashCode() : 0); + result = 31 * result + index; + result = 31 * result + adjustedIndex; + return result; + } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java index ef79760..a124938 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java @@ -22,6 +22,7 @@ import com.google.common.base.Preconditions; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.optimizer.FieldNode; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.AbstractSerDe; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; @@ -116,8 +117,9 @@ public final void initialize(final Configuration conf, final Properties tbl) thr (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); StructTypeInfo prunedTypeInfo = null; if (conf != null) { - String prunedColumnPaths = conf.get(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR); - if (prunedColumnPaths != null) { + String rawPrunedColumnPaths = conf.get(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR); + if (rawPrunedColumnPaths != null) { + List prunedColumnPaths = processRawPrunedPaths(rawPrunedColumnPaths); prunedTypeInfo = pruneFromPaths(completeTypeInfo, prunedColumnPaths); } } @@ -177,30 +179,43 @@ public SerDeStats getSerDeStats() { } /** + * Given a list of raw pruned paths separated by ',', return a list of merged pruned paths. + * For instance, if the 'prunedPaths' is "s.a, s, s", this returns ["s"]. + */ + private static List processRawPrunedPaths(String prunedPaths) { + List fieldNodes = new ArrayList<>(); + for (String p : prunedPaths.split(",")) { + fieldNodes = FieldNode.mergeFieldNodes(fieldNodes, FieldNode.fromPath(p)); + } + List prunedPathList = new ArrayList<>(); + for (FieldNode fn : fieldNodes) { + prunedPathList.addAll(fn.toPaths()); + } + return prunedPathList; + } + + /** * Given a complete struct type info and pruned paths containing selected fields * from the type info, return a pruned struct type info only with the selected fields. 
* * For instance, if 'originalTypeInfo' is: s:struct, d:string> - * and 'prunedPaths' is "s.a.b,s.d", then the result will be: + * and 'prunedPaths' is ["s.a.b,s.d"], then the result will be: * s:struct, d:string> * * @param originalTypeInfo the complete struct type info * @param prunedPaths a string representing the pruned paths, separated by ',' * @return the pruned struct type info */ - private StructTypeInfo pruneFromPaths( - StructTypeInfo originalTypeInfo, String prunedPaths) { + private static StructTypeInfo pruneFromPaths( + StructTypeInfo originalTypeInfo, List prunedPaths) { PrunedStructTypeInfo prunedTypeInfo = new PrunedStructTypeInfo(originalTypeInfo); - - String[] prunedPathList = prunedPaths.split(","); - for (String path : prunedPathList) { + for (String path : prunedPaths) { pruneFromSinglePath(prunedTypeInfo, path); } - return prunedTypeInfo.prune(); } - private void pruneFromSinglePath(PrunedStructTypeInfo prunedInfo, String path) { + private static void pruneFromSinglePath(PrunedStructTypeInfo prunedInfo, String path) { Preconditions.checkArgument(prunedInfo != null, "PrunedStructTypeInfo for path " + path + " should not be null"); @@ -212,7 +227,7 @@ private void pruneFromSinglePath(PrunedStructTypeInfo prunedInfo, String path) { String fieldName = path.substring(0, index); prunedInfo.markSelected(fieldName); if (index < path.length()) { - pruneFromSinglePath(prunedInfo.children.get(fieldName), path.substring(index + 1)); + pruneFromSinglePath(prunedInfo.getChild(fieldName), path.substring(index + 1)); } } @@ -228,16 +243,22 @@ private void pruneFromSinglePath(PrunedStructTypeInfo prunedInfo, String path) { for (int i = 0; i < typeInfo.getAllStructFieldTypeInfos().size(); ++i) { TypeInfo ti = typeInfo.getAllStructFieldTypeInfos().get(i); if (ti.getCategory() == Category.STRUCT) { - this.children.put(typeInfo.getAllStructFieldNames().get(i), + this.children.put(typeInfo.getAllStructFieldNames().get(i).toLowerCase(), new PrunedStructTypeInfo((StructTypeInfo) ti)); } } } + PrunedStructTypeInfo getChild(String fieldName) { + return children.get(fieldName.toLowerCase()); + } + void markSelected(String fieldName) { - int index = typeInfo.getAllStructFieldNames().indexOf(fieldName); - if (index >= 0) { - selected[index] = true; + for (int i = 0; i < typeInfo.getAllStructFieldNames().size(); ++i) { + if (typeInfo.getAllStructFieldNames().get(i).equalsIgnoreCase(fieldName)) { + selected[i] = true; + break; + } } } @@ -250,8 +271,8 @@ StructTypeInfo prune() { String fn = oldNames.get(i); if (selected[i]) { newNames.add(fn); - if (children.containsKey(fn)) { - newTypes.add(children.get(fn).prune()); + if (children.containsKey(fn.toLowerCase())) { + newTypes.add(children.get(fn.toLowerCase()).prune()); } else { newTypes.add(oldTypes.get(i)); } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcCtx.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcCtx.java index 4364298..e9af7a7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcCtx.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcCtx.java @@ -19,11 +19,10 @@ package org.apache.hadoop.hive.ql.optimizer; import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; import org.apache.hadoop.hive.ql.exec.ColumnInfo; import org.apache.hadoop.hive.ql.exec.CommonJoinOperator; @@ -34,111 +33,78 @@ import 
org.apache.hadoop.hive.ql.exec.RowSchema; import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.UnionOperator; -import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.SelectDesc; -import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +import static org.apache.hadoop.hive.ql.optimizer.FieldNode.mergeFieldNodes; /** * This class implements the processor context for Column Pruner. */ public class ColumnPrunerProcCtx implements NodeProcessorCtx { - private final ParseContext pctx; - private final Map, List> prunedColLists; - /** - * This map stores the pruned nested column path for each operator + * A mapping from operators to nested column paths being used in them. + * Note: paths are of format "s.a.b" which represents field "b" of + * struct "a" is being used, while "a" itself is a field of struct "s". */ - private final Map, List> prunedNestedColLists; - - private final Map>> joinPrunedColLists; - - private final Map> unionPrunedColLists; + private final Map, List> prunedColLists; + private final Map>> joinPrunedColLists; public ColumnPrunerProcCtx(ParseContext pctx) { this.pctx = pctx; - prunedColLists = new HashMap, List>(); - prunedNestedColLists = new HashMap, List>(); - joinPrunedColLists = new HashMap>>(); - unionPrunedColLists = new HashMap<>(); + prunedColLists = new HashMap<>(); + joinPrunedColLists = new HashMap<>(); } public ParseContext getParseContext() { return pctx; } - public Map>> getJoinPrunedColLists() { + public Map>> getJoinPrunedColLists() { return joinPrunedColLists; } - public Map> getUnionPrunedColLists() { - return unionPrunedColLists; - } - - /** - * @return the prunedColLists - */ - public List getPrunedColList(Operator op) { + public List getPrunedColList(Operator op) { return prunedColLists.get(op); } - public Map, List> getPrunedColLists() { + public Map, List> getPrunedColLists() { return prunedColLists; } - public Map, List> getPrunedNestedColLists() { - return prunedNestedColLists; - } - /** - * Creates the list of internal column names(these names are used in the - * RowResolver and are different from the external column names) that are - * needed in the subtree. These columns eventually have to be selected from - * the table scan. + * Creates the list of internal column names(represented by field nodes, + * these names are used in the RowResolver and are different from the + * external column names) that are needed in the subtree. These columns + * eventually have to be selected from the table scan. * - * @param curOp - * The root of the operator subtree. - * @return List of the internal column names. - * @throws SemanticException + * @param curOp The root of the operator subtree. + * @return a list of field nodes representing the internal column names. 
*/ - public List genColLists(Operator curOp) + public List genColLists(Operator curOp) throws SemanticException { if (curOp.getChildOperators() == null) { return null; } - List colList = null; + List colList = null; for (Operator child : curOp.getChildOperators()) { - List prunList = null; + List prunList = null; if (child instanceof CommonJoinOperator) { int tag = child.getParentOperators().indexOf(curOp); prunList = joinPrunedColLists.get(child).get((byte) tag); - } else if (child instanceof UnionOperator) { - List positions = unionPrunedColLists.get(child); - if (positions != null) { - prunList = new ArrayList<>(); - RowSchema oldRS = curOp.getSchema(); - for (Integer pos : positions) { - ColumnInfo colInfo = oldRS.getSignature().get(pos); - prunList.add(colInfo.getInternalName()); - } - } } else if (child instanceof FileSinkOperator) { prunList = new ArrayList<>(); RowSchema oldRS = curOp.getSchema(); for (ColumnInfo colInfo : oldRS.getSignature()) { - prunList.add(colInfo.getInternalName()); + prunList.add(new FieldNode(colInfo.getInternalName())); } } else { prunList = prunedColLists.get(child); @@ -147,49 +113,25 @@ public ParseContext getParseContext() { continue; } if (colList == null) { - colList = new ArrayList(prunList); + colList = new ArrayList<>(prunList); } else { - colList = Utilities.mergeUniqElems(colList, prunList); + colList = mergeFieldNodes(colList, prunList); } } return colList; } /** - * Get the path to the root column for the nested column attribute - * - * @param curOp current operator - * @return the nested column paths for current operator and its child operator - */ - public List genNestedColPaths(Operator curOp) { - if (curOp.getChildOperators() == null) { - return null; - } - Set groupPathsList = new HashSet<>(); - - for (Operator child : curOp.getChildOperators()) { - if (prunedNestedColLists.containsKey(child)) { - groupPathsList.addAll(prunedNestedColLists.get(child)); - } - } - - return new ArrayList<>(groupPathsList); - } - - /** - * Creates the list of internal column names(these names are used in the - * RowResolver and are different from the external column names) that are - * needed in the subtree. These columns eventually have to be selected from - * the table scan. + * Creates the list of internal column names (represented by field nodes, + * these names are used in the RowResolver and are different from the + * external column names) that are needed in the subtree. These columns + * eventually have to be selected from the table scan. * - * @param curOp - * The root of the operator subtree. - * @param child - * The consumer. - * @return List of the internal column names. - * @throws SemanticException + * @param curOp The root of the operator subtree. + * @param child The consumer. + * @return a list of field nodes representing the internal column names. 
*/ - public List genColLists(Operator curOp, + public List genColLists(Operator curOp, Operator child) throws SemanticException { if (curOp.getChildOperators() == null) { @@ -198,43 +140,31 @@ public ParseContext getParseContext() { if (child instanceof CommonJoinOperator) { int tag = child.getParentOperators().indexOf(curOp); return joinPrunedColLists.get(child).get((byte) tag); - } else if (child instanceof UnionOperator) { - List positions = unionPrunedColLists.get(child); - List prunList = new ArrayList<>(); - if (positions != null && positions.size() > 0) { - RowSchema oldRS = curOp.getSchema(); - for (Integer pos : positions) { - ColumnInfo colInfo = oldRS.getSignature().get(pos); - prunList.add(colInfo.getInternalName()); - } - } - return prunList; } else { return prunedColLists.get(child); } } /** - * Creates the list of internal column names from select expressions in a - * select operator. This function is used for the select operator instead of - * the genColLists function (which is used by the rest of the operators). + * Creates the list of internal column names (represented by field nodes) + * from select expressions in a select operator. This function is used for the + * select operator instead of the genColLists function (which is used by + * the rest of the operators). * - * @param op - * The select operator. - * @return List of the internal column names. + * @param op The select operator. + * @return a list of field nodes representing the internal column names. */ - public List getColsFromSelectExpr(SelectOperator op) { - List cols = new ArrayList(); + public List getColsFromSelectExpr(SelectOperator op) { + List cols = new ArrayList<>(); SelectDesc conf = op.getConf(); if(conf.isSelStarNoCompute()) { for (ColumnInfo colInfo : op.getSchema().getSignature()) { - cols.add(colInfo.getInternalName()); + cols.add(new FieldNode(colInfo.getInternalName())); } - } - else { + } else { List exprList = conf.getColList(); for (ExprNodeDesc expr : exprList) { - cols = Utilities.mergeUniqElems(cols, expr.getCols()); + cols = mergeFieldNodesWithDesc(cols, expr); } } return cols; @@ -243,16 +173,14 @@ public ParseContext getParseContext() { /** * Creates the list of internal column names for select * expressions. * - * @param op - * The select operator. - * @param colList - * The list of internal column names returned by the children of the - * select operator. - * @return List of the internal column names. + * @param op The select operator. + * @param colList The list of internal column names (represented by field nodes) + * returned by the children of the select operator. + * @return a list of field nodes representing the internal column names. */ - public List getSelectColsFromChildren(SelectOperator op, - List colList) { - List cols = new ArrayList(); + public List getSelectColsFromChildren(SelectOperator op, + List colList) { + List cols = new ArrayList<>(); SelectDesc conf = op.getConf(); if (colList != null && conf.isSelStarNoCompute()) { @@ -268,9 +196,24 @@ public ParseContext getParseContext() { // input columns are used. 
List outputColumnNames = conf.getOutputColumnNames(); for (int i = 0; i < outputColumnNames.size(); i++) { - if (colList == null || colList.contains(outputColumnNames.get(i))) { - ExprNodeDesc expr = selectExprs.get(i); - cols = Utilities.mergeUniqElems(cols, expr.getCols()); + if (colList == null) { + cols = mergeFieldNodesWithDesc(cols, selectExprs.get(i)); + } else { + FieldNode childFn = lookupColumn(colList, outputColumnNames.get(i)); + if (childFn != null) { + // In SemanticAnalyzer we inject SEL op before aggregation. The columns + // in this SEL are derived from the table schema, and do not reflect the + // actual columns being selected in the current query. + // In this case, we skip the merge and just use the path from the child ops. + ExprNodeDesc desc = selectExprs.get(i); + if (desc instanceof ExprNodeColumnDesc && ((ExprNodeColumnDesc) desc).getIsGenerated()) { + FieldNode fn = new FieldNode(((ExprNodeColumnDesc) desc).getColumn()); + fn.setNodes(childFn.getNodes()); + cols = mergeFieldNodes(cols, fn); + } else { + cols = mergeFieldNodesWithDesc(cols, selectExprs.get(i)); + } + } } } @@ -278,56 +221,30 @@ public ParseContext getParseContext() { } /** - * Creates the list of internal group paths for select * expressions. - * - * @param op The select operator. - * @param paths The list of nested column paths returned by the children of the - * select operator. - * @return List of the nested column path from leaf to the root. + * Given the 'desc', construct a list of field nodes representing the + * nested columns paths referenced by this 'desc'. + * @param desc the node descriptor + * @return a list of nested column paths referenced in the 'desc' */ - public List getSelectNestedColPathsFromChildren( - SelectOperator op, - List paths) { - List groups = new ArrayList<>(); - SelectDesc conf = op.getConf(); - - if (paths != null && conf.isSelStarNoCompute()) { - groups.addAll(paths); - return groups; - } - - List selectDescs = conf.getColList(); - - List outputColumnNames = conf.getOutputColumnNames(); - for (int i = 0; i < outputColumnNames.size(); i++) { - if (paths == null || paths.contains(outputColumnNames.get(i))) { - ExprNodeDesc desc = selectDescs.get(i); - List gp = getNestedColPathByDesc(desc); - groups.addAll(gp); - } - } - - return groups; - } - - // Entry method - private List getNestedColPathByDesc(ExprNodeDesc desc) { - List res = new ArrayList<>(); - getNestedColsFromExprNodeDesc(desc, "", res); - return res; + private static List getNestedColPathByDesc(ExprNodeDesc desc) { + List res = new ArrayList<>(); + getNestedColsFromExprNodeDesc(desc, null, res); + return mergeFieldNodes(new ArrayList(), res); } - private void getNestedColsFromExprNodeDesc( + private static void getNestedColsFromExprNodeDesc( ExprNodeDesc desc, - String pathToRoot, - List paths) { + FieldNode pathToRoot, + List paths) { if (desc instanceof ExprNodeColumnDesc) { String f = ((ExprNodeColumnDesc) desc).getColumn(); - String p = pathToRoot.isEmpty() ? f : f + "." + pathToRoot; + FieldNode p = new FieldNode(f); + p.addFieldNodes(pathToRoot); paths.add(p); } else if (desc instanceof ExprNodeFieldDesc) { String f = ((ExprNodeFieldDesc) desc).getFieldName(); - String p = pathToRoot.isEmpty() ? f : f + "." 
+ pathToRoot; + FieldNode p = new FieldNode(f); + p.addFieldNodes(pathToRoot); getNestedColsFromExprNodeDesc(((ExprNodeFieldDesc) desc).getDesc(), p, paths); } else { List children = desc.getChildren(); @@ -343,11 +260,11 @@ private void getNestedColsFromExprNodeDesc( /** * Create the list of internal columns for select tag of LV */ - public List getSelectColsFromLVJoin(RowSchema rs, - List colList) throws SemanticException { - List columns = new ArrayList(); - for (String col : colList) { - if (rs.getColumnInfo(col) != null) { + public List getSelectColsFromLVJoin(RowSchema rs, + List colList) throws SemanticException { + List columns = new ArrayList<>(); + for (FieldNode col : colList) { + if (rs.getColumnInfo(col.getFieldName()) != null) { columns.add(col); } } @@ -369,13 +286,11 @@ public void handleFilterUnionChildren(Operator curOp) if (curOp.getChildOperators() == null || !(curOp instanceof FilterOperator)) { return; } - List parentPrunList = prunedColLists.get(curOp); + List parentPrunList = prunedColLists.get(curOp); if(parentPrunList == null || parentPrunList.size() == 0) { return; } - FilterOperator filOp = (FilterOperator)curOp; - List prunList = null; - List[] childToParentIndex = null; + List prunList = null; for (Operator child : curOp.getChildOperators()) { if (child instanceof UnionOperator) { @@ -389,7 +304,7 @@ public void handleFilterUnionChildren(Operator curOp) Map colExprMap = new HashMap(); ArrayList outputRS = new ArrayList(); for (ColumnInfo colInfo : child.getSchema().getSignature()) { - if (!prunList.contains(colInfo.getInternalName())) { + if (lookupColumn(prunList, colInfo.getInternalName()) == null) { continue; } ExprNodeDesc colDesc = new ExprNodeColumnDesc(colInfo.getType(), @@ -408,10 +323,36 @@ public void handleFilterUnionChildren(Operator curOp) select, new RowSchema(outputRS), curOp); OperatorFactory.makeChild(sel, child); sel.setColumnExprMap(colExprMap); - } + } + } + static ArrayList toColumnNames(List columns) { + ArrayList names = new ArrayList<>(); + for (FieldNode fn : columns) { + names.add(fn.getFieldName()); } + return names; } + static List fromColumnNames(List columnNames) { + List fieldNodes = new ArrayList<>(); + for (String cn : columnNames) { + fieldNodes.add(new FieldNode(cn)); + } + return fieldNodes; + } + + static FieldNode lookupColumn(Collection columns, String colName) { + for (FieldNode fn : columns) { + if (fn.getFieldName() != null && fn.getFieldName().equals(colName)) { + return fn; + } + } + return null; + } + + static List mergeFieldNodesWithDesc(List left, ExprNodeDesc desc) { + return FieldNode.mergeFieldNodes(left, getNestedColPathByDesc(desc)); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java index 6ca4df9..7681a83 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java @@ -20,6 +20,7 @@ import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -79,6 +80,12 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import static org.apache.hadoop.hive.ql.optimizer.ColumnPrunerProcCtx.fromColumnNames; +import static org.apache.hadoop.hive.ql.optimizer.ColumnPrunerProcCtx.lookupColumn; +import 
static org.apache.hadoop.hive.ql.optimizer.ColumnPrunerProcCtx.mergeFieldNodesWithDesc; +import static org.apache.hadoop.hive.ql.optimizer.ColumnPrunerProcCtx.toColumnNames; +import static org.apache.hadoop.hive.ql.optimizer.FieldNode.mergeFieldNodes; + /** * Factory for generating the different node processors used by ColumnPruner. */ @@ -98,11 +105,8 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, FilterOperator op = (FilterOperator) nd; ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; ExprNodeDesc condn = op.getConf().getPredicate(); - // get list of columns used in the filter - List cl = condn.getCols(); - // merge it with the downstream col list - List filterOpPrunedColLists = Utilities.mergeUniqElems(cppCtx.genColLists(op), cl); - List filterOpPrunedColListsOrderPreserved = preserveColumnOrder(op, + List filterOpPrunedColLists = mergeFieldNodesWithDesc(cppCtx.genColLists(op), condn); + List filterOpPrunedColListsOrderPreserved = preserveColumnOrder(op, filterOpPrunedColLists); cppCtx.getPrunedColLists().put(op, filterOpPrunedColListsOrderPreserved); @@ -131,25 +135,27 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, Object... nodeOutputs) throws SemanticException { GroupByOperator gbOp = (GroupByOperator) nd; ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; - List colLists = new ArrayList(); + List colLists = new ArrayList<>(); GroupByDesc conf = gbOp.getConf(); + ArrayList keys = conf.getKeys(); for (ExprNodeDesc key : keys) { - colLists = Utilities.mergeUniqElems(colLists, key.getCols()); + colLists = mergeFieldNodesWithDesc(colLists, key); } ArrayList aggrs = conf.getAggregators(); for (AggregationDesc aggr : aggrs) { ArrayList params = aggr.getParameters(); for (ExprNodeDesc param : params) { - colLists = Utilities.mergeUniqElems(colLists, param.getCols()); + colLists = mergeFieldNodesWithDesc(colLists, param); } } + int groupingSetPosition = conf.getGroupingSetPosition(); if (groupingSetPosition >= 0) { - List neededCols = cppCtx.genColLists(gbOp); + List neededCols = cppCtx.genColLists(gbOp); String groupingColumn = conf.getOutputColumnNames().get(groupingSetPosition); - if (!neededCols.contains(groupingColumn)) { + if (lookupColumn(neededCols, groupingColumn) == null) { conf.getOutputColumnNames().remove(groupingSetPosition); if (gbOp.getSchema() != null) { gbOp.getSchema().getSignature().remove(groupingSetPosition); @@ -163,8 +169,8 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, if (child instanceof SelectOperator || child instanceof ReduceSinkOperator) { continue; } - List colList = cppCtx.genColLists(gbOp, child); - Set neededCols = new HashSet(); + List colList = cppCtx.genColLists(gbOp, child); + Set neededCols = new HashSet<>(); if (colList != null) { neededCols.addAll(colList); } else { @@ -177,7 +183,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, Map colExprMap = new HashMap(); ArrayList outputRS = new ArrayList(); for (ColumnInfo colInfo : gbOp.getSchema().getSignature()) { - if (!neededCols.contains(colInfo.getInternalName())) { + if (lookupColumn(neededCols, colInfo.getInternalName()) == null) { continue; } ExprNodeDesc colDesc = new ExprNodeColumnDesc(colInfo.getType(), @@ -223,14 +229,14 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, Operator op = (Operator) nd; RowSchema inputRS = op.getSchema(); - List prunedCols = cppCtx.getPrunedColList(op.getChildOperators() + List prunedCols = cppCtx.getPrunedColList(op.getChildOperators() .get(0)); 
Operator parent = op.getParentOperators().get(0); RowSchema parentRS = parent.getSchema(); List sig = parentRS.getSignature(); - List colList = new ArrayList(); + List colList = new ArrayList<>(); for (ColumnInfo cI : sig) { - colList.add(cI.getInternalName()); + colList.add(new FieldNode(cI.getInternalName())); } if (prunedCols.size() != inputRS.getSignature().size() @@ -239,7 +245,8 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, ArrayList outputs = new ArrayList(); Map colExprMap = new HashMap(); ArrayList outputRS = new ArrayList(); - for (String internalName : prunedCols) { + for (FieldNode internalCol: prunedCols) { + String internalName = internalCol.getFieldName(); ColumnInfo valueInfo = inputRS.getColumnInfo(internalName); ExprNodeDesc colDesc = new ExprNodeColumnDesc(valueInfo.getType(), valueInfo.getInternalName(), valueInfo.getTabAlias(), valueInfo.getIsVirtualCol()); @@ -273,7 +280,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, Object... nodeOutputs) throws SemanticException { super.process(nd, stack, ctx, nodeOutputs); - List cols = ((ColumnPrunerProcCtx)ctx).getPrunedColLists().get(nd); + List cols = ((ColumnPrunerProcCtx) ctx).getPrunedColLists().get(nd); if (null != cols) { pruneOperator(ctx, (LimitOperator) nd, cols); } @@ -313,14 +320,14 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, return super.process(nd, stack, cppCtx, nodeOutputs); } - List prunedCols = cppCtx.getPrunedColList(op.getChildOperators().get(0)); + List prunedCols = cppCtx.getPrunedColList(op.getChildOperators().get(0)); if (conf.forWindowing()) { WindowTableFunctionDef def = (WindowTableFunctionDef) funcDef; - prunedCols = Utilities.mergeUniqElems(getWindowFunctionColumns(def), prunedCols); + prunedCols = mergeFieldNodes(prunedCols, getWindowFunctionColumns(def)); } else if (conf.forNoop()) { prunedCols = new ArrayList(cppCtx.getPrunedColList(op.getChildOperators().get(0))); } else { - prunedCols = referencedColumns; + prunedCols = fromColumnNames(referencedColumns); } List newRS = prunedColumnsList(prunedCols, op.getSchema(), funcDef); @@ -328,16 +335,16 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, op.getSchema().setSignature(new ArrayList(newRS)); ShapeDetails outputShape = funcDef.getStartOfChain().getInput().getOutputShape(); - cppCtx.getPrunedColLists().put(op, outputShape.getColumnNames()); + cppCtx.getPrunedColLists().put(op, fromColumnNames(outputShape.getColumnNames())); return null; } - private List buildPrunedRS(List prunedCols, RowSchema oldRS) + private List buildPrunedRS(List prunedCols, RowSchema oldRS) throws SemanticException { ArrayList sig = new ArrayList(); - HashSet prunedColsSet = new HashSet(prunedCols); + HashSet prunedColsSet = new HashSet<>(prunedCols); for (ColumnInfo cInfo : oldRS.getSignature()) { - if (prunedColsSet.contains(cInfo.getInternalName())) { + if (lookupColumn(prunedColsSet, cInfo.getInternalName()) != null) { sig.add(cInfo); } } @@ -345,22 +352,21 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, } // always should be in this order (see PTFDeserializer#initializeWindowing) - private List getWindowFunctionColumns(WindowTableFunctionDef tDef) { - List columns = new ArrayList(); + private List getWindowFunctionColumns(WindowTableFunctionDef tDef) { + List columns = new ArrayList<>(); if (tDef.getWindowFunctions() != null) { for (WindowFunctionDef wDef : tDef.getWindowFunctions()) { - 
columns.add(wDef.getAlias()); + columns.add(new FieldNode(wDef.getAlias())); } } return columns; } - private RowResolver buildPrunedRR(List prunedCols, RowSchema oldRS) - throws SemanticException { + private RowResolver buildPrunedRR(List prunedCols, RowSchema oldRS) throws SemanticException { RowResolver resolver = new RowResolver(); - HashSet prunedColsSet = new HashSet(prunedCols); + HashSet prunedColsSet = new HashSet<>(prunedCols); for (ColumnInfo cInfo : oldRS.getSignature()) { - if (prunedColsSet.contains(cInfo.getInternalName())) { + if (lookupColumn(prunedColsSet, cInfo.getInternalName()) != null) { resolver.put(cInfo.getTabAlias(), cInfo.getAlias(), cInfo); } } @@ -370,7 +376,7 @@ private RowResolver buildPrunedRR(List prunedCols, RowSchema oldRS) /* * add any input columns referenced in WindowFn args or expressions. */ - private List prunedColumnsList(List prunedCols, RowSchema oldRS, + private List prunedColumnsList(List prunedCols, RowSchema oldRS, PartitionedTableFunctionDef pDef) throws SemanticException { pDef.getOutputShape().setRr(null); pDef.getOutputShape().setColumnNames(null); @@ -383,20 +389,20 @@ private RowResolver buildPrunedRR(List prunedCols, RowSchema oldRS) } for (PTFExpressionDef arg : wDef.getArgs()) { ExprNodeDesc exprNode = arg.getExprNode(); - Utilities.mergeUniqElems(prunedCols, exprNode.getCols()); + prunedCols = mergeFieldNodesWithDesc(prunedCols, exprNode); } } } if (tDef.getPartition() != null) { for (PTFExpressionDef col : tDef.getPartition().getExpressions()) { ExprNodeDesc exprNode = col.getExprNode(); - Utilities.mergeUniqElems(prunedCols, exprNode.getCols()); + prunedCols = mergeFieldNodesWithDesc(prunedCols, exprNode); } } if (tDef.getOrder() != null) { for (PTFExpressionDef col : tDef.getOrder().getExpressions()) { ExprNodeDesc exprNode = col.getExprNode(); - Utilities.mergeUniqElems(prunedCols, exprNode.getCols()); + prunedCols = mergeFieldNodesWithDesc(prunedCols, exprNode); } } } else { @@ -408,9 +414,9 @@ private RowResolver buildPrunedRR(List prunedCols, RowSchema oldRS) return prunedColumnsList(prunedCols, oldRS, (PartitionedTableFunctionDef)input); } - ArrayList inputColumns = prunedInputList(prunedCols, input); + ArrayList inputColumns = prunedInputList(prunedCols, input); input.getOutputShape().setRr(buildPrunedRR(inputColumns, oldRS)); - input.getOutputShape().setColumnNames(inputColumns); + input.getOutputShape().setColumnNames(toColumnNames(inputColumns)); return buildPrunedRS(prunedCols, oldRS); } @@ -419,17 +425,17 @@ private RowResolver buildPrunedRR(List prunedCols, RowSchema oldRS) * from the prunedCols list filter out columns that refer to WindowFns or WindowExprs * the returned list is set as the prunedList needed by the PTFOp. */ - private ArrayList prunedInputList(List prunedCols, PTFInputDef tDef) { - ArrayList prunedInputCols = new ArrayList(); + private ArrayList prunedInputList(List prunedCols, PTFInputDef tDef) { + ArrayList prunedInputCols = new ArrayList<>(); StructObjectInspector OI = tDef.getOutputShape().getOI(); for(StructField f : OI.getAllStructFieldRefs()) { String fName = f.getFieldName(); - if ( prunedCols.contains(fName)) { - prunedInputCols.add(fName); + FieldNode fn = lookupColumn(prunedCols, fName); + if (fn != null) { + prunedInputCols.add(fn); } } - return prunedInputCols; } } @@ -477,23 +483,19 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, Object... 
nodeOutputs) throws SemanticException { TableScanOperator scanOp = (TableScanOperator) nd; ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; - List cols = cppCtx + List cols = cppCtx .genColLists((Operator) nd); if (cols == null && !scanOp.getConf().isGatherStats() ) { scanOp.setNeededColumnIDs(null); return null; } - cols = cols == null ? new ArrayList() : cols; - List nestedCols = cppCtx.genNestedColPaths((Operator) nd); + cols = cols == null ? new ArrayList() : cols; cppCtx.getPrunedColLists().put((Operator) nd, cols); - cppCtx.getPrunedNestedColLists().put((Operator) nd, nestedCols); RowSchema inputRS = scanOp.getSchema(); setupNeededColumns(scanOp, inputRS, cols); - scanOp.setNeededNestedColumnPaths(nestedCols); - return null; } } @@ -502,9 +504,10 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, * RowSchema as well as the needed virtual columns, into TableScanDesc. */ public static void setupNeededColumns(TableScanOperator scanOp, RowSchema inputRS, - List cols) throws SemanticException { + List cols) throws SemanticException { List neededColumnIds = new ArrayList(); List neededColumnNames = new ArrayList(); + List neededNestedColumnPaths = new ArrayList<>(); List referencedColumnNames = new ArrayList(); TableScanDesc desc = scanOp.getConf(); List virtualCols = desc.getVirtualCols(); @@ -512,10 +515,11 @@ public static void setupNeededColumns(TableScanOperator scanOp, RowSchema inputR // add virtual columns for ANALYZE TABLE if(scanOp.getConf().isGatherStats()) { - cols.add(VirtualColumn.RAWDATASIZE.getName()); + cols.add(new FieldNode(VirtualColumn.RAWDATASIZE.getName())); } - for (String column : cols) { + for (FieldNode fn : cols) { + String column = fn.getFieldName(); ColumnInfo colInfo = inputRS.getColumnInfo(column); if (colInfo == null) { continue; @@ -538,12 +542,14 @@ public static void setupNeededColumns(TableScanOperator scanOp, RowSchema inputR // get the needed columns by id and name neededColumnIds.add(position); neededColumnNames.add(column); + neededNestedColumnPaths.addAll(fn.toPaths()); } } desc.setVirtualCols(newVirtualCols); scanOp.setNeededColumnIDs(neededColumnIds); scanOp.setNeededColumns(neededColumnNames); + scanOp.setNeededNestedColumnPaths(neededNestedColumnPaths); scanOp.setReferencedColumns(referencedColumnNames); } @@ -567,21 +573,21 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; ReduceSinkDesc conf = op.getConf(); - List colLists = new ArrayList(); + List colLists = new ArrayList<>(); ArrayList keys = conf.getKeyCols(); LOG.debug("Reduce Sink Operator " + op.getIdentifier() + " key:" + keys); for (ExprNodeDesc key : keys) { - colLists = Utilities.mergeUniqElems(colLists, key.getCols()); + colLists = mergeFieldNodesWithDesc(colLists, key); } for (ExprNodeDesc key : conf.getPartitionCols()) { - colLists = Utilities.mergeUniqElems(colLists, key.getCols()); + colLists = mergeFieldNodesWithDesc(colLists, key); } assert op.getNumChild() == 1; Operator child = op.getChildOperators().get(0); - List childCols = null; + List childCols = null; if (child instanceof CommonJoinOperator) { childCols = cppCtx.getJoinPrunedColLists().get(child) == null ? 
null : cppCtx.getJoinPrunedColLists().get(child) @@ -596,16 +602,21 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, if (childCols != null) { boolean[] flags = new boolean[valCols.size()]; - for (String childCol : childCols) { - int index = valColNames.indexOf(Utilities.removeValueTag(childCol)); + for (FieldNode childCol : childCols) { + int index = valColNames.indexOf(Utilities.removeValueTag(childCol.getFieldName())); if (index < 0) { continue; } flags[index] = true; - colLists = Utilities.mergeUniqElems(colLists, valCols.get(index).getCols()); + colLists = mergeFieldNodesWithDesc(colLists, valCols.get(index)); } - Collections.sort(colLists); + Collections.sort(colLists, new Comparator() { + @Override + public int compare(FieldNode o1, FieldNode o2) { + return o1.getFieldName().compareTo(o2.getFieldName()); + } + }); pruneReduceSinkOperator(flags, op, cppCtx); cppCtx.getPrunedColLists().put(op, colLists); return null; @@ -614,7 +625,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, // Reduce Sink contains the columns needed - no need to aggregate from // children for (ExprNodeDesc val : valCols) { - colLists = Utilities.mergeUniqElems(colLists, val.getCols()); + colLists = mergeFieldNodesWithDesc(colLists, val); } cppCtx.getPrunedColLists().put(op, colLists); @@ -640,7 +651,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, Object... nodeOutputs) throws SemanticException { LateralViewJoinOperator op = (LateralViewJoinOperator) nd; ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; - List cols = cppCtx.genColLists(op); + List cols = cppCtx.genColLists(op); if (cols == null) { return null; } @@ -658,25 +669,25 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, // columns from SEL(*) branch only and append all columns from UDTF branch to it int numSelColumns = op.getConf().getNumSelColumns(); - List colsAfterReplacement = new ArrayList(); - ArrayList newColNames = new ArrayList(); - for (String col : cols) { - int index = outputCols.indexOf(col); + List colsAfterReplacement = new ArrayList<>(); + List newCols = new ArrayList<>(); + for (FieldNode col : cols) { + int index = outputCols.indexOf(col.getFieldName()); // colExprMap.size() == size of cols from SEL(*) branch if (index >= 0 && index < numSelColumns) { - ExprNodeDesc transformed = colExprMap.get(col); - Utilities.mergeUniqElems(colsAfterReplacement, transformed.getCols()); - newColNames.add(col); + ExprNodeDesc transformed = colExprMap.get(col.getFieldName()); + colsAfterReplacement = mergeFieldNodesWithDesc(colsAfterReplacement, transformed); + newCols.add(col); } } // update number of columns from sel(*) - op.getConf().setNumSelColumns(newColNames.size()); + op.getConf().setNumSelColumns(newCols.size()); // add all UDTF columns // following SEL will do CP for columns from UDTF, not adding SEL in here - newColNames.addAll(outputCols.subList(numSelColumns, outputCols.size())); - op.getConf().setOutputInternalColNames(newColNames); - pruneOperator(ctx, op, newColNames); + newCols.addAll(fromColumnNames(outputCols.subList(numSelColumns, outputCols.size()))); + op.getConf().setOutputInternalColNames(toColumnNames(newCols)); + pruneOperator(ctx, op, newCols); cppCtx.getPrunedColLists().put(op, colsAfterReplacement); return null; } @@ -698,26 +709,26 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, // Update the info of SEL operator based on the pruned reordered columns // these are from ColumnPrunerSelectProc - List cols = 
cppCtx.getPrunedColList(select); + List cols = cppCtx.getPrunedColList(select); RowSchema rs = op.getSchema(); - ArrayList colList = new ArrayList(); - ArrayList outputColNames = new ArrayList(); - for (String col : cols) { + ArrayList colList = new ArrayList<>(); + List outputCols = new ArrayList<>(); + for (FieldNode col : cols) { // revert output cols of SEL(*) to ExprNodeColumnDesc - ColumnInfo colInfo = rs.getColumnInfo(col); + ColumnInfo colInfo = rs.getColumnInfo(col.getFieldName()); ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(colInfo); colList.add(colExpr); - outputColNames.add(col); + outputCols.add(col); } // replace SEL(*) to SEL(exprs) ((SelectDesc)select.getConf()).setSelStarNoCompute(false); ((SelectDesc)select.getConf()).setColList(colList); - ((SelectDesc)select.getConf()).setOutputColumnNames(outputColNames); - pruneOperator(ctx, select, outputColNames); + ((SelectDesc)select.getConf()).setOutputColumnNames(toColumnNames(outputCols)); + pruneOperator(ctx, select, outputCols); Operator udtfPath = op.getChildOperators().get(LateralViewJoinOperator.UDTF_TAG); - List lvFCols = new ArrayList(cppCtx.getPrunedColLists().get(udtfPath)); - lvFCols = Utilities.mergeUniqElems(lvFCols, outputColNames); + List lvFCols = new ArrayList<>(cppCtx.getPrunedColLists().get(udtfPath)); + lvFCols = mergeFieldNodes(lvFCols, outputCols); pruneOperator(ctx, op, lvFCols); return null; @@ -757,7 +768,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, } } - List cols = cppCtx.genColLists(op); + List cols = cppCtx.genColLists(op); SelectDesc conf = op.getConf(); @@ -774,7 +785,6 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, // and return the ones which have a marked column cppCtx.getPrunedColLists().put(op, cppCtx.getSelectColsFromChildren(op, cols)); - cppCtx.getPrunedNestedColLists().put(op, cppCtx.getSelectNestedColPathsFromChildren(op, cols)); if (cols == null || conf.isSelStarNoCompute()) { return null; } @@ -788,8 +798,8 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, if (cppCtx.getParseContext().getColumnAccessInfo() != null && cppCtx.getParseContext().getViewProjectToTableSchema() != null && cppCtx.getParseContext().getViewProjectToTableSchema().containsKey(op)) { - for (String col : cols) { - int index = originalOutputColumnNames.indexOf(col); + for (FieldNode col : cols) { + int index = originalOutputColumnNames.indexOf(col.getFieldName()); Table tab = cppCtx.getParseContext().getViewProjectToTableSchema().get(op); cppCtx.getParseContext().getColumnAccessInfo() .add(tab.getCompleteName(), tab.getCols().get(index).getName()); @@ -800,16 +810,16 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, ArrayList newOutputColumnNames = new ArrayList(); ArrayList rs_oldsignature = op.getSchema().getSignature(); ArrayList rs_newsignature = new ArrayList(); - for (String col : cols) { - int index = originalOutputColumnNames.indexOf(col); - newOutputColumnNames.add(col); + for (FieldNode col : cols) { + int index = originalOutputColumnNames.indexOf(col.getFieldName()); + newOutputColumnNames.add(col.getFieldName()); newColList.add(originalColList.get(index)); rs_newsignature.add(rs_oldsignature.get(index)); } op.getSchema().setSignature(rs_newsignature); conf.setColList(newColList); conf.setOutputColumnNames(newOutputColumnNames); - handleChildren(op, cols, cppCtx); + handleChildren(op, toColumnNames(cols), cppCtx); } return null; @@ -999,21 +1009,21 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx 
ctx, Object.. throws SemanticException { ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; UnionOperator op = (UnionOperator) nd; - List childColLists = cppCtx.genColLists(op); + List childColLists = cppCtx.genColLists(op); if (childColLists == null) { return null; } RowSchema inputSchema = op.getSchema(); if (inputSchema != null) { - List positions = new ArrayList<>(); - RowSchema oldRS = op.getSchema(); - for (int index = 0; index < oldRS.getSignature().size(); index++) { - ColumnInfo colInfo = oldRS.getSignature().get(index); - if (childColLists.contains(colInfo.getInternalName())) { - positions.add(index); + List prunedCols = new ArrayList<>(); + for (int index = 0; index < inputSchema.getSignature().size(); index++) { + ColumnInfo colInfo = inputSchema.getSignature().get(index); + FieldNode fn = lookupColumn(childColLists, colInfo.getInternalName()); + if (fn != null) { + prunedCols.add(fn); } } - cppCtx.getUnionPrunedColLists().put(op, positions); + cppCtx.getPrunedColLists().put(op, prunedCols); } return null; } @@ -1021,7 +1031,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, Object.. private static void pruneOperator(NodeProcessorCtx ctx, Operator op, - List cols) + List cols) throws SemanticException { // the pruning needs to preserve the order of columns in the input schema RowSchema inputSchema = op.getSchema(); @@ -1029,7 +1039,7 @@ private static void pruneOperator(NodeProcessorCtx ctx, ArrayList rs = new ArrayList(); RowSchema oldRS = op.getSchema(); for(ColumnInfo i : oldRS.getSignature()) { - if ( cols.contains(i.getInternalName())) { + if (lookupColumn(cols, i.getInternalName()) != null) { rs.add(i); } } @@ -1044,16 +1054,17 @@ private static void pruneOperator(NodeProcessorCtx ctx, * @return * @throws SemanticException */ - private static List preserveColumnOrder(Operator op, - List cols) + private static List preserveColumnOrder(Operator op, + List cols) throws SemanticException { RowSchema inputSchema = op.getSchema(); if (inputSchema != null) { - ArrayList rs = new ArrayList(); + ArrayList rs = new ArrayList<>(); ArrayList inputCols = inputSchema.getSignature(); for (ColumnInfo i: inputCols) { - if (cols.contains(i.getInternalName())) { - rs.add(i.getInternalName()); + FieldNode fn = lookupColumn(cols, i.getInternalName()); + if (fn != null) { + rs.add(fn); } } return rs; @@ -1062,7 +1073,6 @@ private static void pruneOperator(NodeProcessorCtx ctx, } } - private static void pruneJoinOperator(NodeProcessorCtx ctx, CommonJoinOperator op, JoinDesc conf, Map columnExprMap, @@ -1073,14 +1083,14 @@ private static void pruneJoinOperator(NodeProcessorCtx ctx, LOG.info("JOIN " + op.getIdentifier() + " oldExprs: " + conf.getExprs()); - List childColLists = cppCtx.genColLists(op); + List childColLists = cppCtx.genColLists(op); if (childColLists == null) { return; } - Map> prunedColLists = new HashMap>(); + Map> prunedColLists = new HashMap<>(); for (byte tag : conf.getTagOrder()) { - prunedColLists.put(tag, new ArrayList()); + prunedColLists.put(tag, new ArrayList()); } //add the columns in join filters @@ -1091,8 +1101,8 @@ private static void pruneJoinOperator(NodeProcessorCtx ctx, Map.Entry> entry = iter.next(); Byte tag = entry.getKey(); for (ExprNodeDesc desc : entry.getValue()) { - List cols = prunedColLists.get(tag); - cols = Utilities.mergeUniqElems(cols, desc.getCols()); + List cols = prunedColLists.get(tag); + cols = mergeFieldNodesWithDesc(cols, desc); prunedColLists.put(tag, cols); } } @@ -1106,7 +1116,7 @@ private static void 
pruneJoinOperator(NodeProcessorCtx ctx, String internalName = conf.getOutputColumnNames().get(i); ExprNodeDesc desc = columnExprMap.get(internalName); Byte tag = conf.getReversedExprs().get(internalName); - if (!childColLists.contains(internalName)) { + if (lookupColumn(childColLists, internalName) == null) { int index = conf.getExprs().get(tag).indexOf(desc); if (index < 0) { continue; @@ -1116,12 +1126,12 @@ private static void pruneJoinOperator(NodeProcessorCtx ctx, retainMap.get(tag).remove(index); } } else { - List prunedRSList = prunedColLists.get(tag); + List prunedRSList = prunedColLists.get(tag); if (prunedRSList == null) { - prunedRSList = new ArrayList(); + prunedRSList = new ArrayList<>(); prunedColLists.put(tag, prunedRSList); } - prunedRSList = Utilities.mergeUniqElems(prunedRSList, desc.getCols()); + prunedColLists.put(tag, mergeFieldNodesWithDesc(prunedRSList, desc)); outputCols.add(internalName); newColExprMap.put(internalName, desc); } @@ -1154,8 +1164,8 @@ private static void pruneJoinOperator(NodeProcessorCtx ctx, for (int j = 0; j < lists.size(); j++) { ExprNodeDesc desc = lists.get(j); Byte tag = entry.getKey(); - List cols = prunedColLists.get(tag); - cols = Utilities.mergeUniqElems(cols, desc.getCols()); + List cols = prunedColLists.get(tag); + cols = mergeFieldNodesWithDesc(cols, desc); prunedColLists.put(tag, cols); } } @@ -1164,7 +1174,7 @@ private static void pruneJoinOperator(NodeProcessorCtx ctx, for (Operator child : childOperators) { if (child instanceof ReduceSinkOperator) { - boolean[] flags = getPruneReduceSinkOpRetainFlags(childColLists, + boolean[] flags = getPruneReduceSinkOpRetainFlags(toColumnNames(childColLists), (ReduceSinkOperator) child); pruneReduceSinkOperator(flags, (ReduceSinkOperator) child, cppCtx); } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/FieldNode.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/FieldNode.java index 1579797..c96e1fb 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/FieldNode.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/FieldNode.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hive.ql.optimizer; +import com.google.common.base.Preconditions; + import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -35,9 +37,21 @@ public String getFieldName() { return fieldName; } + public void setFieldName(String fieldName) { + this.fieldName = fieldName; + } + public void addFieldNodes(FieldNode... nodes) { - if (nodes != null || nodes.length > 0) { - this.nodes.addAll(Arrays.asList(nodes)); + if (nodes != null) { + addFieldNodes(Arrays.asList(nodes)); + } + } + + public void addFieldNodes(List nodes) { + for (FieldNode fn : nodes) { + if (fn != null) { + this.nodes.add(fn); + } } } @@ -45,6 +59,95 @@ public void addFieldNodes(FieldNode... nodes) { return nodes; } + public void setNodes(List nodes) { + this.nodes = nodes; + } + + public List toPaths() { + List result = new ArrayList<>(); + if (nodes.isEmpty()) { + result.add(fieldName); + } else { + for (FieldNode child : nodes) { + for (String rest : child.toPaths()) { + result.add(fieldName + "." 
+ rest); + } + } + } + return result; + } + + public static FieldNode fromPath(String path) { + String[] parts = path.split("\\."); + return fromPath(parts, 0); + } + + private static FieldNode fromPath(String[] parts, int index) { + if (index == parts.length) { + return null; + } + FieldNode fn = new FieldNode(parts[index]); + fn.addFieldNodes(fromPath(parts, index + 1)); + return fn; + } + + /** + * Merge the field node 'fn' into list 'nodes', and return the result list. + */ + public static List mergeFieldNodes(List nodes, FieldNode fn) { + List result = new ArrayList<>(nodes); + for (int i = 0; i < nodes.size(); ++i) { + FieldNode mfn = mergeFieldNode(nodes.get(i), fn); + if (mfn != null) { + result.set(i, mfn); + return result; + } + } + result.add(fn); + return result; + } + + public static List mergeFieldNodes(List left, List right) { + List result = new ArrayList<>(left); + for (FieldNode fn : right) { + result = mergeFieldNodes(result, fn); + } + return result; + } + + /** + * Merge the field nodes 'left' and 'right' and return the merged node. + * Return null if the two nodes cannot be merged. + * + * There are basically 3 cases here: + * 1. 'left' and 'right' have the same depth, e.g., 'left' is s[b[c]] and + * 'right' is s[b[d]]. In this case, the merged node is s[b[c,d]] + * 2. 'left' has larger depth than 'right', e.g., 'left' is s[b] while + * 'right' is s[b[d]]. In this case, the merged node is s[b] + * 3. 'left' has smaller depth than 'right', e.g., 'left' is s[b[c]] while + * 'right' is s[b]. This is the opposite case of 2), and similarly, + * the merged node is s[b]. + * + * A example where the two inputs cannot be merged is, 'left' is s[b] while + * 'right' is p[c]. + */ + public static FieldNode mergeFieldNode(FieldNode left, FieldNode right) { + Preconditions.checkArgument(left.getFieldName() != null && right.getFieldName() != null); + if (!left.getFieldName().equals(right.getFieldName())) { + return null; + } + if (left.getNodes().isEmpty()) { + return left; + } else if (right.getNodes().isEmpty()) { + return right; + } else { + // Both are not empty. Merge two lists. + FieldNode result = new FieldNode(left.getFieldName()); + result.setNodes(mergeFieldNodes(left.getNodes(), right.getNodes())); + return result; + } + } + @Override public String toString() { String res = fieldName; @@ -63,18 +166,27 @@ public String toString() { } @Override - public boolean equals(Object object) { - FieldNode fieldNode = (FieldNode) object; - if (!fieldName.equals(fieldNode.getFieldName()) || fieldNode.getNodes().size() != fieldNode - .getNodes().size()) { + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { return false; } - for (int i = 0; i < fieldNode.getNodes().size(); i++) { - if (fieldNode.getNodes().get(i).equals(nodes.get(i))) { - return false; - } + FieldNode fieldNode = (FieldNode) o; + + if (fieldName != null ? !fieldName.equals(fieldNode.fieldName) : fieldNode.fieldName != null) { + return false; } - return true; + return nodes != null ? nodes.equals(fieldNode.nodes) : fieldNode.nodes == null; + + } + + @Override + public int hashCode() { + int result = fieldName != null ? fieldName.hashCode() : 0; + result = 31 * result + (nodes != null ? 
nodes.hashCode() : 0); + return result; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndexCtx.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndexCtx.java index 3d11907..dcea0e5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndexCtx.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndexCtx.java @@ -23,6 +23,7 @@ import java.util.HashMap; import java.util.List; +import org.apache.hadoop.hive.ql.optimizer.FieldNode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.fs.Path; @@ -213,7 +214,7 @@ private void replaceTableScanProcess(TableScanOperator scanOperator) throws Sema rewriteQueryCtx.getParseContext().setTopOps(topOps); ColumnPrunerProcFactory.setupNeededColumns(scanOperator, rs, - Arrays.asList(rewriteQueryCtx.getIndexKey())); + Arrays.asList(new FieldNode(rewriteQueryCtx.getIndexKey()))); } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index d55db0a..42a7ab9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -9135,9 +9135,9 @@ private Operator genSelectAllDesc(Operator input) throws SemanticException { new HashMap(); for (int i = 0; i < columns.size(); i++) { ColumnInfo col = columns.get(i); - colList.add(new ExprNodeColumnDesc(col)); + colList.add(new ExprNodeColumnDesc(col, true)); columnNames.add(col.getInternalName()); - columnExprMap.put(col.getInternalName(), new ExprNodeColumnDesc(col)); + columnExprMap.put(col.getInternalName(), new ExprNodeColumnDesc(col, true)); } RowResolver outputRR = inputRR.duplicate(); Operator output = putOpInsertMap(OperatorFactory.getAndMakeChild( diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeColumnDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeColumnDesc.java index 9a32054..4cfd0d7 100755 --- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeColumnDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeColumnDesc.java @@ -54,36 +54,60 @@ */ private boolean isSkewedCol; + /** + * Is this column a generated column, i.e., a column + * that is generated from table schema when Hive inserting SEL op for column pruning. + * This column has no relation with the input query. + * + * This is used for nested column pruning where we could have the following scenario: + * ... + * | + * SEL (use a) + * | + * OP (use a.f) + * | + * ... + * Without this field we do not know whether the column 'a' is actually specified in + * the input query or an inserted op by Hive. For the former case, the pruning needs + * to produce 'a', while for the latter case, it should produce 'a.f'. 
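+ *
+ * For example, for a query like "SELECT a.f FROM t", genSelectAllDesc() may insert a
+ * SEL(*) whose descriptor for 'a' is created as new ExprNodeColumnDesc(col, true),
+ * i.e. with isGenerated set. Because that reference to 'a' was generated by Hive rather
+ * than written by the user, the pruner is free to keep only the nested path 'a.f';
+ * had the query been "SELECT a, a.f FROM t", the user-specified 'a' would force the
+ * whole struct to be read.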
+ */ + private transient boolean isGenerated; + public ExprNodeColumnDesc() { } public ExprNodeColumnDesc(ColumnInfo ci) { - this(ci.getType(), ci.getInternalName(), ci.getTabAlias(), ci.getIsVirtualCol()); + this(ci, false); + } + + public ExprNodeColumnDesc(ColumnInfo ci, boolean isGenerated) { + this(ci.getType(), ci.getInternalName(), ci.getTabAlias(), ci.getIsVirtualCol(), false, isGenerated); } public ExprNodeColumnDesc(TypeInfo typeInfo, String column, String tabAlias, boolean isPartitionColOrVirtualCol) { - super(typeInfo); - this.column = column; - this.tabAlias = tabAlias; - this.isPartitionColOrVirtualCol = isPartitionColOrVirtualCol; + this(typeInfo, column, tabAlias, isPartitionColOrVirtualCol, false, false); } public ExprNodeColumnDesc(Class c, String column, String tabAlias, boolean isPartitionColOrVirtualCol) { - super(TypeInfoFactory.getPrimitiveTypeInfoFromJavaPrimitive(c)); - this.column = column; - this.tabAlias = tabAlias; - this.isPartitionColOrVirtualCol = isPartitionColOrVirtualCol; + this(TypeInfoFactory.getPrimitiveTypeInfoFromJavaPrimitive(c), + column, tabAlias, isPartitionColOrVirtualCol, false, false); } public ExprNodeColumnDesc(TypeInfo typeInfo, String column, String tabAlias, boolean isPartitionColOrVirtualCol, boolean isSkewedCol) { + this(typeInfo, column, tabAlias, isPartitionColOrVirtualCol, isSkewedCol, false); + } + + public ExprNodeColumnDesc(TypeInfo typeInfo, String column, String tabAlias, + boolean isPartitionColOrVirtualCol, boolean isSkewedCol, boolean isGenerated) { super(typeInfo); this.column = column; this.tabAlias = tabAlias; this.isPartitionColOrVirtualCol = isPartitionColOrVirtualCol; this.isSkewedCol = isSkewedCol; + this.isGenerated = isGenerated; } public String getColumn() { @@ -110,6 +134,14 @@ public void setIsPartitionColOrVirtualCol(boolean isPartitionCol) { this.isPartitionColOrVirtualCol = isPartitionCol; } + public boolean getIsGenerated() { + return this.isGenerated; + } + + public void setIsGenerated(boolean isGenerated) { + this.isGenerated = isGenerated; + } + @Override public String toString() { return "Column[" + column + "]"; diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/TableDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/TableDesc.java index 1da8e91..4f053d8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/TableDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/TableDesc.java @@ -199,6 +199,10 @@ public int hashCode() { @Override public boolean equals(Object o) { + if (o == this) { + return true; + } + if (!(o instanceof TableDesc)) { return false; } diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestDataWritableReadSupport.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestDataWritableReadSupport.java index b3aaca6..fc08ea6 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestDataWritableReadSupport.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestDataWritableReadSupport.java @@ -13,11 +13,13 @@ */ package org.apache.hadoop.hive.ql.io.parquet.read; +import com.google.common.collect.Sets; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageTypeParser; import org.junit.Test; import java.util.Arrays; +import java.util.HashSet; import static org.apache.hadoop.hive.ql.io.parquet.HiveParquetSchemaTestUtils.testConversion; @@ -36,7 +38,7 @@ public void testGetProjectedSchema1() throws Exception { testConversion("structCol", "struct", DataWritableReadSupport .getProjectedSchema(originalMsg, 
Arrays.asList("structCol"), Arrays.asList(0), - Arrays.asList("structCol.a")).toString()); + Sets.newHashSet("structCol.a")).toString()); } @Test @@ -51,7 +53,7 @@ public void testGetProjectedSchema2() throws Exception { testConversion("structCol", "struct", DataWritableReadSupport .getProjectedSchema(originalMsg, Arrays.asList("structCol"), Arrays.asList(0), - Arrays.asList("structCol.a", "structCol.b")).toString()); + Sets.newHashSet("structCol.a", "structCol.b")).toString()); } @Test @@ -67,7 +69,7 @@ public void testGetProjectedSchema3() throws Exception { testConversion("structCol,c", "struct,boolean", DataWritableReadSupport .getProjectedSchema(originalMsg, Arrays.asList("structCol", "c"), Arrays.asList(0, 1), - Arrays.asList("structCol.b", "c")).toString()); + Sets.newHashSet("structCol.b", "c")).toString()); } @Test @@ -86,7 +88,7 @@ public void testGetProjectedSchema4() throws Exception { testConversion("structCol", "struct>", DataWritableReadSupport .getProjectedSchema(originalMsg, Arrays.asList("structCol"), Arrays.asList(0), - Arrays.asList("structCol.subStructCol.b")).toString()); + Sets.newHashSet("structCol.subStructCol.b")).toString()); } @Test @@ -105,8 +107,8 @@ public void testGetProjectedSchema5() throws Exception { testConversion("structCol", "struct>", DataWritableReadSupport - .getProjectedSchema(originalMsg, Arrays.asList("structCol"), Arrays.asList(0), Arrays - .asList("structCol.subStructCol", "structCol.subStructCol.b", - "structCol.subStructCol.c")).toString()); + .getProjectedSchema(originalMsg, Arrays.asList("structCol"), Arrays.asList(0), + Sets.newHashSet("structCol.subStructCol", "structCol.subStructCol.b", + "structCol.subStructCol.c")).toString()); } } diff --git ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestColumnPrunerProcCtx.java ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestColumnPrunerProcCtx.java index dfcd154..2cfa747 100644 --- ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestColumnPrunerProcCtx.java +++ ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestColumnPrunerProcCtx.java @@ -38,6 +38,7 @@ import java.util.List; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -80,11 +81,11 @@ public void testGetSelectNestedColPathsFromChildren1() { ExprNodeDesc colDesc = new ExprNodeColumnDesc(col3Type, "root", "test", false); ExprNodeDesc col1 = new ExprNodeFieldDesc(col1Type, colDesc, "col1", false); ExprNodeDesc fieldDesc = new ExprNodeFieldDesc(TypeInfoFactory.booleanTypeInfo, col1, "a", false); - final List paths = Arrays.asList("_col0"); + final List paths = Arrays.asList(new FieldNode("_col0")); SelectOperator selectOperator = buildSelectOperator(Arrays.asList(fieldDesc), paths); - List groups = ctx.getSelectNestedColPathsFromChildren(selectOperator, paths); - assertEquals(new String[] { "root.col1.a" }, groups.toArray(new String[groups.size()])); + List groups = ctx.getSelectColsFromChildren(selectOperator, paths); + compareTestResults(groups, "root.col1.a"); } // Test select root.col1 from root:struct,col2:double> @@ -94,11 +95,11 @@ public void testGetSelectNestedColPathsFromChildren2() { ExprNodeDesc colDesc = new ExprNodeColumnDesc(col3Type, "root", "test", false); ExprNodeDesc fieldDesc = new ExprNodeFieldDesc(col1Type, colDesc, "col1", false); - final List paths = Arrays.asList("_col0"); + final List paths = Arrays.asList(new FieldNode("_col0")); SelectOperator selectOperator = 
buildSelectOperator(Arrays.asList(fieldDesc), paths); - List groups = ctx.getSelectNestedColPathsFromChildren(selectOperator, paths); - assertEquals(new String[] { "root.col1" }, groups.toArray(new String[groups.size()])); + List groups = ctx.getSelectColsFromChildren(selectOperator, paths); + compareTestResults(groups, "root.col1"); } // Test select root.col2 from root:struct,col2:double> @@ -108,11 +109,11 @@ public void testGetSelectNestedColPathsFromChildren3() { ExprNodeDesc colDesc = new ExprNodeColumnDesc(col3Type, "root", "test", false); ExprNodeDesc fieldDesc = new ExprNodeFieldDesc(col1Type, colDesc, "col2", false); - final List paths = Arrays.asList("_col0"); + final List paths = Arrays.asList(new FieldNode("_col0")); SelectOperator selectOperator = buildSelectOperator(Arrays.asList(fieldDesc), paths); - List groups = ctx.getSelectNestedColPathsFromChildren(selectOperator, paths); - assertEquals(new String[] { "root.col2" }, groups.toArray(new String[groups.size()])); + List groups = ctx.getSelectColsFromChildren(selectOperator, paths); + compareTestResults(groups, "root.col2"); } // Test select root from root:struct,col2:double> @@ -121,11 +122,11 @@ public void testGetSelectNestedColPathsFromChildren4() { ColumnPrunerProcCtx ctx = new ColumnPrunerProcCtx(null); ExprNodeDesc colDesc = new ExprNodeColumnDesc(col3Type, "root", "test", false); - final List paths = Arrays.asList("_col0"); + final List paths = Arrays.asList(new FieldNode("_col0")); SelectOperator selectOperator = buildSelectOperator(Arrays.asList(colDesc), paths); - List groups = ctx.getSelectNestedColPathsFromChildren(selectOperator, paths); - assertEquals(new String[] { "root" }, groups.toArray(new String[groups.size()])); + List groups = ctx.getSelectColsFromChildren(selectOperator, paths); + compareTestResults(groups, "root"); } // Test select named_struct from named_struct:struct @@ -143,9 +144,9 @@ public void testGetSelectNestedColPathsFromChildren5(){ ExprNodeDesc fieldDesc = new ExprNodeFieldDesc(TypeInfoFactory.doubleTypeInfo, funcDesc, "foo", false); - final List paths = Arrays.asList("_col0"); + final List paths = Arrays.asList(new FieldNode("_col0")); SelectOperator selectOperator = buildSelectOperator(Arrays.asList(fieldDesc), paths); - List groups = ctx.getSelectNestedColPathsFromChildren(selectOperator, paths); + List groups = ctx.getSelectColsFromChildren(selectOperator, paths); // Return empty result since only constant Desc exists assertEquals(0, groups.size()); } @@ -160,7 +161,7 @@ public void testGetSelectNestedColPathsFromChildren6(){ ExprNodeDesc col1 = new ExprNodeFieldDesc(col1Type, colDesc, "col1", false); ExprNodeDesc fieldDesc = new ExprNodeFieldDesc(TypeInfoFactory.doubleTypeInfo, col1, "b", false); - final List paths = Arrays.asList("_col0"); + final List paths = Arrays.asList(new FieldNode("_col0")); GenericUDF udf = mock(GenericUDFBridge.class); @@ -170,8 +171,8 @@ public void testGetSelectNestedColPathsFromChildren6(){ list); SelectOperator selectOperator = buildSelectOperator(Arrays.asList(funcDesc), paths); - List groups = ctx.getSelectNestedColPathsFromChildren(selectOperator, paths); - assertEquals(new String[] { "root.col1.b" }, groups.toArray(new String[groups.size()])); + List groups = ctx.getSelectColsFromChildren(selectOperator, paths); + compareTestResults(groups, "root.col1.b"); } // Test select pow(root.col1.b, root.col2) from table test(root @@ -187,7 +188,7 @@ public void testGetSelectNestedColPathsFromChildren7(){ colDesc = new ExprNodeColumnDesc(col3Type, "root", 
"test", false); ExprNodeDesc col2 = new ExprNodeFieldDesc(col2Type, colDesc, "col2", false); - final List paths = Arrays.asList("_col0"); + final List paths = Arrays.asList(new FieldNode("_col0")); GenericUDF udf = mock(GenericUDFPower.class); @@ -198,16 +199,60 @@ public void testGetSelectNestedColPathsFromChildren7(){ list); SelectOperator selectOperator = buildSelectOperator(Arrays.asList(funcDesc), paths); - List groups = ctx.getSelectNestedColPathsFromChildren(selectOperator, paths); - assertEquals(new String[] { "root.col1.b", "root.col2" }, groups.toArray(new String[groups - .size()])); + List groups = ctx.getSelectColsFromChildren(selectOperator, paths); + compareTestResults(groups, "root.col1.b", "root.col2"); + } + + @Test + public void testFieldNodeFromString() { + FieldNode fn = FieldNode.fromPath("s.a.b"); + assertEquals("s", fn.getFieldName()); + assertEquals(1, fn.getNodes().size()); + FieldNode childFn = fn.getNodes().get(0); + assertEquals("a", childFn.getFieldName()); + assertEquals(1, childFn.getNodes().size()); + assertEquals("b", childFn.getNodes().get(0).getFieldName()); + } + + @Test + public void testMergeFieldNode() { + FieldNode fn1 = FieldNode.fromPath("s.a.b"); + FieldNode fn2 = FieldNode.fromPath("s.a"); + assertEquals(fn2, FieldNode.mergeFieldNode(fn1, fn2)); + assertEquals(fn2, FieldNode.mergeFieldNode(fn2, fn1)); + + fn1 = FieldNode.fromPath("s.a"); + fn2 = FieldNode.fromPath("p.b"); + assertNull(FieldNode.mergeFieldNode(fn1, fn2)); + + fn1 = FieldNode.fromPath("s.a.b"); + fn2 = FieldNode.fromPath("s.a.c"); + FieldNode fn = FieldNode.mergeFieldNode(fn1, fn2); + assertEquals("s", fn.getFieldName()); + FieldNode childFn = fn.getNodes().get(0); + assertEquals("a", childFn.getFieldName()); + assertEquals(2, childFn.getNodes().size()); + assertEquals("b", childFn.getNodes().get(0).getFieldName()); + assertEquals("c", childFn.getNodes().get(1).getFieldName()); + } + + private void compareTestResults(List fieldNodes, String... 
paths) { + List expectedPaths = new ArrayList<>(); + for (FieldNode fn : fieldNodes) { + expectedPaths.addAll(fn.toPaths()); + } + assertEquals("Expected paths to have length " + expectedPaths + ", but got " + + paths.length, expectedPaths.size(), paths.length); + for (int i = 0; i < expectedPaths.size(); ++i) { + assertEquals("Element at index " + i + " doesn't match", expectedPaths.get(i), paths[i]); + } } private SelectOperator buildSelectOperator( List colList, - List outputColumnNames) { + List outputCols) { SelectOperator selectOperator = mock(SelectOperator.class); - SelectDesc selectDesc = new SelectDesc(colList, outputColumnNames); + SelectDesc selectDesc = new SelectDesc(colList, ColumnPrunerProcCtx.toColumnNames(outputCols)); selectDesc.setSelStarNoCompute(false); when(selectOperator.getConf()).thenReturn(selectDesc); return selectOperator; diff --git ql/src/test/queries/clientpositive/nested_column_pruning.q ql/src/test/queries/clientpositive/nested_column_pruning.q new file mode 100644 index 0000000..28b974e --- /dev/null +++ ql/src/test/queries/clientpositive/nested_column_pruning.q @@ -0,0 +1,112 @@ +set hive.fetch.task.conversion = none; + +-- First, create source tables +DROP TABLE IF EXISTS dummy; +CREATE TABLE dummy (i int); +INSERT INTO TABLE dummy VALUES (42); + +DROP TABLE IF EXISTS nested_tbl_1; +CREATE TABLE nested_tbl_1 ( + a int, + s1 struct, f6: int>, + s2 struct, f11: map>>, + s3 struct>> +) STORED AS PARQUET; + +INSERT INTO TABLE nested_tbl_1 SELECT + 1, named_struct('f1', false, 'f2', 'foo', 'f3', named_struct('f4', 4, 'f5', cast(5.0 as double)), 'f6', 4), + named_struct('f7', 'f7', 'f8', named_struct('f9', true, 'f10', array(10, 11), 'f11', map('key1', true, 'key2', false))), + named_struct('f12', array(named_struct('f13', 'foo', 'f14', 14), named_struct('f13', 'bar', 'f14', 28))) +FROM dummy; + +DROP TABLE IF EXISTS nested_tbl_2; +CREATE TABLE nested_tbl_2 LIKE nested_tbl_1; + +INSERT INTO TABLE nested_tbl_2 SELECT + 2, named_struct('f1', true, 'f2', 'bar', 'f3', named_struct('f4', 4, 'f5', cast(6.5 as double)), 'f6', 4), + named_struct('f7', 'f72', 'f8', named_struct('f9', false, 'f10', array(20, 22), 'f11', map('key3', true, 'key4', false))), + named_struct('f12', array(named_struct('f13', 'bar', 'f14', 28), named_struct('f13', 'foo', 'f14', 56))) +FROM dummy; + +-- Testing only select statements + +EXPLAIN SELECT a FROM nested_tbl_1; +SELECT a FROM nested_tbl_1; + +EXPLAIN SELECT s1.f1 FROM nested_tbl_1; +SELECT s1.f1 FROM nested_tbl_1; + +EXPLAIN SELECT s1.f1, s1.f2 FROM nested_tbl_1; +SELECT s1.f1, s1.f2 FROM nested_tbl_1; + +-- In this case 's1.f3' and 's1.f3.f4' should be merged +EXPLAIN SELECT s1.f3, s1.f3.f4 FROM nested_tbl_1; +SELECT s1.f3, s1.f3.f4 FROM nested_tbl_1; + +-- Testing select array and index shifting +EXPLAIN SELECT s1.f3.f5 FROM nested_tbl_1; +SELECT s1.f3.f5 FROM nested_tbl_1; + +-- Testing select from multiple structs +EXPLAIN SELECT s1.f3.f4, s2.f8.f9 FROM nested_tbl_1; +SELECT s1.f3.f4, s2.f8.f9 FROM nested_tbl_1; + + +-- Testing select with filter + +EXPLAIN SELECT s1.f2 FROM nested_tbl_1 WHERE s1.f1 = FALSE; +SELECT s1.f2 FROM nested_tbl_1 WHERE s1.f1 = FALSE; + +EXPLAIN SELECT s1.f3.f5 FROM nested_tbl_1 WHERE s1.f3.f4 = 4; +SELECT s1.f3.f5 FROM nested_tbl_1 WHERE s1.f3.f4 = 4; + +EXPLAIN SELECT s2.f8 FROM nested_tbl_1 WHERE s1.f2 = 'foo' AND size(s2.f8.f10) > 1 AND s2.f8.f11['key1'] = TRUE; +SELECT s2.f8 FROM nested_tbl_1 WHERE s1.f2 = 'foo' AND size(s2.f8.f10) > 1 AND s2.f8.f11['key1'] = TRUE; + + +-- Testing lateral view + 
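+-- The expected plan below shows that the first explode is pruned down to the nested path
+-- s2.f8.f10, while the second lateral view over s3.f12 still retains the whole s3 struct
+-- (the TableScan lists NestedColumnPaths: s3, s2.f8.f10).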
+EXPLAIN SELECT col1, col2 FROM nested_tbl_1 +LATERAL VIEW explode(s2.f8.f10) tbl1 AS col1 +LATERAL VIEW explode(s3.f12) tbl2 AS col2; +SELECT col1, col2 FROM nested_tbl_1 +LATERAL VIEW explode(s2.f8.f10) tbl1 AS col1 +LATERAL VIEW explode(s3.f12) tbl2 AS col2; + + +-- Testing UDFs +EXPLAIN SELECT pmod(s2.f8.f10[1], s1.f3.f4) FROM nested_tbl_1; +SELECT pmod(s2.f8.f10[1], s1.f3.f4) FROM nested_tbl_1; + + +-- Testing aggregations + +EXPLAIN SELECT s1.f3.f5, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3.f5; +SELECT s1.f3.f5, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3.f5; + +EXPLAIN SELECT s1.f3, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3; +SELECT s1.f3, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3; + +EXPLAIN SELECT s1.f3, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3 ORDER BY s1.f3; +SELECT s1.f3, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3 ORDER BY s1.f3; + + +-- Testing joins + +EXPLAIN SELECT t1.s1.f3.f5, t2.s2.f8 +FROM nested_tbl_1 t1 JOIN nested_tbl_2 t2 +ON t1.s1.f3.f4 = t2.s1.f6 +WHERE t2.s2.f8.f9 == FALSE; +SELECT t1.s1.f3.f5, t2.s2.f8 +FROM nested_tbl_1 t1 JOIN nested_tbl_2 t2 +ON t1.s1.f3.f4 = t2.s1.f6 +WHERE t2.s2.f8.f9 == FALSE; + +EXPLAIN SELECT t1.s1.f3.f5, t2.s2.f8 +FROM nested_tbl_1 t1 JOIN nested_tbl_1 t2 +ON t1.s1.f3.f4 = t2.s1.f6 +WHERE t2.s2.f8.f9 == TRUE; +SELECT t1.s1.f3.f5, t2.s2.f8 +FROM nested_tbl_1 t1 JOIN nested_tbl_1 t2 +ON t1.s1.f3.f4 = t2.s1.f6 +WHERE t2.s2.f8.f9 == TRUE; diff --git ql/src/test/results/clientpositive/nested_column_pruning.q.out ql/src/test/results/clientpositive/nested_column_pruning.q.out new file mode 100644 index 0000000..44b07d6 --- /dev/null +++ ql/src/test/results/clientpositive/nested_column_pruning.q.out @@ -0,0 +1,1090 @@ +PREHOOK: query: -- First, create source tables +DROP TABLE IF EXISTS dummy +PREHOOK: type: DROPTABLE +POSTHOOK: query: -- First, create source tables +DROP TABLE IF EXISTS dummy +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE dummy (i int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@dummy +POSTHOOK: query: CREATE TABLE dummy (i int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dummy +PREHOOK: query: INSERT INTO TABLE dummy VALUES (42) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@dummy +POSTHOOK: query: INSERT INTO TABLE dummy VALUES (42) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@dummy +POSTHOOK: Lineage: dummy.i EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: DROP TABLE IF EXISTS nested_tbl_1 +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS nested_tbl_1 +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE nested_tbl_1 ( + a int, + s1 struct, f6: int>, + s2 struct, f11: map>>, + s3 struct>> +) STORED AS PARQUET +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@nested_tbl_1 +POSTHOOK: query: CREATE TABLE nested_tbl_1 ( + a int, + s1 struct, f6: int>, + s2 struct, f11: map>>, + s3 struct>> +) STORED AS PARQUET +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@nested_tbl_1 +PREHOOK: query: INSERT INTO TABLE nested_tbl_1 SELECT + 1, named_struct('f1', false, 'f2', 'foo', 'f3', named_struct('f4', 4, 'f5', cast(5.0 as double)), 'f6', 4), + named_struct('f7', 'f7', 'f8', named_struct('f9', true, 'f10', array(10, 
11), 'f11', map('key1', true, 'key2', false))), + named_struct('f12', array(named_struct('f13', 'foo', 'f14', 14), named_struct('f13', 'bar', 'f14', 28))) +FROM dummy +PREHOOK: type: QUERY +PREHOOK: Input: default@dummy +PREHOOK: Output: default@nested_tbl_1 +POSTHOOK: query: INSERT INTO TABLE nested_tbl_1 SELECT + 1, named_struct('f1', false, 'f2', 'foo', 'f3', named_struct('f4', 4, 'f5', cast(5.0 as double)), 'f6', 4), + named_struct('f7', 'f7', 'f8', named_struct('f9', true, 'f10', array(10, 11), 'f11', map('key1', true, 'key2', false))), + named_struct('f12', array(named_struct('f13', 'foo', 'f14', 14), named_struct('f13', 'bar', 'f14', 28))) +FROM dummy +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dummy +POSTHOOK: Output: default@nested_tbl_1 +POSTHOOK: Lineage: nested_tbl_1.a SIMPLE [] +POSTHOOK: Lineage: nested_tbl_1.s1 EXPRESSION [] +POSTHOOK: Lineage: nested_tbl_1.s2 EXPRESSION [] +POSTHOOK: Lineage: nested_tbl_1.s3 EXPRESSION [] +PREHOOK: query: DROP TABLE IF EXISTS nested_tbl_2 +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS nested_tbl_2 +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE nested_tbl_2 LIKE nested_tbl_1 +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@nested_tbl_2 +POSTHOOK: query: CREATE TABLE nested_tbl_2 LIKE nested_tbl_1 +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@nested_tbl_2 +PREHOOK: query: INSERT INTO TABLE nested_tbl_2 SELECT + 2, named_struct('f1', true, 'f2', 'bar', 'f3', named_struct('f4', 4, 'f5', cast(6.5 as double)), 'f6', 4), + named_struct('f7', 'f72', 'f8', named_struct('f9', false, 'f10', array(20, 22), 'f11', map('key3', true, 'key4', false))), + named_struct('f12', array(named_struct('f13', 'bar', 'f14', 28), named_struct('f13', 'foo', 'f14', 56))) +FROM dummy +PREHOOK: type: QUERY +PREHOOK: Input: default@dummy +PREHOOK: Output: default@nested_tbl_2 +POSTHOOK: query: INSERT INTO TABLE nested_tbl_2 SELECT + 2, named_struct('f1', true, 'f2', 'bar', 'f3', named_struct('f4', 4, 'f5', cast(6.5 as double)), 'f6', 4), + named_struct('f7', 'f72', 'f8', named_struct('f9', false, 'f10', array(20, 22), 'f11', map('key3', true, 'key4', false))), + named_struct('f12', array(named_struct('f13', 'bar', 'f14', 28), named_struct('f13', 'foo', 'f14', 56))) +FROM dummy +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dummy +POSTHOOK: Output: default@nested_tbl_2 +POSTHOOK: Lineage: nested_tbl_2.a SIMPLE [] +POSTHOOK: Lineage: nested_tbl_2.s1 EXPRESSION [] +POSTHOOK: Lineage: nested_tbl_2.s2 EXPRESSION [] +POSTHOOK: Lineage: nested_tbl_2.s3 EXPRESSION [] +PREHOOK: query: -- Testing only select statements + +EXPLAIN SELECT a FROM nested_tbl_1 +PREHOOK: type: QUERY +POSTHOOK: query: -- Testing only select statements + +EXPLAIN SELECT a FROM nested_tbl_1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: nested_tbl_1 + NestedColumnPaths: a + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a FROM nested_tbl_1 +PREHOOK: type: QUERY +PREHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a FROM nested_tbl_1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +1 +PREHOOK: query: EXPLAIN SELECT s1.f1 FROM nested_tbl_1 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT s1.f1 FROM nested_tbl_1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: nested_tbl_1 + NestedColumnPaths: s1.f1 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s1.f1 (type: boolean) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT s1.f1 FROM nested_tbl_1 +PREHOOK: type: QUERY +PREHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT s1.f1 FROM nested_tbl_1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +false +PREHOOK: query: EXPLAIN SELECT s1.f1, s1.f2 FROM nested_tbl_1 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT s1.f1, s1.f2 FROM nested_tbl_1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: nested_tbl_1 + NestedColumnPaths: s1.f1, s1.f2 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s1.f1 (type: boolean), s1.f2 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT s1.f1, s1.f2 FROM nested_tbl_1 +PREHOOK: type: QUERY +PREHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT s1.f1, s1.f2 FROM nested_tbl_1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +false foo +PREHOOK: query: -- In this case 's1.f3' and 's1.f3.f4' should be merged +EXPLAIN SELECT s1.f3, s1.f3.f4 FROM nested_tbl_1 +PREHOOK: type: QUERY +POSTHOOK: query: -- In this case 's1.f3' and 's1.f3.f4' should be merged +EXPLAIN SELECT s1.f3, s1.f3.f4 FROM nested_tbl_1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 
depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: nested_tbl_1 + NestedColumnPaths: s1.f3 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s1.f3 (type: struct), s1.f3.f4 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT s1.f3, s1.f3.f4 FROM nested_tbl_1 +PREHOOK: type: QUERY +PREHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT s1.f3, s1.f3.f4 FROM nested_tbl_1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +{"f4":4,"f5":5.0} 4 +PREHOOK: query: -- Testing select array and index shifting +EXPLAIN SELECT s1.f3.f5 FROM nested_tbl_1 +PREHOOK: type: QUERY +POSTHOOK: query: -- Testing select array and index shifting +EXPLAIN SELECT s1.f3.f5 FROM nested_tbl_1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: nested_tbl_1 + NestedColumnPaths: s1.f3.f5 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s1.f3.f5 (type: double) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT s1.f3.f5 FROM nested_tbl_1 +PREHOOK: type: QUERY +PREHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT s1.f3.f5 FROM nested_tbl_1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +5.0 +PREHOOK: query: -- Testing select from multiple structs +EXPLAIN SELECT s1.f3.f4, s2.f8.f9 FROM nested_tbl_1 +PREHOOK: type: QUERY +POSTHOOK: query: -- Testing select from multiple structs +EXPLAIN SELECT s1.f3.f4, s2.f8.f9 FROM nested_tbl_1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: nested_tbl_1 + NestedColumnPaths: s1.f3.f4, s2.f8.f9 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s1.f3.f4 (type: int), s2.f8.f9 (type: boolean) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + table: + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT s1.f3.f4, s2.f8.f9 FROM nested_tbl_1 +PREHOOK: type: QUERY +PREHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT s1.f3.f4, s2.f8.f9 FROM nested_tbl_1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +4 true +PREHOOK: query: -- Testing select with filter + +EXPLAIN SELECT s1.f2 FROM nested_tbl_1 WHERE s1.f1 = FALSE +PREHOOK: type: QUERY +POSTHOOK: query: -- Testing select with filter + +EXPLAIN SELECT s1.f2 FROM nested_tbl_1 WHERE s1.f1 = FALSE +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: nested_tbl_1 + NestedColumnPaths: s1.f2, s1.f1 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (s1.f1 = false) (type: boolean) + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s1.f2 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT s1.f2 FROM nested_tbl_1 WHERE s1.f1 = FALSE +PREHOOK: type: QUERY +PREHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT s1.f2 FROM nested_tbl_1 WHERE s1.f1 = FALSE +POSTHOOK: type: QUERY +POSTHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +foo +PREHOOK: query: EXPLAIN SELECT s1.f3.f5 FROM nested_tbl_1 WHERE s1.f3.f4 = 4 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT s1.f3.f5 FROM nested_tbl_1 WHERE s1.f3.f4 = 4 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: nested_tbl_1 + NestedColumnPaths: s1.f3.f5, s1.f3.f4 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (s1.f3.f4 = 4) (type: boolean) + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s1.f3.f5 (type: double) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT s1.f3.f5 FROM nested_tbl_1 WHERE s1.f3.f4 = 4 +PREHOOK: type: QUERY +PREHOOK: Input: 
default@nested_tbl_1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT s1.f3.f5 FROM nested_tbl_1 WHERE s1.f3.f4 = 4 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +5.0 +PREHOOK: query: EXPLAIN SELECT s2.f8 FROM nested_tbl_1 WHERE s1.f2 = 'foo' AND size(s2.f8.f10) > 1 AND s2.f8.f11['key1'] = TRUE +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT s2.f8 FROM nested_tbl_1 WHERE s1.f2 = 'foo' AND size(s2.f8.f10) > 1 AND s2.f8.f11['key1'] = TRUE +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: nested_tbl_1 + NestedColumnPaths: s1.f2, s2.f8 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((s1.f2 = 'foo') and (size(s2.f8.f10) > 1) and (s2.f8.f11['key1'] = true)) (type: boolean) + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s2.f8 (type: struct,f11:map>) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT s2.f8 FROM nested_tbl_1 WHERE s1.f2 = 'foo' AND size(s2.f8.f10) > 1 AND s2.f8.f11['key1'] = TRUE +PREHOOK: type: QUERY +PREHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT s2.f8 FROM nested_tbl_1 WHERE s1.f2 = 'foo' AND size(s2.f8.f10) > 1 AND s2.f8.f11['key1'] = TRUE +POSTHOOK: type: QUERY +POSTHOOK: Input: default@nested_tbl_1 +#### A masked pattern was here #### +{"f9":true,"f10":[10,11],"f11":{"key1":true,"key2":false}} +PREHOOK: query: -- Testing lateral view + +EXPLAIN SELECT col1, col2 FROM nested_tbl_1 +LATERAL VIEW explode(s2.f8.f10) tbl1 AS col1 +LATERAL VIEW explode(s3.f12) tbl2 AS col2 +PREHOOK: type: QUERY +POSTHOOK: query: -- Testing lateral view + +EXPLAIN SELECT col1, col2 FROM nested_tbl_1 +LATERAL VIEW explode(s2.f8.f10) tbl1 AS col1 +LATERAL VIEW explode(s3.f12) tbl2 AS col2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: nested_tbl_1 + NestedColumnPaths: s3, s2.f8.f10 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Lateral View Forward + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s3 (type: struct>>) + outputColumnNames: s3 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Lateral View Join Operator + outputColumnNames: _col3, _col7 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Lateral View Forward + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col7 (type: int) + outputColumnNames: _col7 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Lateral View Join Operator + outputColumnNames: _col7, _col8 + 
Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col7 (type: int), _col8 (type: struct) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Select Operator + expressions: _col3.f12 (type: array>) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + UDTF Operator + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + function name: explode + Lateral View Join Operator + outputColumnNames: _col7, _col8 + Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col7 (type: int), _col8 (type: struct) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Select Operator + expressions: s2.f8.f10 (type: array) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + UDTF Operator + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + function name: explode + Lateral View Join Operator + outputColumnNames: _col3, _col7 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Lateral View Forward + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col7 (type: int) + outputColumnNames: _col7 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Lateral View Join Operator + outputColumnNames: _col7, _col8 + Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col7 (type: int), _col8 (type: struct) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Select Operator + expressions: _col3.f12 (type: array>) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + UDTF Operator + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + function name: explode + Lateral View Join Operator + outputColumnNames: _col7, _col8 + Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col7 (type: int), _col8 (type: struct) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE + File Output Operator + 
compressed: false
+            Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT col1, col2 FROM nested_tbl_1
+LATERAL VIEW explode(s2.f8.f10) tbl1 AS col1
+LATERAL VIEW explode(s3.f12) tbl2 AS col2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@nested_tbl_1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT col1, col2 FROM nested_tbl_1
+LATERAL VIEW explode(s2.f8.f10) tbl1 AS col1
+LATERAL VIEW explode(s3.f12) tbl2 AS col2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@nested_tbl_1
+#### A masked pattern was here ####
+10	{"f13":"foo","f14":14}
+10	{"f13":"bar","f14":28}
+11	{"f13":"foo","f14":14}
+11	{"f13":"bar","f14":28}
+PREHOOK: query: -- Testing UDFs
+EXPLAIN SELECT pmod(s2.f8.f10[1], s1.f3.f4) FROM nested_tbl_1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Testing UDFs
+EXPLAIN SELECT pmod(s2.f8.f10[1], s1.f3.f4) FROM nested_tbl_1
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: nested_tbl_1
+            NestedColumnPaths: s2.f8.f10, s1.f3.f4
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: (s2.f8.f10[1] pmod s1.f3.f4) (type: int)
+              outputColumnNames: _col0
+              Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+              File Output Operator
+                compressed: false
+                Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                table:
+                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT pmod(s2.f8.f10[1], s1.f3.f4) FROM nested_tbl_1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@nested_tbl_1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT pmod(s2.f8.f10[1], s1.f3.f4) FROM nested_tbl_1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@nested_tbl_1
+#### A masked pattern was here ####
+3
+PREHOOK: query: -- Testing aggregations
+
+EXPLAIN SELECT s1.f3.f5, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3.f5
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Testing aggregations
+
+EXPLAIN SELECT s1.f3.f5, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3.f5
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: nested_tbl_1
+            NestedColumnPaths: s1.f3.f5, s1.f3.f4
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: s1 (type: struct<f1:boolean,f2:string,f3:struct<f4:int,f5:double>,f6:int>)
+              outputColumnNames: s1
+              Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                aggregations: count(s1.f3.f4)
+                keys: s1.f3.f5 (type: double)
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: double)
+                  sort order: +
+                  Map-reduce partition columns: _col0 (type: double)
+                  Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col1 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          keys: KEY._col0 (type: double)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT s1.f3.f5, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3.f5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@nested_tbl_1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT s1.f3.f5, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3.f5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@nested_tbl_1
+#### A masked pattern was here ####
+5.0	1
+PREHOOK: query: EXPLAIN SELECT s1.f3, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT s1.f3, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: nested_tbl_1
+            NestedColumnPaths: s1.f3
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: s1 (type: struct<f1:boolean,f2:string,f3:struct<f4:int,f5:double>,f6:int>)
+              outputColumnNames: s1
+              Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                aggregations: count(s1.f3.f4)
+                keys: s1.f3 (type: struct<f4:int,f5:double>)
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: struct<f4:int,f5:double>)
+                  sort order: +
+                  Map-reduce partition columns: _col0 (type: struct<f4:int,f5:double>)
+                  Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col1 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          keys: KEY._col0 (type: struct<f4:int,f5:double>)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT s1.f3, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@nested_tbl_1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT s1.f3, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@nested_tbl_1
+#### A masked pattern was here ####
+{"f4":4,"f5":5.0}	1
+PREHOOK: query: EXPLAIN SELECT s1.f3, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3 ORDER BY s1.f3
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT s1.f3, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3 ORDER BY s1.f3
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: nested_tbl_1
+            NestedColumnPaths: s1.f3
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: s1 (type: struct<f1:boolean,f2:string,f3:struct<f4:int,f5:double>,f6:int>)
+              outputColumnNames: s1
+              Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                aggregations: count(s1.f3.f4)
+                keys: s1.f3 (type: struct<f4:int,f5:double>)
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: struct<f4:int,f5:double>)
+                  sort order: +
+                  Map-reduce partition columns: _col0 (type: struct<f4:int,f5:double>)
+                  Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col1 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          keys: KEY._col0 (type: struct<f4:int,f5:double>)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            Reduce Output Operator
+              key expressions: _col0 (type: struct<f4:int,f5:double>)
+              sort order: +
+              Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+              value expressions: _col1 (type: bigint)
+      Reduce Operator Tree:
+        Select Operator
+          expressions: KEY.reducesinkkey0 (type: struct<f4:int,f5:double>), VALUE._col0 (type: bigint)
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT s1.f3, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3 ORDER BY s1.f3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@nested_tbl_1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT s1.f3, count(s1.f3.f4) FROM nested_tbl_1 GROUP BY s1.f3 ORDER BY s1.f3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@nested_tbl_1
+#### A masked pattern was here ####
+{"f4":4,"f5":5.0}	1
+PREHOOK: query: -- Testing joins
+
+EXPLAIN SELECT t1.s1.f3.f5, t2.s2.f8
+FROM nested_tbl_1 t1 JOIN nested_tbl_2 t2
+ON t1.s1.f3.f4 = t2.s1.f6
+WHERE t2.s2.f8.f9 == FALSE
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Testing joins
+
+EXPLAIN SELECT t1.s1.f3.f5, t2.s2.f8
+FROM nested_tbl_1 t1 JOIN nested_tbl_2 t2
+ON t1.s1.f3.f4 = t2.s1.f6
+WHERE t2.s2.f8.f9 == FALSE
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: t1
+            NestedColumnPaths: s1
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: s1.f3.f4 is not null (type: boolean)
+              Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+              Reduce Output Operator
+                key expressions: s1.f3.f4 (type: int)
+                sort order: +
+                Map-reduce partition columns: s1.f3.f4 (type: int)
+                Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                value expressions: s1 (type: struct<f1:boolean,f2:string,f3:struct<f4:int,f5:double>,f6:int>)
+          TableScan
+            alias: t2
+            NestedColumnPaths: s1.f6, s2
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (s1.f6 is not null and (s2.f8.f9 = false)) (type: boolean)
+              Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+              Reduce Output Operator
+                key expressions: s1.f6 (type: int)
+                sort order: +
+                Map-reduce partition columns: s1.f6 (type: int)
+                Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                value expressions: s2 (type: struct<f7:string,f8:struct<f9:boolean,f10:array<int>,f11:map<string,boolean>>>)
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Inner Join 0 to 1
+          keys:
+            0 s1.f3.f4 (type: int)
+            1 s1.f6 (type: int)
+          outputColumnNames: _col1, _col9
+          Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+          Select Operator
+            expressions: _col1.f3.f5 (type: double), _col9.f8 (type: struct<f9:boolean,f10:array<int>,f11:map<string,boolean>>)
+            outputColumnNames: _col0, _col1
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT t1.s1.f3.f5, t2.s2.f8
+FROM nested_tbl_1 t1 JOIN nested_tbl_2 t2
+ON t1.s1.f3.f4 = t2.s1.f6
+WHERE t2.s2.f8.f9 == FALSE
+PREHOOK: type: QUERY
+PREHOOK: Input: default@nested_tbl_1
+PREHOOK: Input: default@nested_tbl_2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT t1.s1.f3.f5, t2.s2.f8
+FROM nested_tbl_1 t1 JOIN nested_tbl_2 t2
+ON t1.s1.f3.f4 = t2.s1.f6
+WHERE t2.s2.f8.f9 == FALSE
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@nested_tbl_1
+POSTHOOK: Input: default@nested_tbl_2
+#### A masked pattern was here ####
+5.0	{"f9":false,"f10":[20,22],"f11":{"key3":true,"key4":false}}
+PREHOOK: query: EXPLAIN SELECT t1.s1.f3.f5, t2.s2.f8
+FROM nested_tbl_1 t1 JOIN nested_tbl_1 t2
+ON t1.s1.f3.f4 = t2.s1.f6
+WHERE t2.s2.f8.f9 == TRUE
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT t1.s1.f3.f5, t2.s2.f8
+FROM nested_tbl_1 t1 JOIN nested_tbl_1 t2
+ON t1.s1.f3.f4 = t2.s1.f6
+WHERE t2.s2.f8.f9 == TRUE
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: t1
+            NestedColumnPaths: s1
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: s1.f3.f4 is not null (type: boolean)
+              Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+              Reduce Output Operator
+                key expressions: s1.f3.f4 (type: int)
+                sort order: +
+                Map-reduce partition columns: s1.f3.f4 (type: int)
+                Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                value expressions: s1 (type: struct<f1:boolean,f2:string,f3:struct<f4:int,f5:double>,f6:int>)
+          TableScan
+            alias: t2
+            NestedColumnPaths: s1.f6, s2
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (s1.f6 is not null and (s2.f8.f9 = true)) (type: boolean)
+              Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+              Reduce Output Operator
+                key expressions: s1.f6 (type: int)
+                sort order: +
+                Map-reduce partition columns: s1.f6 (type: int)
+                Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                value expressions: s2 (type: struct<f7:string,f8:struct<f9:boolean,f10:array<int>,f11:map<string,boolean>>>)
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Inner Join 0 to 1
+          keys:
+            0 s1.f3.f4 (type: int)
+            1 s1.f6 (type: int)
+          outputColumnNames: _col1, _col9
+          Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+          Select Operator
+            expressions: _col1.f3.f5 (type: double), _col9.f8 (type: struct<f9:boolean,f10:array<int>,f11:map<string,boolean>>)
+            outputColumnNames: _col0, _col1
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT t1.s1.f3.f5, t2.s2.f8
+FROM nested_tbl_1 t1 JOIN nested_tbl_1 t2
+ON t1.s1.f3.f4 = t2.s1.f6
+WHERE t2.s2.f8.f9 == TRUE
+PREHOOK: type: QUERY
+PREHOOK: Input: default@nested_tbl_1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT t1.s1.f3.f5, t2.s2.f8
+FROM nested_tbl_1 t1 JOIN nested_tbl_1 t2
+ON t1.s1.f3.f4 = t2.s1.f6
+WHERE t2.s2.f8.f9 == TRUE
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@nested_tbl_1
+#### A masked pattern was here ####
+5.0	{"f9":true,"f10":[10,11],"f11":{"key1":true,"key2":false}}
diff --git serde/src/java/org/apache/hadoop/hive/serde2/ColumnProjectionUtils.java serde/src/java/org/apache/hadoop/hive/serde2/ColumnProjectionUtils.java
index 3978a15..1354680 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/ColumnProjectionUtils.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/ColumnProjectionUtils.java
@@ -19,7 +19,10 @@
 package org.apache.hadoop.hive.serde2;
 
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.util.StringUtils;
@@ -140,8 +143,6 @@ public static void appendNestedColumnPaths(
       newConfStr = newConfStr + StringUtils.COMMA_STR + old;
     }
     setReadNestedColumnPathConf(conf, newConfStr);
-    // Set READ_ALL_COLUMNS to false
-    conf.setBoolean(READ_ALL_COLUMNS, false);
   }
 
@@ -194,18 +195,10 @@ public static void appendReadColumns(
     return result;
   }
 
-  public static List<String> getNestedColumnPaths(Configuration conf) {
+  public static Set<String> getNestedColumnPaths(Configuration conf) {
     String skips = conf.get(READ_NESTED_COLUMN_PATH_CONF_STR,
         READ_NESTED_COLUMN_PATH_CONF_STR_DEFAULT);
-    String[] list = StringUtils.split(skips);
-    List<String> result = new ArrayList<>(list.length);
-    for (String element : list) {
-      // it may contain duplicates, remove duplicates
-      if (!result.contains(element)) {
-        result.add(element);
-      }
-    }
-    return result;
+    return new HashSet<>(Arrays.asList(StringUtils.split(skips)));
   }
 
   public static String[] getReadColumnNames(Configuration conf) {
@@ -227,6 +220,7 @@ private static void setReadColumnIDConf(Configuration conf, String id) {
 
   private static void setReadNestedColumnPathConf(
       Configuration conf, String nestedColumnPaths) {
+    nestedColumnPaths = nestedColumnPaths.toLowerCase();
     if (nestedColumnPaths.trim().isEmpty()) {
       conf.set(READ_NESTED_COLUMN_PATH_CONF_STR, READ_NESTED_COLUMN_PATH_CONF_STR_DEFAULT);
     } else {
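
Note (not part of the patch): a minimal usage sketch of how the revised ColumnProjectionUtils API is expected to behave after this change. It assumes appendNestedColumnPaths keeps the (Configuration, List<String>) signature it is called with from MapOperator; the class and variable names below are illustrative only.

    import java.util.Arrays;
    import java.util.Set;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

    public class NestedColumnPathsSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Append paths in two calls; the property value is lower-cased when stored
        // under READ_NESTED_COLUMN_PATH_CONF_STR.
        ColumnProjectionUtils.appendNestedColumnPaths(conf, Arrays.asList("s1.f3.f4", "S1.F3.F4"));
        ColumnProjectionUtils.appendNestedColumnPaths(conf, Arrays.asList("s2.f8"));
        // The Set return type collapses the case-normalized duplicate.
        Set<String> paths = ColumnProjectionUtils.getNestedColumnPaths(conf);
        System.out.println(paths); // expected: [s1.f3.f4, s2.f8]
        // Unlike before this patch, appending nested paths no longer flips
        // READ_ALL_COLUMNS to false; callers that want pruning set it explicitly.
      }
    }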