diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
index 58ed550..aea1c77 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
@@ -254,6 +254,9 @@ static public String getOperatorName() {
   List<Integer> neededColumnIDs;
   List<String> neededColumns;

+  // all column names referenced, used in ColumnAccessAnalyzer
+  transient List<String> referencedColumns;
+
   public void setNeededColumnIDs(List<Integer> orign_columns) {
     neededColumnIDs = orign_columns;
   }
@@ -270,6 +273,14 @@ public void setNeededColumns(List<String> columnNames) {
     return neededColumns;
   }

+  public void setReferencedColumns(List<String> referencedColumns) {
+    this.referencedColumns = referencedColumns;
+  }
+
+  public List<String> getReferencedColumns() {
+    return referencedColumns;
+  }
+
   @Override
   public OperatorType getType() {
     return OperatorType.TABLESCAN;
   }
@@ -335,6 +346,7 @@ public boolean supportAutomaticSortMergeJoin() {
     TableScanOperator ts = (TableScanOperator) super.clone();
     ts.setNeededColumnIDs(new ArrayList<Integer>(getNeededColumnIDs()));
     ts.setNeededColumns(new ArrayList<String>(getNeededColumns()));
+    ts.setReferencedColumns(new ArrayList<String>(getReferencedColumns()));
     return ts;
   }
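Note: `neededColumns`/`neededColumnIDs` describe only what the reader must fetch from storage, so partition columns (which the column pruner skips via the virtual-column check) never appear there; the new `referencedColumns` list records every column the query actually mentions, which is what column-access auditing needs. A minimal standalone sketch of the distinction — toy code, not Hive's, all names illustrative:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    public class ReferencedVsNeededDemo {
      public static void main(String[] args) {
        // SELECT key, ds ... on a table partitioned by (ds, hr)
        List<String> queryColumns = Arrays.asList("key", "ds");
        List<String> partitionColumns = Arrays.asList("ds", "hr");

        List<String> needed = new ArrayList<String>();      // handed to the file reader
        List<String> referenced = new ArrayList<String>();  // handed to the access analyzer
        for (String col : queryColumns) {
          referenced.add(col);
          if (!partitionColumns.contains(col)) {
            needed.add(col);  // partition columns are never read from the data files
          }
        }
        System.out.println("needed = " + needed);         // [key]
        System.out.println("referenced = " + referenced); // [key, ds]
      }
    }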
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java
index 6a4dc9b..c5a8b90 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java
@@ -81,6 +81,20 @@ private ColumnPrunerProcFactory() {
     // prevent instantiation
   }

+  private static RowResolver buildPrunedRR(List<String> prunedCols,
+      RowResolver oldRR, ArrayList<ColumnInfo> sig) throws SemanticException{
+    RowResolver newRR = new RowResolver();
+    HashSet<String> prunedColsSet = new HashSet<String>(prunedCols);
+    for(ColumnInfo cInfo : oldRR.getRowSchema().getSignature()) {
+      if ( prunedColsSet.contains(cInfo.getInternalName())) {
+        String[] nm = oldRR.reverseLookup(cInfo.getInternalName());
+        newRR.put(nm[0], nm[1], cInfo);
+        sig.add(cInfo);
+      }
+    }
+    return newRR;
+  }
+
   /**
    * Node Processor for Column Pruning on Filter Operators.
    */
@@ -196,20 +210,6 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
     return null;
   }

-  private static RowResolver buildPrunedRR(List<String> prunedCols,
-      RowResolver oldRR, ArrayList<ColumnInfo> sig) throws SemanticException{
-    RowResolver newRR = new RowResolver();
-    HashSet<String> prunedColsSet = new HashSet<String>(prunedCols);
-    for(ColumnInfo cInfo : oldRR.getRowSchema().getSignature()) {
-      if ( prunedColsSet.contains(cInfo.getInternalName())) {
-        String[] nm = oldRR.reverseLookup(cInfo.getInternalName());
-        newRR.put(nm[0], nm[1], cInfo);
-        sig.add(cInfo);
-      }
-    }
-    return newRR;
-  }
-
   /*
    * add any input columns referenced in WindowFn args or expressions.
    */
@@ -312,6 +312,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
         cols);
     List<Integer> neededColumnIds = new ArrayList<Integer>();
     List<String> neededColumnNames = new ArrayList<String>();
+    List<String> referencedColumnNames = new ArrayList<String>();
     RowResolver inputRR = cppCtx.getOpToParseCtxMap().get(scanOp).getRowResolver();
     TableScanDesc desc = scanOp.getConf();
     List<VirtualColumn> virtualCols = desc.getVirtualCols();
@@ -322,11 +323,12 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
       cols.add(VirtualColumn.RAWDATASIZE.getName());
     }

-    for (int i = 0; i < cols.size(); i++) {
-      String[] tabCol = inputRR.reverseLookup(cols.get(i));
-      if(tabCol == null) {
+    for (String column : cols) {
+      String[] tabCol = inputRR.reverseLookup(column);
+      if (tabCol == null) {
        continue;
      }
+      referencedColumnNames.add(column);
       ColumnInfo colInfo = inputRR.get(tabCol[0], tabCol[1]);
       if (colInfo.getIsVirtualCol()) {
         // part is also a virtual column, but part col should not in this
@@ -340,17 +342,18 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
         //no need to pass virtual columns to reader.
         continue;
       }
-      int position = inputRR.getPosition(cols.get(i));
+      int position = inputRR.getPosition(column);
       if (position >= 0) {
         // get the needed columns by id and name
         neededColumnIds.add(position);
-        neededColumnNames.add(cols.get(i));
+        neededColumnNames.add(column);
       }
     }

     desc.setVirtualCols(newVirtualCols);
     scanOp.setNeededColumnIDs(neededColumnIds);
     scanOp.setNeededColumns(neededColumnNames);
+    scanOp.setReferencedColumns(referencedColumnNames);
     return null;
   }
 }
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java
index 8c4b891..7f574dc 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java
@@ -147,7 +147,8 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx,
             .getConfirmedPartitionsForScan(parseInfo);
         if (confirmedPartns.size() > 0) {
           Table source = parseCtx.getQB().getMetaData().getTableForAlias(alias);
-          PrunedPartitionList partList = new PrunedPartitionList(source, confirmedPartns, false);
+          List<String> partCols = GenMapRedUtils.getPartitionColumns(parseInfo);
+          PrunedPartitionList partList = new PrunedPartitionList(source, confirmedPartns, partCols, false);
           GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, currTask, false, ctx, partList);
         } else { // non-partitioned table
           GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, currTask, false, ctx);
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
index f285312..77f56c1 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
@@ -20,6 +20,7 @@

 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -927,6 +928,7 @@ public static TableScanOperator createTemporaryTableScanOperator(RowSchema rowSc
     }
     tableScanOp.setNeededColumnIDs(neededColumnIds);
     tableScanOp.setNeededColumns(neededColumnNames);
+    tableScanOp.setReferencedColumns(neededColumnNames);
     return tableScanOp;
   }
@@ -1747,6 +1749,14 @@ public static Path createMoveTask(Task<? extends Serializable> currTask, boolean
     return confirmedPartns;
   }

+  public static List<String> getPartitionColumns(QBParseInfo parseInfo) {
+    tableSpec tblSpec = parseInfo.getTableSpec();
+    if (tblSpec.tableHandle.isPartitioned()) {
+      return new ArrayList<String>(tblSpec.getPartSpec().keySet());
+    }
+    return Collections.emptyList();
+  }
+
   public static List<Path> getInputPathsForPartialScan(QBParseInfo parseInfo,
       StringBuffer aggregationKey) throws SemanticException {
     List<Path> inputPaths = new ArrayList<Path>();
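Note: the pruner records a column as referenced as soon as it resolves against the scan's RowResolver, whether or not it survives into the needed list; `getPartitionColumns` supplies the equivalent information for the ANALYZE TABLE code paths, where partition columns come from the table spec rather than a pruning expression (temporary scans simply reuse their needed names). A hypothetical standalone sketch of what `getPartitionColumns` computes:

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    public class PartitionColumnsDemo {
      // For a partitioned table the partition spec maps column name -> value,
      // e.g. {ds=2008-04-08, hr=11}, so its key set is the partition column list.
      static List<String> partitionColumns(boolean isPartitioned, Map<String, String> partSpec) {
        if (isPartitioned) {
          return new ArrayList<String>(partSpec.keySet());
        }
        return Collections.emptyList();
      }

      public static void main(String[] args) {
        Map<String, String> spec = new LinkedHashMap<String, String>();
        spec.put("ds", "2008-04-08");
        spec.put("hr", "11");
        System.out.println(partitionColumns(true, spec));  // [ds, hr]
        System.out.println(partitionColumns(false, spec)); // []
      }
    }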
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java
index 6bdf394..3c26894 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java
@@ -212,9 +212,9 @@ static private ExprNodeDesc compactExpr(ExprNodeDesc expr) {
         return isAnd ? children.get(0) : null;
       }
     }
-      return (ExprNodeGenericFuncDesc)expr;
+      return expr;
   } else {
-      throw new IllegalStateException("Unexpected type of ExprNodeDesc: " + expr.getExprString());
+    throw new IllegalStateException("Unexpected type of ExprNodeDesc: " + expr.getExprString());
   }
 }
@@ -225,18 +225,23 @@ static private ExprNodeDesc compactExpr(ExprNodeDesc expr) {
    * The expression is only used to prune by partition name, so we have no business with VCs.
    * @param expr original partition pruning expression.
    * @param partCols list of partition columns for the table.
+   * @param referred partition columns referred by expr
    * @return partition pruning expression that only contains partition columns from the list.
    */
-  static private ExprNodeDesc removeNonPartCols(ExprNodeDesc expr, List<String> partCols) {
-    if (expr instanceof ExprNodeColumnDesc
-        && !partCols.contains(((ExprNodeColumnDesc) expr).getColumn())) {
-      // Column doesn't appear to be a partition column for the table.
-      return new ExprNodeConstantDesc(expr.getTypeInfo(), null);
+  static private ExprNodeDesc removeNonPartCols(ExprNodeDesc expr, List<String> partCols,
+      Set<String> referred) {
+    if (expr instanceof ExprNodeColumnDesc) {
+      String column = ((ExprNodeColumnDesc) expr).getColumn();
+      if (!partCols.contains(column)) {
+        // Column doesn't appear to be a partition column for the table.
+        return new ExprNodeConstantDesc(expr.getTypeInfo(), null);
+      }
+      referred.add(column);
     }
     if (expr instanceof ExprNodeGenericFuncDesc) {
       List<ExprNodeDesc> children = expr.getChildren();
       for (int i = 0; i < children.size(); ++i) {
-        children.set(i, removeNonPartCols(children.get(i), partCols));
+        children.set(i, removeNonPartCols(children.get(i), partCols, referred));
       }
     }
     return expr;
@@ -266,7 +271,7 @@ private static PrunedPartitionList getPartitionsFromServer(Table tab,
     try {
       if (!tab.isPartitioned()) {
         // If the table is not partitioned, return everything.
-        return new PrunedPartitionList(tab, getAllPartitions(tab), false);
+        return new PrunedPartitionList(tab, getAllPartitions(tab), null, false);
       }
       LOG.debug("tabname = " + tab.getTableName() + " is partitioned");
@@ -279,18 +284,19 @@
       if (prunerExpr == null) {
         // Non-strict mode, and there is no predicates at all - get everything.
-        return new PrunedPartitionList(tab, getAllPartitions(tab), false);
+        return new PrunedPartitionList(tab, getAllPartitions(tab), null, false);
       }

+      Set<String> referred = new LinkedHashSet<String>();
       // Replace virtual columns with nulls. See javadoc for details.
-      prunerExpr = removeNonPartCols(prunerExpr, extractPartColNames(tab));
+      prunerExpr = removeNonPartCols(prunerExpr, extractPartColNames(tab), referred);
       // Remove all parts that are not partition columns. See javadoc for details.
       ExprNodeGenericFuncDesc compactExpr = (ExprNodeGenericFuncDesc)compactExpr(prunerExpr.clone());
       String oldFilter = prunerExpr.getExprString();
       if (compactExpr == null) {
         // Non-strict mode, and all the predicates are on non-partition columns - get everything.
         LOG.debug("Filter " + oldFilter + " was null after compacting");
-        return new PrunedPartitionList(tab, getAllPartitions(tab), true);
+        return new PrunedPartitionList(tab, getAllPartitions(tab), null, true);
       }

       LOG.debug("Filter w/ compacting: " + compactExpr.getExprString()
@@ -326,6 +332,7 @@ private static PrunedPartitionList getPartitionsFromServer(Table tab,
       // metastore and so some partitions may have no data based on other filters.
       boolean isPruningByExactFilter = oldFilter.equals(compactExpr.getExprString());
       return new PrunedPartitionList(tab, new LinkedHashSet<Partition>(partitions),
+          new ArrayList<String>(referred),
           hasUnknownPartitions || !isPruningByExactFilter);
     } catch (HiveException e) {
       throw e;
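Note: `removeNonPartCols` now does double duty: while it rewrites non-partition column references to null constants (so the remaining expression can be evaluated against partition names alone), it also collects, in first-seen order, the partition columns the pruning expression actually touches; that set travels into `PrunedPartitionList` and from there to the access analyzer. A self-contained sketch of the collect-while-rewriting pattern, using a toy expression model rather than Hive's `ExprNodeDesc`:

    import java.util.Arrays;
    import java.util.LinkedHashSet;
    import java.util.List;
    import java.util.Set;

    public class RemoveNonPartColsDemo {
      interface Expr {}
      static class Column implements Expr {
        final String name;
        Column(String name) { this.name = name; }
        public String toString() { return name; }
      }
      static class Constant implements Expr {
        final String value;
        Constant(String value) { this.value = value; }
        public String toString() { return value; }
      }
      static class Func implements Expr {
        final String op;
        final List<Expr> children;
        Func(String op, Expr... children) { this.op = op; this.children = Arrays.asList(children); }
        public String toString() { return op + children; }
      }

      static Expr removeNonPartCols(Expr expr, List<String> partCols, Set<String> referred) {
        if (expr instanceof Column) {
          String name = ((Column) expr).name;
          if (!partCols.contains(name)) {
            return new Constant("null");  // cannot prune on a non-partition column
          }
          referred.add(name);             // record the partition column as referred
        }
        if (expr instanceof Func) {
          List<Expr> children = ((Func) expr).children;
          for (int i = 0; i < children.size(); ++i) {
            children.set(i, removeNonPartCols(children.get(i), partCols, referred));
          }
        }
        return expr;
      }

      public static void main(String[] args) {
        // WHERE hr = '11' AND value = 'x', on a table partitioned by (ds, hr)
        Expr pred = new Func("and",
            new Func("=", new Column("hr"), new Constant("'11'")),
            new Func("=", new Column("value"), new Constant("'x'")));
        Set<String> referred = new LinkedHashSet<String>();
        System.out.println(removeNonPartCols(pred, Arrays.asList("ds", "hr"), referred));
        System.out.println(referred); // [hr]
      }
    }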
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnAccessAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnAccessAnalyzer.java
index 74b595a..cd03c2c 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnAccessAnalyzer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnAccessAnalyzer.java
@@ -22,8 +22,8 @@

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.metadata.Table;

 public class ColumnAccessAnalyzer {
@@ -44,9 +44,23 @@ public ColumnAccessInfo analyzeColumnAccess() throws SemanticException {
     for (TableScanOperator op : topOps.keySet()) {
       Table table = topOps.get(op);
       String tableName = table.getCompleteName();
-      List<FieldSchema> tableCols = table.getCols();
-      for (int i : op.getNeededColumnIDs()) {
-        columnAccessInfo.add(tableName, tableCols.get(i).getName());
+      List<String> referenced = op.getReferencedColumns();
+      for (String column : referenced) {
+        columnAccessInfo.add(tableName, column);
+      }
+      if (table.isPartitioned()) {
+        PrunedPartitionList parts;
+        try {
+          parts = pGraphContext.getPrunedPartitions(table.getTableName(), op);
+        } catch (HiveException e) {
+          LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+          throw new SemanticException(e.getMessage(), e);
+        }
+        if (parts.getPartCols() != null) {
+          for (String partKey : parts.getPartCols()) {
+            columnAccessInfo.add(tableName, partKey);
+          }
+        }
       }
     }
     return columnAccessInfo;
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/ProcessAnalyzeTable.java ql/src/java/org/apache/hadoop/hive/ql/parse/ProcessAnalyzeTable.java
index c26be3c..9fcc1b2 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/ProcessAnalyzeTable.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/ProcessAnalyzeTable.java
@@ -28,7 +28,6 @@
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.DriverContext;
-import org.apache.hadoop.hive.ql.ErrorMsg;
 import org.apache.hadoop.hive.ql.exec.TableScanOperator;
 import org.apache.hadoop.hive.ql.exec.Task;
 import org.apache.hadoop.hive.ql.exec.TaskFactory;
@@ -40,12 +39,6 @@
 import org.apache.hadoop.hive.ql.metadata.Partition;
 import org.apache.hadoop.hive.ql.metadata.Table;
 import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
-import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;
-import org.apache.hadoop.hive.ql.parse.GenTezWork;
-import org.apache.hadoop.hive.ql.parse.ParseContext;
-import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
-import org.apache.hadoop.hive.ql.parse.QBParseInfo;
-import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.plan.MapWork;
 import org.apache.hadoop.hive.ql.plan.StatsNoJobWork;
 import org.apache.hadoop.hive.ql.plan.TezWork;
@@ -150,7 +143,8 @@ public Object process(Node nd, Stack<Node> stack,
       PrunedPartitionList partitions = null;
       if (confirmedPartns.size() > 0) {
         Table source = queryBlock.getMetaData().getTableForAlias(alias);
-        partitions = new PrunedPartitionList(source, confirmedPartns, false);
+        List<String> partCols = GenMapRedUtils.getPartitionColumns(parseInfo);
+        partitions = new PrunedPartitionList(source, confirmedPartns, partCols, false);
       }

       MapWork w = utils.createMapWork(context, tableScan, tezWork, partitions);
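Note: the analyzer previously indexed `table.getCols()` (which holds only non-partition columns) by `neededColumnIDs`, so on partitioned tables it never reported partition-column access at all, since those columns are filtered out of the needed list before the scan. It now takes referenced names directly and unions in the partition columns the pruner recorded. A sketch of the resulting aggregation, as a hypothetical standalone stand-in for ColumnAccessInfo:

    import java.util.Arrays;
    import java.util.List;
    import java.util.Map;
    import java.util.TreeMap;
    import java.util.TreeSet;

    public class ColumnAccessDemo {
      // table name -> sorted set of accessed columns
      private final Map<String, TreeSet<String>> tableToColumns =
          new TreeMap<String, TreeSet<String>>();

      void add(String table, String column) {
        TreeSet<String> cols = tableToColumns.get(table);
        if (cols == null) {
          cols = new TreeSet<String>();
          tableToColumns.put(table, cols);
        }
        cols.add(column);
      }

      public static void main(String[] args) {
        ColumnAccessDemo info = new ColumnAccessDemo();
        List<String> referenced = Arrays.asList("key", "ds"); // from the TableScanOperator
        List<String> partColsUsed = Arrays.asList("hr");      // from PrunedPartitionList.getPartCols()
        for (String c : referenced) {
          info.add("default@srcpart", c);
        }
        for (String c : partColsUsed) {
          info.add("default@srcpart", c);
        }
        System.out.println(info.tableToColumns); // {default@srcpart=[ds, hr, key]}
      }
    }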
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/PrunedPartitionList.java ql/src/java/org/apache/hadoop/hive/ql/parse/PrunedPartitionList.java
index d3268dd..23783ce 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/PrunedPartitionList.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/PrunedPartitionList.java
@@ -36,11 +36,16 @@
   /** Partitions that either satisfy the partition criteria, or may satisfy it. */
   private Set<Partition> partitions;

+  /** partition columns referred by pruner expr */
+  private List<String> partCols;
+
   /** Whether there are partitions in the list that may or may not satisfy the criteria. */
   private boolean hasUnknowns;

-  public PrunedPartitionList(Table source, Set<Partition> partitions, boolean hasUnknowns) {
+  public PrunedPartitionList(Table source, Set<Partition> partitions, List<String> partCols,
+      boolean hasUnknowns) {
     this.source = source;
+    this.partCols = partCols;
     this.partitions = partitions;
     this.hasUnknowns = hasUnknowns;
   }
@@ -70,4 +75,8 @@ public Table getSourceTable() {
   public boolean hasUnknownPartitions() {
     return hasUnknowns;
   }
+
+  public List<String> getPartCols() {
+    return partCols;
+  }
 }
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java
index a7cec5d..a4ba4bd 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java
@@ -531,7 +531,7 @@ public tableSpec getTableSpec(String tName) {
   }

   /**
-   * This method is used only for the anlayze command to get the partition specs
+   * This method is used only for the analyze command to get the partition specs
    */

   public tableSpec getTableSpec() {
diff --git ql/src/test/queries/clientpositive/column_access_stats.q ql/src/test/queries/clientpositive/column_access_stats.q
index fbf8bba..4f43403 100644
--- ql/src/test/queries/clientpositive/column_access_stats.q
+++ ql/src/test/queries/clientpositive/column_access_stats.q
@@ -160,3 +160,8 @@ FROM
 JOIN T3 ON T3.key = T4.key
 ORDER BY T3.key, T4.key;
+
+-- for partitioned table
+SELECT * FROM srcpart TABLESAMPLE (10 ROWS);
+SELECT key,ds FROM srcpart TABLESAMPLE (10 ROWS) WHERE hr='11';
+SELECT value FROM srcpart TABLESAMPLE (10 ROWS) WHERE ds='2008-04-08';
diff --git ql/src/test/results/clientpositive/column_access_stats.q.out ql/src/test/results/clientpositive/column_access_stats.q.out
index 7eee4ba..48a7418 100644
--- ql/src/test/results/clientpositive/column_access_stats.q.out
+++ ql/src/test/results/clientpositive/column_access_stats.q.out
@@ -58,21 +58,21 @@ PREHOOK: type: QUERY
 PREHOOK: Input: default@t4
 #### A masked pattern was here ####
 Table:default@t4
-Columns:key,val
+Columns:key,p,val

 PREHOOK: query: SELECT val FROM T4 where p=1
 PREHOOK: type: QUERY
 PREHOOK: Input: default@t4
 #### A masked pattern was here ####
 Table:default@t4
-Columns:val
+Columns:p,val

 PREHOOK: query: SELECT p, val FROM T4 where p=1
 PREHOOK: type: QUERY
 PREHOOK: Input: default@t4
 #### A masked pattern was here ####
 Table:default@t4
-Columns:val
+Columns:p,val

 PREHOOK: query: -- More complicated select queries
 EXPLAIN SELECT key FROM (SELECT key, val FROM T1) subq1 ORDER BY key
@@ -934,3 +934,63 @@ Columns:key,val
 7 7 17.0
 8 8 46.0
 8 8 46.0
+PREHOOK: query: -- for partitioned table
+SELECT * FROM srcpart TABLESAMPLE (10 ROWS)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+#### A masked pattern was here ####
+Table:default@srcpart
+Columns:ds,hr,key,value
+
+238 val_238 2008-04-08 11
+86 val_86 2008-04-08 11
+311 val_311 2008-04-08 11
+27 val_27 2008-04-08 11
+165 val_165 2008-04-08 11
+409 val_409 2008-04-08 11
+255 val_255 2008-04-08 11
+278 val_278 2008-04-08 11
+98 val_98 2008-04-08 11
+484 val_484 2008-04-08 11
+PREHOOK: query: SELECT key,ds FROM srcpart TABLESAMPLE (10 ROWS) WHERE hr='11'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+#### A masked pattern was here ####
+Table:default@srcpart
+Columns:ds,hr,key
+
+238 2008-04-08
+86 2008-04-08
+311 2008-04-08
+27 2008-04-08
+165 2008-04-08
+409 2008-04-08
+255 2008-04-08
+278 2008-04-08
+98 2008-04-08
+484 2008-04-08
+PREHOOK: query: SELECT value FROM srcpart TABLESAMPLE (10 ROWS) WHERE ds='2008-04-08'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+#### A masked pattern was here ####
+Table:default@srcpart
+Columns:ds,value
+
+val_238
+val_86
+val_311
+val_27
+val_165
+val_409
+val_255
+val_278
+val_98
+val_484
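Note: the new test cases exercise exactly the fixed behavior. `SELECT *` reports all four columns (ds, hr, key, value); `SELECT key,ds ... WHERE hr='11'` reports ds and key as referenced plus hr from the pruning predicate; and `SELECT value ... WHERE ds='2008-04-08'` reports value plus ds — even though neither partition column is ever read from the data files.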