diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java
index 5d756c5..928afe6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java
@@ -11,12 +11,11 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.ql.exec.ColumnInfo;
-import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.metadata.Table;
 import org.apache.hadoop.hive.ql.plan.ColStatistics;
-import org.apache.hadoop.hive.ql.plan.Statistics;
 import org.apache.hadoop.hive.ql.stats.StatsUtils;
 import org.eigenbase.rel.RelNode;
 import org.eigenbase.rel.TableAccessRel;
@@ -24,6 +23,10 @@
 import org.eigenbase.relopt.RelOptSchema;
 import org.eigenbase.reltype.RelDataType;
 
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableMap.Builder;
+
 /*
  * Fix Me:
  * 1. Column Pruning
@@ -32,106 +35,190 @@
  */
 public class RelOptHiveTable extends RelOptAbstractTable {
-  private final Table m_hiveTblMetadata;
-  private double m_rowCount = -1;
-
-  final Map<Integer, Double> m_columnIdxToSizeMap = new HashMap<Integer, Double>();
-
-  Map m_bucketingColMap;
-  Map m_bucketingSortColMap;
-
-  Statistics m_hiveStats;
-  List<ColStatistics> m_hiveColStats = new ArrayList<ColStatistics>();
+  private final Table m_hiveTblMetadata;
+  private double m_rowCount = -1;
+  private final ImmutableList<ColumnInfo> m_hiveNonPartitionCols;
+  private final ImmutableMap<Integer, ColumnInfo> m_hiveNonPartitionColsMap;
+  private final ImmutableMap<Integer, ColumnInfo> m_hivePartitionColsMap;
+  Map<Integer, ColStatistics> m_hiveColStatsMap = new HashMap<Integer, ColStatistics>();
+  private Integer m_numPartitions;
+  private final int m_noOfProjs;
 
   protected static final Log LOG = LogFactory.getLog(RelOptHiveTable.class
       .getName());
 
-  // NOTE: name here is the table alias which may or may not be the real name in
-  // metadata. Use
-  // m_hiveTblMetadata.getTableName() for table name and
-  // m_hiveTblMetadata.getDbName() for db name.
-  public RelOptHiveTable(RelOptSchema schema, String name, RelDataType rowType,
-      Table hiveTblMetadata, Statistics stats) {
-    super(schema, name, rowType);
-    m_hiveTblMetadata = hiveTblMetadata;
-  }
-
-  public RelOptHiveTable(RelOptSchema optiqSchema, String name, RelDataType rowType,
-      Table hiveTblMetadata, List<ColumnInfo> hiveSchema) {
-    super(optiqSchema, name, rowType);
-    m_hiveTblMetadata = hiveTblMetadata;
-
-    List<String> neededColumns = new ArrayList<String>();
-    for (ColumnInfo ci : hiveSchema) {
-      neededColumns.add(ci.getInternalName());
-    }
-
-    //TODO: Fix below two stats
-    m_hiveColStats = StatsUtils.getTableColumnStats(m_hiveTblMetadata, hiveSchema, neededColumns);
-    m_rowCount = StatsUtils.getNumRows(m_hiveTblMetadata);
-  }
-
-  @Override
-  public boolean isKey(BitSet arg0) {
-    return false;
-  }
-
-  @Override
-  public RelNode toRel(ToRelContext context) {
-    return new TableAccessRel(context.getCluster(), this);
-  }
-
-  @Override
-  public <T> T unwrap(Class<T> arg0) {
-    return arg0.isInstance(this) ? arg0.cast(this) : null;
-  }
-
-  @Override
-  public double getRowCount() {
-    return m_rowCount;
-  }
-
-  public Table getHiveTableMD() {
-    return m_hiveTblMetadata;
-  }
-
-  public Statistics getHiveStats() {
-    return m_hiveStats;
-  }
-
-  private String getColNameList(Set<Integer> colLst) {
-    StringBuffer sb = new StringBuffer();
-    List<FieldSchema> schema = m_hiveTblMetadata.getAllCols();
-    for (Integer i : colLst) {
-      String colName = (i < schema.size()) ? m_hiveTblMetadata.getAllCols().get(i).getName() : "";
-      if (i == 0)
-        sb.append(colName);
-      else
-        sb.append(", " + colName);
-    }
-    return sb.toString();
-  }
-
-  public List<ColStatistics> getColStat(List<Integer> projIndxLst) {
-    if (projIndxLst != null) {
-      Set<Integer> colsWithoutStats = new HashSet<Integer>();
-      List<ColStatistics> hiveColStatLst = new LinkedList<ColStatistics>();
-      for (Integer i : projIndxLst) {
-        if (i >= m_hiveColStats.size())
-          colsWithoutStats.add(i);
-        else
-          hiveColStatLst.add(m_hiveColStats.get(i));
-      }
-      if (!colsWithoutStats.isEmpty()) {
-        String logMsg = "No Stats for DB@Table " + m_hiveTblMetadata.getCompleteName()
-            + ", Columns: " + getColNameList(colsWithoutStats);
-        LOG.error(logMsg);
-        throw new RuntimeException(logMsg);
-      }
-
-      return hiveColStatLst;
-    } else {
-      return m_hiveColStats;
-    }
-  }
+  public RelOptHiveTable(RelOptSchema optiqSchema, String name,
+      RelDataType rowType, Table hiveTblMetadata,
+      List<ColumnInfo> hiveNonPartitionCols,
+      List<ColumnInfo> hivePartitionCols) {
+    super(optiqSchema, name, rowType);
+    m_hiveTblMetadata = hiveTblMetadata;
+    m_hiveNonPartitionCols = ImmutableList.copyOf(hiveNonPartitionCols);
+    m_hiveNonPartitionColsMap = getColInfoMap(hiveNonPartitionCols, 0);
+    m_hivePartitionColsMap = getColInfoMap(hivePartitionCols,
+        m_hiveNonPartitionColsMap.size());
+    m_noOfProjs = hiveNonPartitionCols.size() + hivePartitionCols.size();
+  }
+
+  private static ImmutableMap<Integer, ColumnInfo> getColInfoMap(
+      List<ColumnInfo> hiveCols, int startIndx) {
+    Builder<Integer, ColumnInfo> bldr = ImmutableMap
+        .<Integer, ColumnInfo> builder();
+
+    int indx = startIndx;
+    for (ColumnInfo ci : hiveCols) {
+      bldr.put(indx, ci);
+      indx++;
+    }
+
+    return bldr.build();
+  }
+
+  @Override
+  public boolean isKey(BitSet arg0) {
+    return false;
+  }
+
+  @Override
+  public RelNode toRel(ToRelContext context) {
+    return new TableAccessRel(context.getCluster(), this);
+  }
+
+  @Override
+  public <T> T unwrap(Class<T> arg0) {
+    return arg0.isInstance(this) ? arg0.cast(this) : null;
+  }
+
+  @Override
+  public double getRowCount() {
+    if (m_rowCount == -1)
+      m_rowCount = StatsUtils.getNumRows(m_hiveTblMetadata);
+
+    return m_rowCount;
+  }
+
+  public Table getHiveTableMD() {
+    return m_hiveTblMetadata;
+  }
+
+  private String getColNamesForLogging(Set<String> colLst) {
+    StringBuffer sb = new StringBuffer();
+    boolean firstEntry = true;
+    for (String colName : colLst) {
+      if (firstEntry) {
+        sb.append(colName);
+        firstEntry = false;
+      } else {
+        sb.append(", " + colName);
+      }
+    }
+    return sb.toString();
+  }
+
+  private void updateColStats(Set<Integer> projIndxLst) {
+    List<String> nonPartColNamesThatRqrStats = new ArrayList<String>();
+    List<Integer> nonPartColIndxsThatRqrStats = new ArrayList<Integer>();
+    List<String> partColNamesThatRqrStats = new ArrayList<String>();
+    List<Integer> partColIndxsThatRqrStats = new ArrayList<Integer>();
+    Set<String> colNamesFailedStats = new HashSet<String>();
+
+    // 1. Separate required columns into Non Partition and Partition Cols
+    ColumnInfo tmp;
+    for (Integer pi : projIndxLst) {
+      if (m_hiveColStatsMap.get(pi) == null) {
+        if ((tmp = m_hiveNonPartitionColsMap.get(pi)) != null) {
+          nonPartColNamesThatRqrStats.add(tmp.getInternalName());
+          nonPartColIndxsThatRqrStats.add(pi);
+        } else if ((tmp = m_hivePartitionColsMap.get(pi)) != null) {
+          partColNamesThatRqrStats.add(tmp.getInternalName());
+          partColIndxsThatRqrStats.add(pi);
+        } else {
+          String logMsg = "Unable to find Column Index: " + pi + ", in "
+              + m_hiveTblMetadata.getCompleteName();
+          LOG.error(logMsg);
+          throw new RuntimeException(logMsg);
+        }
+      }
+    }
+
+    // 2. Obtain Col Stats for Non Partition Cols
+    if (nonPartColNamesThatRqrStats.size() > 0) {
+      List<ColStatistics> colStats = StatsUtils.getTableColumnStats(
+          m_hiveTblMetadata, m_hiveNonPartitionCols,
+          nonPartColNamesThatRqrStats);
+      if (colStats != null
+          && colStats.size() == nonPartColNamesThatRqrStats.size()) {
+        for (int i = 0; i < colStats.size(); i++) {
+          m_hiveColStatsMap.put(nonPartColIndxsThatRqrStats.get(i),
+              colStats.get(i));
+        }
+      } else {
+        // TODO: colNamesFailedStats is designed to be used for both non
+        // partitioned & partitioned cols; currently only used for non
+        // partitioned cols.
+        colNamesFailedStats.addAll(nonPartColNamesThatRqrStats);
+      }
+    }
+
+    // 3. Obtain Stats for Partition Cols
+    // TODO: Fix this as part of Partition Pruning
+    if (!partColNamesThatRqrStats.isEmpty()) {
+      if (m_numPartitions == null) {
+        try {
+          m_numPartitions = Hive
+              .get()
+              .getPartitionNames(m_hiveTblMetadata.getDbName(),
+                  m_hiveTblMetadata.getTableName(), (short) -1).size();
+        } catch (HiveException e) {
+          String logMsg = "Could not get stats, number of Partitions for "
+              + m_hiveTblMetadata.getCompleteName();
+          LOG.error(logMsg);
+          throw new RuntimeException(logMsg);
+        }
+      }
+
+      ColStatistics cStats = null;
+      for (int i = 0; i < partColNamesThatRqrStats.size(); i++) {
+        cStats = new ColStatistics(m_hiveTblMetadata.getTableName(),
+            partColNamesThatRqrStats.get(i), m_hivePartitionColsMap
+                .get(partColIndxsThatRqrStats.get(i))
+                .getTypeName());
+        cStats.setCountDistint(m_numPartitions);
+
+        m_hiveColStatsMap.put(partColIndxsThatRqrStats.get(i), cStats);
+      }
+    }
+
+    // 4. Fail if we could not get stats for required columns
+    if (!colNamesFailedStats.isEmpty()) {
+      String logMsg = "No Stats for " + m_hiveTblMetadata.getCompleteName()
+          + ", Columns: " + getColNamesForLogging(colNamesFailedStats);
+      LOG.error(logMsg);
+      throw new RuntimeException(logMsg);
+    }
+  }
+
+  public List<ColStatistics> getColStat(List<Integer> projIndxLst) {
+    List<ColStatistics> hiveColStatLst = new LinkedList<ColStatistics>();
+
+    if (projIndxLst != null) {
+      updateColStats(new HashSet<Integer>(projIndxLst));
+      for (Integer i : projIndxLst) {
+        hiveColStatLst.add(m_hiveColStatsMap.get(i));
+      }
+    } else {
+      List<Integer> pILst = new ArrayList<Integer>();
+      for (Integer i = 0; i < m_noOfProjs; i++) {
+        pILst.add(i);
+      }
+      updateColStats(new HashSet<Integer>(pILst));
+      for (Integer pi : pILst) {
+        hiveColStatLst.add(m_hiveColStatsMap.get(pi));
+      }
+    }
+
+    return hiveColStatLst;
+  }
 }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java
index 110cc5f..cf65e10 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java
@@ -627,7 +627,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
       }
       RelDataType rowType = TypeConverter.getType(ctx.cluster, rr, neededCols);
       RelOptHiveTable optTable = new RelOptHiveTable(ctx.schema, tableScanOp.getConf().getAlias(),
-          rowType, ctx.sA.getTable(tableScanOp), stats);
+          rowType, ctx.sA.getTable(tableScanOp), null, null);
       TableAccessRelBase tableRel = new HiveTableScanRel(ctx.cluster,
           ctx.cluster.traitSetOf(HiveRel.CONVENTION), optTable, rowType);
       ctx.buildColumnMap(tableScanOp, tableRel);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index cfabbf5..03f134d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -12090,8 +12090,9 @@ private RelNode genTableLogicalPlan(String tableAlias, QB qb) {
         cInfoLst.add(colInfo);
       }
       // TODO: Fix this
-      ArrayList<ColumnInfo> columnsThatNeedsStats = new ArrayList<ColumnInfo>(
+      ArrayList<ColumnInfo> nonPartitionColumns = new ArrayList<ColumnInfo>(
           cInfoLst);
+      ArrayList<ColumnInfo> partitionColumns = new ArrayList<ColumnInfo>();
 
       // 3.2 Add column info corresponding to partition columns
       for (FieldSchema part_col : tab.getPartCols()) {
@@ -12101,6 +12102,7 @@
             tableAlias, true);
         rr.put(tableAlias, colName, colInfo);
         cInfoLst.add(colInfo);
+        partitionColumns.add(colInfo);
       }
 
       // 3.3 Add column info corresponding to virtual columns
@@ -12119,7 +12121,7 @@
 
       // 4. Build RelOptAbstractTable
       RelOptHiveTable optTable = new RelOptHiveTable(m_relOptSchema,
-          tableAlias, rowType, tab, columnsThatNeedsStats);
+          tableAlias, rowType, tab, nonPartitionColumns, partitionColumns);
 
       // 5. Build Hive Table Scan Rel
       tableRel = new HiveTableScanRel(m_cluster,
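
Reviewer note on the stats plumbing above: getColStat() now defers to updateColStats(), which resolves each requested projection index against the non-partition and partition column maps (built once in the constructor, with partition columns offset by the non-partition count) and memoizes each fetched ColStatistics in m_hiveColStatsMap, so repeated calls only hit the metastore for indices not yet cached; partition columns get a synthetic ColStatistics whose distinct count is the table's partition count. The following is a minimal standalone sketch of that indexing-plus-memoization pattern, not Hive code: the class and method names are invented, String stands in for ColStatistics, and the stats fetch is stubbed out (requires Java 9+ for List.of).

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical sketch of the pattern used by RelOptHiveTable.getColStat().
public class LazyColStats {
  // Projection index -> column name; non-partition cols first, then partition cols.
  private final Map<Integer, String> nonPartCols = new HashMap<>();
  private final Map<Integer, String> partCols = new HashMap<>();
  // Projection index -> memoized "stats" (String stands in for ColStatistics).
  private final Map<Integer, String> statsCache = new HashMap<>();
  private final int noOfProjs;
  private final int numPartitions;

  public LazyColStats(List<String> nonPartitionCols, List<String> partitionCols,
      int numPartitions) {
    int indx = 0;
    for (String c : nonPartitionCols) {
      nonPartCols.put(indx++, c); // indices 0 .. n-1
    }
    for (String c : partitionCols) {
      partCols.put(indx++, c); // indices n .. n+p-1, like getColInfoMap(.., startIndx)
    }
    this.noOfProjs = indx;
    this.numPartitions = numPartitions;
  }

  // Resolve one projection index, consulting the cache first.
  private String statsFor(Integer pi) {
    return statsCache.computeIfAbsent(pi, i -> {
      if (nonPartCols.containsKey(i)) {
        // Step 2 in the patch: fetch table-level column stats (stubbed here).
        return "colStats(" + nonPartCols.get(i) + ")";
      } else if (partCols.containsKey(i)) {
        // Step 3 in the patch: NDV of a partition column ~ number of partitions.
        return "ndv(" + partCols.get(i) + ")=" + numPartitions;
      }
      throw new RuntimeException("Unable to find Column Index: " + i);
    });
  }

  // A null projection list means "all projections", mirroring getColStat().
  public List<String> getColStat(List<Integer> projIndxLst) {
    if (projIndxLst == null) {
      projIndxLst = new ArrayList<>();
      for (int i = 0; i < noOfProjs; i++) {
        projIndxLst.add(i);
      }
    }
    List<String> result = new ArrayList<>();
    for (Integer i : projIndxLst) {
      result.add(statsFor(i));
    }
    return result;
  }

  public static void main(String[] args) {
    LazyColStats t = new LazyColStats(List.of("id", "name"), List.of("ds"), 365);
    System.out.println(t.getColStat(List.of(2))); // partition col: [ndv(ds)=365]
    System.out.println(t.getColStat(null));       // stats for all three indices
  }
}

One design consequence worth noting in review: because stats are cached per projection index and fetched only for cache misses, a query that first prunes columns and later requests the full projection list pays only for the columns not already resolved.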