Index: jdbc/src/java/org/apache/hadoop/hive/jdbc/HiveDatabaseMetaData.java =================================================================== --- jdbc/src/java/org/apache/hadoop/hive/jdbc/HiveDatabaseMetaData.java (revision 1187112) +++ jdbc/src/java/org/apache/hadoop/hive/jdbc/HiveDatabaseMetaData.java (working copy) @@ -106,7 +106,7 @@ public ResultSet getCatalogs() throws SQLException { try { // TODO a client call to get the schema's after HIVE-675 is implemented - final List catalogs = new ArrayList(); + final List catalogs = new ArrayList(); catalogs.add("default"); return new HiveMetaDataResultSet(Arrays.asList("TABLE_CAT") , Arrays.asList("STRING") @@ -571,7 +571,7 @@ public ResultSet getTables(String catalog, String schemaPattern, String tableNamePattern, String[] types) throws SQLException { final List tablesstr; - final List resultTables = new ArrayList(); + final List resultTables = new ArrayList(); final String resultCatalog; if (catalog==null) { // On jdbc the default catalog is null but on hive it's "default" resultCatalog = "default"; Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java =================================================================== --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1187112) +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy) @@ -377,6 +377,7 @@ HIVEOPTPPD("hive.optimize.ppd", true), // predicate pushdown HIVEPPDRECOGNIZETRANSITIVITY("hive.ppd.recognizetransivity", true), // predicate pushdown HIVEPPDREMOVEDUPLICATEFILTERS("hive.ppd.remove.duplicatefilters", true), + HIVEMETADATAONLYQUERIES("hive.optimize.metadataonly", true), // push predicates down to storage handlers HIVEOPTPPD_STORAGE("hive.optimize.ppd.storage", true), HIVEOPTGROUPBY("hive.optimize.groupby", true), // optimize group by Index: serde/src/java/org/apache/hadoop/hive/serde2/NullStructSerDe.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/NullStructSerDe.java (revision 0) +++ serde/src/java/org/apache/hadoop/hive/serde2/NullStructSerDe.java (revision 0) @@ -0,0 +1,106 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.serde2; + +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Writable; + +/** + * Placeholder SerDe for cases where neither serialization nor deserialization is needed + * + */ +public class NullStructSerDe implements SerDe { + + class NullStructField implements StructField { + @Override + public String getFieldName() { + return null; + } + + @Override + public ObjectInspector getFieldObjectInspector() { + return null; + } + + @Override + public String getFieldComment() { + return ""; + } + } + + @Override + public Object deserialize(Writable blob) throws SerDeException { + return null; + } + + @Override + public ObjectInspector getObjectInspector() throws SerDeException { + return new StructObjectInspector() { + public String getTypeName() { + return "null"; + } + public Category getCategory() { + return Category.PRIMITIVE; + } + @Override + public StructField getStructFieldRef(String fieldName) { + return null; + } + @Override + public List getAllStructFieldRefs() { + return new ArrayList(); + } + @Override + public Object getStructFieldData(Object data, StructField fieldRef) { + return null; + } + @Override + public List getStructFieldsDataAsList(Object data) { + return new ArrayList(); + } + }; + } + + @Override + public SerDeStats getSerDeStats() { + return null; + } + + @Override + public void initialize(Configuration conf, Properties tbl) throws SerDeException { + } + + @Override + public Class getSerializedClass() { + return NullWritable.class; + } + + @Override + public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException { + return NullWritable.get(); + } + +} Index: ql/src/test/results/clientpositive/metadataonly1.q.out =================================================================== --- ql/src/test/results/clientpositive/metadataonly1.q.out (revision 0) +++ ql/src/test/results/clientpositive/metadataonly1.q.out (revision 0) @@ -0,0 +1,215 @@ +PREHOOK: query: CREATE TABLE TEST1(A INT, B DOUBLE) partitioned by (ds string) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE TEST1(A INT, B DOUBLE) partitioned by (ds string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@TEST1 +PREHOOK: query: explain extended select max(ds) from TEST1 +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select max(ds) from TEST1 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME TEST1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION max (TOK_TABLE_OR_COL ds)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + test1 + TableScan + alias: test1 + GatherStats: false + Select Operator + expressions: + expr: ds + type: string + outputColumnNames: ds + Group By Operator + aggregations: + expr: max(ds) + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: string + Needs Tagging: false + Reduce Operator Tree: + Group By Operator + aggregations: + expr: max(VALUE._col0) + 
bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + directory: file:/var/folders/bZ/bZe+iKfoFTuPoShRd6dy6-tOU9Y/-Tmp-/njain/hive_2011-10-19_14-07-20_855_8153771721844657389/-ext-10001 + NumFilesPerFileSink: 1 + Stats Publishing Key Prefix: file:/var/folders/bZ/bZe+iKfoFTuPoShRd6dy6-tOU9Y/-Tmp-/njain/hive_2011-10-19_14-07-20_855_8153771721844657389/-ext-10001/ + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types string + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select max(ds) from TEST1 +PREHOOK: type: QUERY +PREHOOK: Output: file:/var/folders/bZ/bZe+iKfoFTuPoShRd6dy6-tOU9Y/-Tmp-/njain/hive_2011-10-19_14-07-21_162_1298856526851958828/-mr-10000 +POSTHOOK: query: select max(ds) from TEST1 +POSTHOOK: type: QUERY +POSTHOOK: Output: file:/var/folders/bZ/bZe+iKfoFTuPoShRd6dy6-tOU9Y/-Tmp-/njain/hive_2011-10-19_14-07-21_162_1298856526851958828/-mr-10000 +NULL +PREHOOK: query: alter table TEST1 add partition (ds='1') +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Input: default@test1 +POSTHOOK: query: alter table TEST1 add partition (ds='1') +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Input: default@test1 +POSTHOOK: Output: default@test1@ds=1 +PREHOOK: query: explain extended select max(ds) from TEST1 +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select max(ds) from TEST1 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME TEST1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION max (TOK_TABLE_OR_COL ds)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + test1 + TableScan + alias: test1 + GatherStats: false + Select Operator + expressions: + expr: ds + type: string + outputColumnNames: ds + Group By Operator + aggregations: + expr: max(ds) + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: string + Needs Tagging: false + Path -> Alias: + file:/fake-path/Users/njain/hive/hive1/build/ql/test/data/warehouse/test1/ds=1 [test1] + Path -> Partition: + file:/fake-path/Users/njain/hive/hive1/build/ql/test/data/warehouse/test1/ds=1 + Partition + base file name: ds=1 + input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 1 + properties: + bucket_count -1 + columns a,b + columns.types int:double + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/Users/njain/hive/hive1/build/ql/test/data/warehouse/test1/ds=1 + name default.test1 + partition_columns ds + serialization.ddl struct test1 { i32 a, double b} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1319058448 + serde: org.apache.hadoop.hive.serde2.NullStructSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns a,b + columns.types int:double + file.inputformat org.apache.hadoop.mapred.TextInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + location pfile:/Users/njain/hive/hive1/build/ql/test/data/warehouse/test1 + name default.test1 + partition_columns ds + serialization.ddl struct test1 { i32 a, double b} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + transient_lastDdlTime 1319058440 + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test1 + name: default.test1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: max(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + directory: file:/var/folders/bZ/bZe+iKfoFTuPoShRd6dy6-tOU9Y/-Tmp-/njain/hive_2011-10-19_14-07-28_723_2286954664445341734/-ext-10001 + NumFilesPerFileSink: 1 + Stats Publishing Key Prefix: file:/var/folders/bZ/bZe+iKfoFTuPoShRd6dy6-tOU9Y/-Tmp-/njain/hive_2011-10-19_14-07-28_723_2286954664445341734/-ext-10001/ + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types string + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select max(ds) from TEST1 +PREHOOK: type: QUERY +PREHOOK: Input: default@test1@ds=1 +PREHOOK: Output: file:/var/folders/bZ/bZe+iKfoFTuPoShRd6dy6-tOU9Y/-Tmp-/njain/hive_2011-10-19_14-07-28_854_8627847703459825936/-mr-10000 +POSTHOOK: query: select max(ds) from TEST1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test1@ds=1 +POSTHOOK: Output: file:/var/folders/bZ/bZe+iKfoFTuPoShRd6dy6-tOU9Y/-Tmp-/njain/hive_2011-10-19_14-07-28_854_8627847703459825936/-mr-10000 +1 Index: ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java =================================================================== --- ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java (revision 1187112) +++ ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java (working copy) @@ -88,10 +88,6 @@ private static final Log LOG = LogFactory.getLog("QTestUtil"); private String testWarehouse; - private final String tmpdir= System.getProperty("test.tmp.dir") ; - private final Path tmppath = new Path(tmpdir); - - private final String testFiles; protected final String outDir; protected final String logDir; @@ -234,7 +230,12 @@ initConf(); - testFiles = conf.get("test.data.files").replace('\\', '/') + String dataDir = conf.get("test.data.files"); + if (dataDir == null) { + dataDir = new File(".").getAbsolutePath() + "/data/files"; + } + + testFiles = dataDir.replace('\\', '/') .replace("c:", ""); String ow = System.getProperty("test.output.overwrite"); Index: ql/src/test/queries/clientpositive/metadataonly1.q =================================================================== --- ql/src/test/queries/clientpositive/metadataonly1.q (revision 0) +++ ql/src/test/queries/clientpositive/metadataonly1.q (revision 0) @@ -0,0 +1,10 @@ +CREATE TABLE TEST1(A INT, B DOUBLE) partitioned by (ds string); +explain extended select max(ds) from TEST1; +select max(ds) from TEST1; + +alter table TEST1 add partition (ds='1'); 
+explain extended select max(ds) from TEST1; +select max(ds) from TEST1; + + + Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (revision 1187112) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (working copy) @@ -24,6 +24,7 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.optimizer.index.RewriteGBUsingIndex; import org.apache.hadoop.hive.ql.optimizer.lineage.Generator; +import org.apache.hadoop.hive.ql.optimizer.moq.MetadataOnlyOptimizer; import org.apache.hadoop.hive.ql.optimizer.pcr.PartitionConditionRemover; import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcessor; @@ -74,6 +75,7 @@ if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION)) { transformations.add(new ReduceSinkDeDuplication()); } + transformations.add(new MetadataOnlyOptimizer()); } /** Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/moq/MetadataOnlyOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/moq/MetadataOnlyOptimizer.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/moq/MetadataOnlyOptimizer.java (revision 0) @@ -0,0 +1,163 @@ +package org.apache.hadoop.hive.ql.optimizer.moq; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Stack; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.PreOrderWalker; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.optimizer.Transform; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +/** + * + * MetadataOnlyOptimizer determines to which TableScanOperators "metadata only" optimization + * can be applied. Such operator must use only partition columns (it is easy to check, because + * we are after column pruning and all places where the data from the operator is used must + * go through GroupByOperator distinct or distinct-like aggregations. Aggregation is distinct-like + * if adding distinct wouldn't change the result, for example min, max. + * + * We cannot apply the optimization without group by, because the results depend on the + * numbers of rows in partitions, for example count(hr) will count all rows in matching + * partitions. 
+ * + */ +public class MetadataOnlyOptimizer implements Transform { + private static final Log LOG = LogFactory.getLog(MetadataOnlyOptimizer.class + .getName()); + + static private class WalkerCtx implements NodeProcessorCtx { + /* operators for which there is chance the optimization can be applied*/ + private final HashSet possible = new HashSet(); + /* operators for which the optimization will be successful*/ + private final HashSet success = new HashSet(); + /* operators which we consider not applicable for the optimization*/ + private final HashSet banned = new HashSet(); + + /** + * Sets operator as one for which there is a chance to apply optimization + * @param op the operator + */ + void setMayBeMetadataOnly(TableScanOperator op) { + if(!banned.contains(op)) { + possible.add(op); + } + } + + void setMetadataOnly(TableScanOperator tso) { + if (!banned.contains(tso) && possible.contains(tso)) { + success.add(tso); + possible.remove(tso); + } + } + + /** + * Sets operator as one for which we cannot apply the optimization + * @param op the operator + */ + void setIsNotMetadataOnly(TableScanOperator op) { + possible.remove(op); + banned.add(op); + success.remove(op); + } + + /** + * Returns HashSet of collected operators for which the optimization is applicable. + */ + HashSet getMetadataOnlyTableScans() { + return success; + } + + } + + static private class TableScanProcessor implements NodeProcessor { + public TableScanProcessor() { + } + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + TableScanOperator node = (TableScanOperator) nd; + WalkerCtx walkerCtx = (WalkerCtx) procCtx; + if(node.getNeededColumnIDs().size() == 0) { + // getNeededColumnIDs returns only data columns + walkerCtx.setMayBeMetadataOnly(node); + } + return nd; + } + } + static private class FileSinkProcessor implements NodeProcessor { + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... 
nodeOutputs) throws SemanticException { + WalkerCtx walkerCtx = (WalkerCtx) procCtx; + TableScanOperator tso = (TableScanOperator) stack.get(0); + assert(tso != null); + if (!walkerCtx.possible.contains(tso)) { + return nd; + } + + for (Node op: stack) { + if (op instanceof GroupByOperator) { + GroupByOperator gby = (GroupByOperator) op; + if (!gby.getConf().isDistinctLike()) { + // GroupBy not distinct like, disabling + walkerCtx.setIsNotMetadataOnly(tso); + return nd; + } + } + } + + walkerCtx.setMetadataOnly(tso); + return nd; + } + } + + @Override + public ParseContext transform(ParseContext pctx) throws SemanticException { + LOG.info("Looking for table scans where optimization is applicable"); + // create a the context for walking operators + WalkerCtx walkerCtx = new WalkerCtx(); + + Map opRules = new LinkedHashMap(); + opRules.put(new RuleRegExp("R1", "TS%"), new TableScanProcessor()); + opRules.put(new RuleRegExp("R2", "GBY%.*FS%"), new FileSinkProcessor()); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(null, opRules, walkerCtx); + GraphWalker ogw = new PreOrderWalker(disp); + + // Create a list of topop nodes + ArrayList topNodes = new ArrayList(); + topNodes.addAll(pctx.getTopOps().values()); + ogw.startWalking(topNodes, null); + + LOG.info(String.format("Found %d metadata only table scans", + walkerCtx.getMetadataOnlyTableScans().size())); + Iterator iterator = walkerCtx.getMetadataOnlyTableScans().iterator(); + while (iterator.hasNext()) { + TableScanOperator tso = iterator.next(); + LOG.info("Metadata only table scan for " + tso.getConf().getAlias()); + } + pctx.setMetadataOnlyTableScans(walkerCtx.getMetadataOnlyTableScans()); + + return pctx; + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (revision 1187112) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (working copy) @@ -45,6 +45,7 @@ import org.apache.hadoop.hive.ql.exec.UnionOperator; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.hooks.ReadEntity; +import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.Partition; import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMRMapJoinCtx; @@ -70,6 +71,7 @@ import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc; import org.apache.hadoop.hive.ql.plan.MapredLocalWork.BucketMapJoinContext; +import org.apache.hadoop.hive.serde2.NullStructSerDe; /** * General utility common functions for the Processor to convert operator into @@ -584,7 +586,6 @@ if (aliasPartnDesc == null) { aliasPartnDesc = new PartitionDesc(Utilities.getTableDesc(parseCtx .getTopToTable().get(topOp)), null); - } plan.getAliasToPartnInfo().put(alias_id, aliasPartnDesc); @@ -616,19 +617,36 @@ boolean isFirstPart = true; boolean emptyInput = true; boolean singlePartition = (parts.size() == 1); + + boolean metadataOnly = false; + if(parseCtx.isMetadataOnlyTableScan((TableScanOperator)topOp)) { + metadataOnly = true; + } + for (Partition part : parts) { if (part.getTable().isPartitioned()) { inputs.add(new ReadEntity(part)); } else { inputs.add(new ReadEntity(part.getTable())); } - + // Later the properties 
have to come from the partition as opposed // to from the table in order to support versioning. Path[] paths = null; sampleDesc sampleDescr = parseCtx.getOpToSamplePruner().get(topOp); - if (sampleDescr != null) { + if (metadataOnly) { + Path[] allPaths = part.getPath(); + if (allPaths.length > 0) { + String uriPath = "/fake-path" + allPaths[0].toUri().getPath(); + Path fakePath = new Path("file", null, uriPath); + paths = new Path[] { fakePath }; + } else { + // Did not find any partition + paths = new Path[0]; + } + } + else if (sampleDescr != null) { paths = SamplePruner.prune(part, sampleDescr); parseCtx.getGlobalLimitCtx().disableOpt(); } else { @@ -701,12 +719,19 @@ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); throw new SemanticException(e.getMessage(), e); } + if(metadataOnly) { + PartitionDesc desc = partDesc.get(partDesc.size() - 1); + desc.setInputFileFormatClass(OneNullRowInputFormat.class); + desc.setDeserializerClass(NullStructSerDe.class); + desc.setSerdeClassName(NullStructSerDe.class.getName()); + } } } if (emptyInput) { parseCtx.getGlobalLimitCtx().disableOpt(); } + Iterator iterPath = partDir.iterator(); Iterator iterPartnDesc = partDesc.iterator(); Index: ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java (revision 1187112) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java (working copy) @@ -18,6 +18,11 @@ package org.apache.hadoop.hive.ql.plan; +import java.util.ArrayList; + +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; + /** * GroupByDesc. * @@ -175,4 +180,22 @@ public void setBucketGroup(boolean dataSorted) { bucketGroup = dataSorted; } + + /** + * Checks if this grouping is like distinct, which means that all non-distinct grouping + * columns behave like they were distinct - for example min and max operators. + */ + public boolean isDistinctLike() { + ArrayList aggregators = getAggregators(); + for(AggregationDesc ad: aggregators){ + if(!ad.getDistinct()) { + GenericUDAFEvaluator udafEval = ad.getGenericUDAFEvaluator(); + UDFType annot = udafEval.getClass().getAnnotation(UDFType.class); + if(annot == null || !annot.distinctLike()) { + return false; + } + } + } + return true; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/io/OneNullRowInputFormat.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/OneNullRowInputFormat.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/io/OneNullRowInputFormat.java (revision 0) @@ -0,0 +1,133 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.io; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.plan.MapredWork; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobConfigurable; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; + +/** + * OneNullRowInputFormat outputs one null row. Used in implementation of + * metadata only queries. + * + */ +public class OneNullRowInputFormat implements + InputFormat, JobConfigurable { + private static final Log LOG = LogFactory.getLog(OneNullRowInputFormat.class + .getName()); + MapredWork mrwork = null; + List partitions; + long len; + + static public class DummyInputSplit implements InputSplit { + public DummyInputSplit() { + } + + @Override + public long getLength() throws IOException { + return 1; + } + + @Override + public String[] getLocations() throws IOException { + return new String[0]; + } + + @Override + public void readFields(DataInput arg0) throws IOException { + } + + @Override + public void write(DataOutput arg0) throws IOException { + } + + } + + static public class OneNullRowRecordReader implements RecordReader { + private boolean processed = false; + public OneNullRowRecordReader() { + } + @Override + public void close() throws IOException { + } + + @Override + public NullWritable createKey() { + return NullWritable.get(); + } + + @Override + public NullWritable createValue() { + return NullWritable.get(); + } + + @Override + public long getPos() throws IOException { + return (processed ? 1 : 0); + } + + @Override + public float getProgress() throws IOException { + return (float) (processed ? 
1.0 : 0.0); + } + + @Override + public boolean next(NullWritable arg0, NullWritable arg1) throws IOException { + if(processed) { + return false; + } else { + processed = true; + return true; + } + } + + } + + @Override + public RecordReader getRecordReader(InputSplit arg0, JobConf arg1, Reporter arg2) + throws IOException { + return new OneNullRowRecordReader(); + } + + @Override + public InputSplit[] getSplits(JobConf arg0, int arg1) throws IOException { + InputSplit[] ret = new InputSplit[1]; + ret[0] = new DummyInputSplit(); + LOG.info("Calculating splits"); + return ret; + } + + @Override + public void configure(JobConf job) { + LOG.info("Using one null row input format"); + } + +} \ No newline at end of file Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (revision 1187112) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy) @@ -73,7 +73,9 @@ import org.apache.hadoop.hive.ql.hooks.WriteEntity; import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils; +import org.apache.hadoop.hive.ql.io.HiveNullValueSequenceFileOutputFormat; import org.apache.hadoop.hive.ql.io.HiveOutputFormat; +import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; import org.apache.hadoop.hive.ql.lib.Dispatcher; @@ -155,6 +157,7 @@ import org.apache.hadoop.hive.serde.Constants; import org.apache.hadoop.hive.serde2.Deserializer; import org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe; +import org.apache.hadoop.hive.serde2.NullStructSerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeUtils; import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; @@ -176,6 +179,7 @@ private HashMap opToPartList; private HashMap> topOps; private HashMap> topSelOps; + private HashSet metadataOnlyTableScans; private LinkedHashMap, OpParseContext> opParseCtx; private List loadTableWork; private List loadFileWork; @@ -262,6 +266,7 @@ opToPartPruner = new HashMap(); opToPartList = new HashMap(); opToSamplePruner = new HashMap(); + metadataOnlyTableScans = new HashSet(); nameToSplitSample = new HashMap(); topOps = new HashMap>(); topSelOps = new HashMap>(); @@ -305,6 +310,7 @@ opToPartPruner = pctx.getOpToPartPruner(); opToPartList = pctx.getOpToPartList(); opToSamplePruner = pctx.getOpToSamplePruner(); + metadataOnlyTableScans = pctx.getMetadataOnlyTableScans(); topOps = pctx.getTopOps(); topSelOps = pctx.getTopSelOps(); opParseCtx = pctx.getOpParseCtx(); @@ -327,7 +333,7 @@ topSelOps, opParseCtx, joinContext, topToTable, loadTableWork, loadFileWork, ctx, idToTableNameMap, destTableId, uCtx, listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions, - opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks); + opToSamplePruner, metadataOnlyTableScans, globalLimitCtx, nameToSplitSample, inputs, rootTasks); } @SuppressWarnings("nls") @@ -7274,7 +7280,7 @@ opToPartList, topOps, topSelOps, opParseCtx, joinContext, topToTable, loadTableWork, loadFileWork, ctx, idToTableNameMap, destTableId, uCtx, listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions, - opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks); + opToSamplePruner, metadataOnlyTableScans, globalLimitCtx, 
nameToSplitSample, inputs, rootTasks); Optimizer optm = new Optimizer(); optm.setPctx(pCtx); Index: ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java (revision 1187112) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java (working copy) @@ -61,6 +61,7 @@ private HashMap opToPartPruner; private HashMap opToPartList; private HashMap opToSamplePruner; + private HashSet metadataOnlyTableScans; private HashMap> topOps; private HashMap> topSelOps; private LinkedHashMap, OpParseContext> opParseCtx; @@ -155,6 +156,7 @@ Map> groupOpToInputTables, Map prunedPartitions, HashMap opToSamplePruner, + HashSet metadataOnlyTableScans, SemanticAnalyzer.GlobalLimitCtx globalLimitCtx, HashMap nameToSplitSample, HashSet semanticInputs, List> rootTasks) { @@ -179,6 +181,7 @@ this.groupOpToInputTables = groupOpToInputTables; this.prunedPartitions = prunedPartitions; this.opToSamplePruner = opToSamplePruner; + this.metadataOnlyTableScans = metadataOnlyTableScans; this.nameToSplitSample = nameToSplitSample; this.globalLimitCtx = globalLimitCtx; this.semanticInputs = semanticInputs; @@ -456,6 +459,17 @@ this.opToSamplePruner = opToSamplePruner; } + public void setMetadataOnlyTableScans(HashSet metadataOnlyTableScans) { + this.metadataOnlyTableScans = metadataOnlyTableScans; + } + + public HashSet getMetadataOnlyTableScans() { + return metadataOnlyTableScans; + } + + public boolean isMetadataOnlyTableScan(TableScanOperator tos) { + return metadataOnlyTableScans != null && metadataOnlyTableScans.contains(tos); + } /** * @return the groupOpToInputTables */ Index: ql/src/java/org/apache/hadoop/hive/ql/udf/UDFType.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/udf/UDFType.java (revision 1187112) +++ ql/src/java/org/apache/hadoop/hive/ql/udf/UDFType.java (working copy) @@ -33,4 +33,5 @@ public @interface UDFType { boolean deterministic() default true; boolean stateful() default false; + boolean distinctLike() default false; } Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMax.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMax.java (revision 1187112) +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMax.java (working copy) @@ -23,6 +23,7 @@ import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.udf.UDFType; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; @@ -49,6 +50,7 @@ return new GenericUDAFMaxEvaluator(); } + @UDFType(distinctLike=true) public static class GenericUDAFMaxEvaluator extends GenericUDAFEvaluator { ObjectInspector inputOI; Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMin.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMin.java (revision 1187112) +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMin.java (working copy) @@ -23,6 +23,7 @@ import 
org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.udf.UDFType; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; @@ -49,6 +50,7 @@ return new GenericUDAFMinEvaluator(); } + @UDFType(distinctLike=true) public static class GenericUDAFMinEvaluator extends GenericUDAFEvaluator { ObjectInspector inputOI;
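
Note on OneNullRowInputFormat: for a metadata-only partition the plan is rewired so that getSplits() returns a single DummyInputSplit and OneNullRowRecordReader hands the mapper exactly one (NullWritable, NullWritable) pair, which is all the partition-column aggregation needs to see one row per partition. The driver below is a minimal sketch of that contract, assuming the Hadoop mapred API is on the classpath; the OneNullRowSmokeTest class and its main-method harness are illustrative and not part of this patch.

import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class OneNullRowSmokeTest {
  public static void main(String[] args) throws Exception {
    OneNullRowInputFormat inputFormat = new OneNullRowInputFormat();
    JobConf job = new JobConf();
    inputFormat.configure(job);

    // getSplits ignores its arguments and always produces a single DummyInputSplit.
    InputSplit[] splits = inputFormat.getSplits(job, 1);

    RecordReader<NullWritable, NullWritable> reader =
        inputFormat.getRecordReader(splits[0], job, Reporter.NULL);
    NullWritable key = reader.createKey();
    NullWritable value = reader.createValue();

    int rows = 0;
    while (reader.next(key, value)) {
      rows++;                      // true on the first call, false afterwards
    }
    reader.close();
    System.out.println("splits: " + splits.length + ", rows read: " + rows); // 1 and 1
  }
}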
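
NullStructSerDe is the counterpart on the deserialization side: initialize() is a no-op, the object inspector exposes a struct with no fields, deserialize() always yields null, and serialize() collapses every row to NullWritable, so the fake-path splits carry no column data at all. A quick check of that behaviour is sketched below under the same caveat: the NullStructSerDeSmokeTest name is illustrative, not part of the patch.

import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.NullStructSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.NullWritable;

public class NullStructSerDeSmokeTest {
  public static void main(String[] args) throws Exception {
    NullStructSerDe serde = new NullStructSerDe();
    serde.initialize(new Configuration(), new Properties());     // no-op by design

    StructObjectInspector oi = (StructObjectInspector) serde.getObjectInspector();
    System.out.println(oi.getAllStructFieldRefs().size());       // 0: no fields exposed
    System.out.println(serde.deserialize(NullWritable.get()));   // null: rows carry no data
    System.out.println(serde.serialize(null, oi));               // "(null)", i.e. NullWritable
  }
}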
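
Note on the distinctLike flag: GroupByDesc.isDistinctLike() accepts an aggregation for the metadata-only rewrite when it is either declared DISTINCT or its evaluator class carries @UDFType(distinctLike = true), which this patch adds to GenericUDAFMaxEvaluator and GenericUDAFMinEvaluator. The sketch below replays that reflection check outside the planner; the DistinctLikeCheck class and its helper are illustrative only, and a third-party UDAF whose result is insensitive to duplicate input values could opt in with the same annotation.

import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax;

public class DistinctLikeCheck {
  // Mirrors the per-aggregation test inside GroupByDesc.isDistinctLike().
  static boolean isDistinctLike(GenericUDAFEvaluator eval, boolean declaredDistinct) {
    if (declaredDistinct) {
      return true;                                    // explicit DISTINCT always qualifies
    }
    UDFType annot = eval.getClass().getAnnotation(UDFType.class);
    return annot != null && annot.distinctLike();     // e.g. max/min after this patch
  }

  public static void main(String[] args) {
    GenericUDAFEvaluator maxEval = new GenericUDAFMax.GenericUDAFMaxEvaluator();
    System.out.println(isDistinctLike(maxEval, false)); // true with the new annotation
  }
}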