diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index a21f589..754ce1a 100644
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -215,7 +215,6 @@ public class HiveConf extends Configuration {
METASTORE_PLUGIN_REGISTRY_BUNDLE_CHECK("datanucleus.plugin.pluginRegistryBundleCheck", "LOG"),
METASTORE_BATCH_RETRIEVE_MAX("hive.metastore.batch.retrieve.max", 300),
-
// Default parameters for creating tables
NEWTABLEDEFAULTPARA("hive.table.parameters.default",""),
@@ -347,6 +346,7 @@ public class HiveConf extends Configuration {
// Optimizer
HIVEOPTCP("hive.optimize.cp", true), // column pruner
+ HIVEOPTAUTOINDEX("hive.optimize.autoindex", false), // automatically use indexes
HIVEOPTPPD("hive.optimize.ppd", true), // predicate pushdown
// push predicates down to storage handlers
HIVEOPTPPD_STORAGE("hive.optimize.ppd.storage", true),
@@ -355,6 +355,10 @@ public class HiveConf extends Configuration {
HIVEOPTSORTMERGEBUCKETMAPJOIN("hive.optimize.bucketmapjoin.sortedmerge", false), // try to use sorted merge bucket map join
HIVEOPTREDUCEDEDUPLICATION("hive.optimize.reducededuplication", true),
+ // Indexes
+ HIVE_INDEX_COMPACT_MINSIZE("hive.index.compact.minSize", (long) 1000),
+ HIVE_INDEX_COMPACT_MAXSIZE("hive.index.compact.maxSize", (long) 100000),
+
// Statistics
HIVESTATSAUTOGATHER("hive.stats.autogather", true),
HIVESTATSDBCLASS("hive.stats.dbclass",
diff --git conf/hive-default.xml conf/hive-default.xml
index c42197f..3f88d68 100644
--- conf/hive-default.xml
+++ conf/hive-default.xml
@@ -325,6 +325,12 @@
+  <name>hive.optimize.autoindex</name>
+  <value>false</value>
+  <description>Whether to enable automatic use of indexes</description>
+</property>
+
+<property>
  <name>hive.optimize.ppd</name>
  <value>true</value>
  <description>Whether to enable predicate pushdown</description>
@@ -984,6 +990,18 @@
+  <name>hive.index.compact.minSize</name>
+  <value>1000</value>
+  <description>Minimum size of the inputs on which a compact index is automatically used.</description>
+</property>
+
+<property>
+  <name>hive.index.compact.maxSize</name>
+  <value>100000</value>
+  <description>Maximum size of the inputs on which a compact index is automatically used.</description>
+</property>
+
+<property>
  <name>hive.exim.uri.scheme.whitelist</name>
  <value>hdfs,pfile</value>
  <description>A comma separated list of acceptable URI schemes for import and export.</description>
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java
index 6437385..c5c8ddc 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java
@@ -273,9 +273,13 @@ public class ExecDriver extends Task<MapredWork> implements Serializable, Hadoop
job.setNumReduceTasks(work.getNumReduceTasks().intValue());
job.setReducerClass(ExecReducer.class);
+ // Set hive input format, and input format file if necessary.
if (work.getInputformat() != null) {
HiveConf.setVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT, work.getInputformat());
}
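+    // Pass along the intermediate file produced by the index query so the
+    // compact index input format can pick it up from the job configuration.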
+ if (work.getIndexIntermediateFile() != null) {
+ job.set("hive.index.compact.file", work.getIndexIntermediateFile());
+ }
// Turn on speculative execution for reducers
boolean useSpeculativeExecReducers = HiveConf.getBoolVar(job,
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
index c02d90b..0fd63f5 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
@@ -22,6 +22,7 @@ import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
+import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
@@ -53,6 +54,8 @@ public class TableScanOperator extends Operator<TableScanDesc> implements
private transient String partitionSpecs;
private transient boolean inputFileChanged = false;
private TableDesc tableDesc;
+ private String indexInputFormat;
+ private String indexIntermediateFile;
public TableDesc getTableDesc() {
@@ -63,6 +66,26 @@ public class TableScanOperator extends Operator implements
this.tableDesc = tableDesc;
}
+ public String getIndexInputFormat() {
+ return indexInputFormat;
+ }
+
+ public void setIndexInputFormat(String indexInputFormat) {
+ this.indexInputFormat = indexInputFormat;
+ }
+
+ public String getIndexIntermediateFile() {
+ return indexIntermediateFile;
+ }
+
+ public void setIndexIntermediateFile(String fileName) {
+ this.indexIntermediateFile = fileName;
+ }
+
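+  // The scan is rewritten to use an index only when both the replacement input
+  // format and the intermediate file holding the index query results are set.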
+ public boolean usesIndex() {
+ return StringUtils.isNotEmpty(indexInputFormat) && StringUtils.isNotEmpty(indexIntermediateFile);
+ }
+
/**
* Other than gathering statistics for the ANALYZE command, the table scan operator
* does not do anything special other than just forwarding the row. Since the table
diff --git ql/src/java/org/apache/hadoop/hive/ql/index/AbstractIndexHandler.java ql/src/java/org/apache/hadoop/hive/ql/index/AbstractIndexHandler.java
index dd0186d..a72abcc 100644
--- ql/src/java/org/apache/hadoop/hive/ql/index/AbstractIndexHandler.java
+++ ql/src/java/org/apache/hadoop/hive/ql/index/AbstractIndexHandler.java
@@ -20,8 +20,13 @@ package org.apache.hadoop.hive.ql.index;
import java.util.List;
+import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.Index;
+import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
/**
* Abstract base class for index handlers. This is provided as insulation
@@ -42,4 +47,12 @@ public abstract class AbstractIndexHandler implements HiveIndexHandler {
return sb.toString();
}
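+  // Default implementations for handlers that do not support automatic index
+  // use: produce no rewrite tasks and reject every query size.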
+  public List<Task<? extends Serializable>> generateIndexQuery(Index index, ExprNodeDesc predicate,
+      ParseContext pctx) {
+ return null;
+ }
+
+ public boolean checkQuerySize(long inputSize, HiveConf conf) {
+ return false;
+ }
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexHandler.java ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexHandler.java
index 411b78f..32b68ae 100644
--- ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexHandler.java
+++ ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexHandler.java
@@ -22,12 +22,15 @@ import java.util.List;
import java.util.Set;
import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.Index;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
-import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
/**
* HiveIndexHandler defines a pluggable interface for adding new index handlers
@@ -114,4 +117,23 @@ public interface HiveIndexHandler extends Configurable {
      Set<ReadEntity> inputs, Set<WriteEntity> outputs)
throws HiveException;
+ /**
+ * Generate the list of tasks required to run an index sub-query for the
+   * given predicate, using the given index.
+   * @param index
+   * @param predicate
+   * @param pctx
+   * @return list of tasks for the index sub-query
+   */
+  List<Task<? extends Serializable>> generateIndexQuery(Index index, ExprNodeDesc predicate,
+      ParseContext pctx);
+
+ /**
+   * Check the size of an input query to make sure it fits within the configured bounds.
+   *
+   * @param inputSize size of the query in question
+   * @param conf
+   * @return true if the query is within the bounds
+ */
+ boolean checkQuerySize(long inputSize, HiveConf conf);
}
\ No newline at end of file
diff --git ql/src/java/org/apache/hadoop/hive/ql/index/compact/CompactIndexHandler.java ql/src/java/org/apache/hadoop/hive/ql/index/compact/CompactIndexHandler.java
index 1f01446..1674290 100644
--- ql/src/java/org/apache/hadoop/hive/ql/index/compact/CompactIndexHandler.java
+++ ql/src/java/org/apache/hadoop/hive/ql/index/compact/CompactIndexHandler.java
@@ -19,38 +19,50 @@
package org.apache.hadoop.hive.ql.index.compact;
import java.util.ArrayList;
-import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Set;
import java.util.Map.Entry;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Index;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.Driver;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.index.AbstractIndexHandler;
-import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer;
+import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
+import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler.DecomposedPredicate;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan;
public class CompactIndexHandler extends AbstractIndexHandler {
private Configuration configuration;
+ private static final Log LOG = LogFactory.getLog(CompactIndexHandler.class.getName());
+
@Override
public void analyzeIndexDefinition(Table baseTable, Index index,
@@ -104,9 +116,10 @@ public class CompactIndexHandler extends AbstractIndexHandler {
break;
}
}
- if (basePart == null)
+ if (basePart == null) {
throw new RuntimeException(
"Partitions of base table and index table are inconsistent.");
+ }
// for each partition, spawn a map reduce task.
    Task<?> indexBuilder = getIndexBuilderMapRedTask(inputs, outputs, index.getSd().getCols(), true,
new PartitionDesc(indexPart), indexTbl.getTableName(),
@@ -138,9 +151,10 @@ public class CompactIndexHandler extends AbstractIndexHandler {
for (int i = 0; i < ret.size(); i++) {
String partKV = ret.get(i);
command.append(partKV);
- if (i < ret.size() - 1)
+ if (i < ret.size() - 1) {
command.append(",");
}
+ }
command.append(" ) ");
}
@@ -161,10 +175,11 @@ public class CompactIndexHandler extends AbstractIndexHandler {
for (int i = 0; i < pkv.size(); i++) {
String partKV = pkv.get(i);
command.append(partKV);
- if (i < pkv.size() - 1)
+ if (i < pkv.size() - 1) {
command.append(" AND ");
}
}
+ }
command.append(" GROUP BY ");
command.append(indexCols + ", " + VirtualColumn.FILENAME.getName());
@@ -201,6 +216,91 @@ public class CompactIndexHandler extends AbstractIndexHandler {
}
@Override
+  public List<Task<? extends Serializable>> generateIndexQuery(Index index, ExprNodeDesc predicate,
+ ParseContext pctx) {
+
+ DecomposedPredicate decomposedPredicate = decomposePredicate(predicate, index);
+
+ // Build reentrant QL for index query
+ StringBuilder qlCommand = new StringBuilder("INSERT OVERWRITE DIRECTORY ");
+
+ String tmpFile = pctx.getContext().getMRTmpFileURI();
+ qlCommand.append( "\"" + tmpFile + "\" "); // QL includes " around file name
+ qlCommand.append("SELECT `_bucketname` , `_offsets` FROM ");
+ qlCommand.append(index.getIndexTableName());
+ qlCommand.append(" WHERE ");
+
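+    // Only the pushed part of the predicate (the conditions the index columns can
+    // answer) becomes the WHERE clause of the re-entrant index query.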
+ String predicateString = decomposedPredicate.pushedPredicate.getExprString();
+ qlCommand.append(predicateString);
+
+ // generate tasks from index query string
+ LOG.info("Generating tasks for re-entrant QL query: " + qlCommand.toString());
+ Driver driver = new Driver(pctx.getConf());
+ driver.compile(qlCommand.toString());
+
+ // setup TableScanOperator to change input format for original query
+ TableScanOperator originalTblScan = (TableScanOperator) pctx.getTopOps().get(index.getOrigTableName());
+ originalTblScan.setIndexInputFormat(HiveCompactIndexInputFormat.class.getName());
+ originalTblScan.setIndexIntermediateFile(tmpFile);
+
+    Set<ReadEntity> inputs = pctx.getSemanticInputs();
+ inputs.addAll(driver.getPlan().getInputs());
+ return driver.getPlan().getRootTasks();
+ }
+
+ /**
+   * Split the predicate into the piece we can deal with (pushed) and the one we can't (residual).
+   * @param predicate
+   * @param index
+   * @return the decomposed predicate, with pushed and residual parts
+ */
+ private DecomposedPredicate decomposePredicate(ExprNodeDesc predicate, Index index) {
+ IndexPredicateAnalyzer analyzer = getIndexPredicateAnalyzer(index);
+    List<IndexSearchCondition> searchConditions = new ArrayList<IndexSearchCondition>();
+ // split predicate into pushed (what we can handle), and residual (what we can't handle)
+ ExprNodeDesc residualPredicate = analyzer.analyzePredicate(predicate, searchConditions);
+
+ DecomposedPredicate decomposedPredicate = new DecomposedPredicate();
+ decomposedPredicate.pushedPredicate = analyzer.translateSearchConditions(searchConditions);
+ decomposedPredicate.residualPredicate = residualPredicate;
+
+ return decomposedPredicate;
+ }
+
+ /**
+ * Instantiate a new predicate analyzer suitable for determining
+ * whether we can use an index, based on rules for indexes in
+ * WHERE clauses that we support
+ *
+ * @return preconfigured predicate analyzer for WHERE queries
+ */
+ private IndexPredicateAnalyzer getIndexPredicateAnalyzer(Index index) {
+ IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();
+
+ analyzer.addComparisonOp(GenericUDFOPEqual.class.getName());
+ analyzer.addComparisonOp(GenericUDFOPLessThan.class.getName());
+ analyzer.addComparisonOp(GenericUDFOPEqualOrLessThan.class.getName());
+ analyzer.addComparisonOp(GenericUDFOPGreaterThan.class.getName());
+ analyzer.addComparisonOp(GenericUDFOPEqualOrGreaterThan.class.getName());
+
+ // only return results for columns in this index
+    List<FieldSchema> columnSchemas = index.getSd().getCols();
+ for (FieldSchema column : columnSchemas) {
+ analyzer.allowColumnName(column.getName());
+ }
+
+ return analyzer;
+ }
+
+
+ @Override
+ public boolean checkQuerySize(long querySize, HiveConf hiveConf) {
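+    // Use the index only when the input size falls between the configured
+    // hive.index.compact.minSize and hive.index.compact.maxSize thresholds.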
+ long minSize = hiveConf.getLongVar(HiveConf.ConfVars.HIVE_INDEX_COMPACT_MINSIZE);
+ long maxSize = hiveConf.getLongVar(HiveConf.ConfVars.HIVE_INDEX_COMPACT_MAXSIZE);
+    return (querySize > minSize && querySize < maxSize);
+ }
+
+ @Override
public boolean usesIndexTable() {
return true;
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java
index 50db44c..f06a8de 100644
--- ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java
+++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java
@@ -38,6 +38,7 @@ import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.ProtectMode;
import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.Index;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
@@ -813,4 +814,14 @@ public class Table implements Serializable {
public String getCompleteName() {
return getDbName() + "@" + getTableName();
}
+
+ /**
+   * @return list of the indexes defined on this table
+   * @throws HiveException
+   */
+  public List<Index> getAllIndexes(short max) throws HiveException {
+ Hive hive = Hive.get();
+ return hive.getIndexes(getTTable().getDbName(), getTTable().getTableName(), max);
+ }
};
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java
index 6162676..a9504dd 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java
@@ -33,11 +33,11 @@ import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx;
-import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.QBParseInfo;
import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.StatsWork;
/**
@@ -69,6 +69,15 @@ public class GenMRTableScan1 implements NodeProcessor {
ctx.setCurrTask(currTask);
ctx.setCurrTopOp(currTopOp);
+      // Reset the input format and index intermediate file if the table scan uses an index.
+ if (op.usesIndex()) {
+ String indexInputFormat = op.getIndexInputFormat();
+ String indexIntermediateFile = op.getIndexIntermediateFile();
+ ((MapredWork)currTask.getWork()).setInputformat(indexInputFormat);
+ ((MapredWork)currTask.getWork()).setIndexIntermediateFile(indexIntermediateFile);
+ }
+
+
for (String alias : parseCtx.getTopOps().keySet()) {
      Operator<? extends Serializable> currOp = parseCtx.getTopOps().get(alias);
if (currOp == op) {
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/IndexWhereResolver.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/IndexWhereResolver.java
new file mode 100644
index 0000000..d67bd90
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/IndexWhereResolver.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer.physical;
+
+import java.util.ArrayList;
+
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.optimizer.physical.index.IndexWhereTaskDispatcher;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+
+public class IndexWhereResolver implements PhysicalPlanResolver {
+
+ @Override
+ public PhysicalContext resolve(PhysicalContext physicalContext) throws SemanticException {
+ Dispatcher dispatcher = new IndexWhereTaskDispatcher(physicalContext);
+ GraphWalker opGraphWalker = new DefaultGraphWalker(dispatcher);
+    ArrayList<Node> topNodes = new ArrayList<Node>();
+ topNodes.addAll(physicalContext.rootTasks);
+ opGraphWalker.startWalking(topNodes, null);
+
+ return physicalContext;
+ }
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java
index 0ae9fa2..a336230 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java
@@ -52,6 +52,9 @@ public class PhysicalOptimizer {
if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN)) {
resolvers.add(new CommonJoinResolver());
}
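+    // When hive.optimize.autoindex is enabled, add the resolver that rewrites
+    // WHERE queries to run an index sub-query first.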
+ if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVEOPTAUTOINDEX)) {
+ resolvers.add(new IndexWhereResolver());
+ }
resolvers.add(new MapJoinResolver());
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcCtx.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcCtx.java
new file mode 100644
index 0000000..608fa69
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcCtx.java
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer.physical.index;
+
+import java.io.Serializable;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.Task;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+
+public class IndexWhereProcCtx implements NodeProcessorCtx {
+
+ private static final Log LOG = LogFactory.getLog(IndexWhereProcCtx.class.getName());
+
+  private final Task<? extends Serializable> currentTask;
+ private final ParseContext parseCtx;
+
+  public IndexWhereProcCtx(Task<? extends Serializable> task, ParseContext parseCtx) {
+ this.currentTask = task;
+ this.parseCtx = parseCtx;
+ }
+
+ public ParseContext getParseContext() {
+ return parseCtx;
+ }
+
+  public Task<? extends Serializable> getCurrentTask() {
+ return currentTask;
+ }
+}
\ No newline at end of file
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java
new file mode 100644
index 0000000..9ea10e9
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java
@@ -0,0 +1,232 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer.physical.index;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.ContentSummary;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.Index;
+import org.apache.hadoop.hive.ql.exec.ExplainTask;
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.exec.MapRedTask;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.Task;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.index.HiveIndexHandler;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.HiveUtils;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.FilterDesc;
+
+/**
+ *
+ * IndexWhereProcessor.
+ * Processes operator nodes to look for WHERE queries with a predicate column
+ * on which we have an index. Creates an index subquery task for these
+ * WHERE queries to use the index automatically.
+ */
+public class IndexWhereProcessor implements NodeProcessor {
+
+ private static final Log LOG = LogFactory.getLog(IndexWhereProcessor.class.getName());
+  private final Map<Table, List<Index>> indexes;
+
+  public IndexWhereProcessor(Map<Table, List<Index>> indexes) {
+ super();
+ this.indexes = indexes;
+ }
+
+  /**
+   * Process a node of the operator tree. This matches on the rule in IndexWhereTaskDispatcher.
+   */
+  @Override
+  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+
+ FilterOperator operator = (FilterOperator) nd;
+ FilterDesc operatorDesc = operator.getConf();
+ ExprNodeDesc predicate = operatorDesc.getPredicate();
+
+ IndexWhereProcCtx context = (IndexWhereProcCtx) procCtx;
+ ParseContext pctx = context.getParseContext();
+
+ // check if we have indexes on all partitions in this table scan
+ try {
+ if (! partitionsOk(operator, pctx)) {
+ return null;
+ }
+ } catch (HiveException e) {
+ LOG.error("Error accessing metastore", e);
+ return null;
+ }
+
+ // get potential reentrant index queries from each index
+    List<List<Task<? extends Serializable>>> indexQueryTaskList = new ArrayList<List<Task<? extends Serializable>>>();
+    for (List<Index> indexesOnTable : indexes.values()) {
+      for (Index index : indexesOnTable) {
+        List<Task<? extends Serializable>> indexQueryTasks = rewriteForIndex(predicate, index, pctx, context.getCurrentTask());
+        if (indexQueryTasks != null && indexQueryTasks.size() > 0) {
+ indexQueryTaskList.add(indexQueryTasks);
+ }
+ }
+ }
+
+ // choose an index rewrite to use
+ if (indexQueryTaskList.size() > 0) {
+ // TODO This would be a good place for some sort of cost based choice?
+      List<Task<? extends Serializable>> chosenRewrite = indexQueryTaskList.get(0);
+ // add dependencies so index query runs first
+      Task<? extends Serializable> wholeTableScan = context.getCurrentTask();
+      LinkedHashSet<Task<? extends Serializable>> rewriteLeaves = new LinkedHashSet<Task<? extends Serializable>>();
+ findLeaves(chosenRewrite, rewriteLeaves);
+ ExplainTask taskExplainer = new ExplainTask();
+
+      for (Task<? extends Serializable> leaf : rewriteLeaves) {
+ leaf.addDependentTask(wholeTableScan); // add full scan task as child for every index query task
+ }
+
+ // relabel the tasks
+ int rootLabel = 0;
+      for (Task<? extends Serializable> task : chosenRewrite) {
+ rootLabel = relabelTree(task, rootLabel);
+ }
+
+ // replace the original with the index sub-query as a root task
+ pctx.replaceRootTask(wholeTableScan, chosenRewrite);
+ }
+
+ return null;
+ }
+
+ /**
+ * Get a list of Tasks to activate use of indexes.
+ * Generate the tasks for the index query (where we store results of
+ * querying the index in a tmp file) inside the IndexHandler
+ * @param task
+ */
+  private List<Task<? extends Serializable>> rewriteForIndex(ExprNodeDesc predicate, Index index,
+      ParseContext pctx, Task<? extends Serializable> task)
+ throws SemanticException {
+ HiveIndexHandler indexHandler;
+ try {
+ indexHandler = HiveUtils.getIndexHandler(pctx.getConf(), index.getIndexHandlerClass());
+ } catch (HiveException e) {
+ LOG.error("Exception while loading IndexHandler: " + index.getIndexHandlerClass());
+ throw new SemanticException("Failed to load indexHandler: " + index.getIndexHandlerClass(), e);
+ }
+
+ // check the size
+ try {
+ ContentSummary inputSummary = Utilities.getInputSummary(pctx.getContext(), ((MapRedTask) task).getWork(), null);
+ long inputSize = inputSummary.getLength();
+ if (! indexHandler.checkQuerySize(inputSize, pctx.getConf())) {
+ return null;
+ }
+ } catch (IOException e) {
+ throw new SemanticException("Failed to get task size", e);
+ }
+
+ // use the IndexHandler to generate the index query
+    List<Task<? extends Serializable>> indexQueryTasks = indexHandler.generateIndexQuery(index, predicate, pctx);
+
+ return indexQueryTasks;
+ }
+
+ /**
+ * Check the partitions used by the table scan
+ * @param pctx
+ * @param operator
+ * @return true if all partitions being accessed are present in the index table
+ */
+ private boolean partitionsOk(FilterOperator operator, ParseContext pctx) throws HiveException {
+ TableScanOperator tableScan = (TableScanOperator) operator.getParentOperators().get(0);
+ Hive hive = Hive.get(pctx.getConf());
+
+ // make sure each partition exists on the index table
+ PrunedPartitionList partitionList = pctx.getOpToPartList().get(tableScan);
+    Set<Partition> partitions = partitionList.getConfirmedPartns();
+ for (Partition part : partitions) {
+ if (! indexes.containsKey(part.getTable()) ) {
+ return false; // something is wrong if the partition's table is not indexed
+ }
+ // every partition's table should have same partition keys as the index table
+ for (Index index : indexes.get(part.getTable())) {
+ Table indexTable = hive.getTable(index.getIndexTableName());
+        List<FieldSchema> indexPartitions = indexTable.getPartitionKeys();
+ for (FieldSchema col : part.getTable().getPartitionKeys()) {
+ if (! indexPartitions.contains(col)) {
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+ }
+
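+  // Recursively collect the tasks that have no dependents; these leaves of the
+  // rewrite must finish before the original full-scan task is allowed to run.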
+  private void findLeaves(List<Task<? extends Serializable>> tasks, Set<Task<? extends Serializable>> leaves) {
+    for (Task<? extends Serializable> t : tasks) {
+ if (t.getDependentTasks() == null) {
+ leaves.add(t);
+ } else {
+ findLeaves(t.getDependentTasks(), leaves);
+ }
+ }
+ }
+
+ /**
+   * Relabel the tasks in the tree rooted at rootTask.
+   * @param rootTask
+   * @param rootLabel
+   * @return the highest label used in the tree
+   */
+  private int relabelTree(Task<? extends Serializable> rootTask, int rootLabel) {
+ rootTask.setId("Stage-" + rootLabel);
+
+ if (rootTask.getDependentTasks() == null || rootTask.getDependentTasks().size() == 0) {
+ return rootLabel;
+ }
+
+ int childRoot = rootLabel;
+    for (Task<? extends Serializable> child : rootTask.getDependentTasks()) {
+ childRoot = relabelTree(child, childRoot + 1);
+ }
+ return childRoot;
+ }
+
+}
+
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereTaskDispatcher.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereTaskDispatcher.java
new file mode 100644
index 0000000..a5f61a7
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereTaskDispatcher.java
@@ -0,0 +1,174 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer.physical.index;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.hadoop.hive.metastore.api.Index;
+import org.apache.hadoop.hive.ql.exec.Task;
+import org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.physical.PhysicalContext;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+
+/**
+ *
+ * IndexWhereTaskDispatcher. Walks a Task tree, and for the right kind of Task,
+ * walks the operator tree to create an index subquery. Then attaches the
+ * subquery task to the task tree.
+ *
+ */
+public class IndexWhereTaskDispatcher implements Dispatcher {
+
+ private final PhysicalContext physicalContext;
+
+ public IndexWhereTaskDispatcher(PhysicalContext context) {
+ super();
+ physicalContext = context;
+ }
+
+ @Override
+  public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs)
+ throws SemanticException {
+
+    Task<? extends Serializable> task = (Task<? extends Serializable>) nd;
+
+ ParseContext pctx = physicalContext.getParseContext();
+
+ // all of our index tasks are MapReduce
+ if (! task.isMapRedTask()) {
+ return null;
+ }
+
+    // create the regexes so the walker can recognize our WHERE queries
+    Map<Rule, NodeProcessor> operatorRules = createOperatorRules(pctx);
+
+ // check for no indexes on table
+ if (operatorRules == null) {
+ return null;
+ }
+
+ // create context so the walker can carry the current task with it.
+ IndexWhereProcCtx indexWhereOptimizeCtx = new IndexWhereProcCtx(task, pctx);
+
+ // create the dispatcher, which fires the processor according to the rule that
+ // best matches
+ Dispatcher dispatcher = new DefaultRuleDispatcher(getDefaultProcessor(),
+ operatorRules,
+ indexWhereOptimizeCtx);
+
+    // walk the mapper's operator tree (not the task tree)
+ GraphWalker ogw = new DefaultGraphWalker(dispatcher);
+    ArrayList<Node> topNodes = new ArrayList<Node>();
+ topNodes.addAll(pctx.getTopOps().values());
+ ogw.startWalking(topNodes, null);
+
+ return null;
+ }
+
+ /**
+ * Create a set of rules that only matches WHERE predicates on columns we have
+ * an index on.
+   * @return a map from rules to the index WHERE processor, or null if no usable indexes exist
+ */
+  private Map<Rule, NodeProcessor> createOperatorRules(ParseContext pctx) {
+    Map<Rule, NodeProcessor> operatorRules = new LinkedHashMap<Rule, NodeProcessor>();
+
+    List<String> supportedIndexes = new ArrayList<String>();
+ supportedIndexes.add(CompactIndexHandler.class.getName());
+
+ // query the metastore to know what columns we have indexed
+    Collection<Table> topTables = pctx.getTopToTable().values();
+    Map<Table, List<Index>> indexes = new HashMap<Table, List<Index>>();
+    for (Table tbl : topTables) {
+      List<Index> tblIndexes = getIndexes(tbl, supportedIndexes);
+ if (tblIndexes.size() > 0) {
+ indexes.put(tbl, tblIndexes);
+ }
+ }
+
+ // quit if our tables don't have any indexes
+ if (indexes.size() == 0) {
+ return null;
+ }
+
+ // FIL% is a filter operator, a WHERE shows up as a filter on a table scan operator (TS%)
+ operatorRules.put(new RuleRegExp("RULEWhere", "TS%FIL%"), new IndexWhereProcessor(indexes));
+
+ return operatorRules;
+ }
+
+ /**
+ * Get a list of indexes on a table that match given types.
+ * Copied from HIVE-1694 patch
+ */
+  private List<Index> getIndexes(Table baseTableMetaData, List<String> matchIndexTypes) {
+    List<Index> matchingIndexes = new ArrayList<Index>();
+    List<Index> indexesOnTable = null;
+
+ try {
+ indexesOnTable = baseTableMetaData.getAllIndexes((short) -1); // get all indexes
+
+ } catch (HiveException e) {
+      /* Return an empty list. Trouble doing the rewrite shouldn't stop regular
+       * query execution; if there's a serious problem with metadata or anything else,
+       * it's assumed to be checked and handled in core Hive code itself.
+       */
+      return matchingIndexes;
+ }
+
+ for (Index index : indexesOnTable) {
+ String indexType = index.getIndexHandlerClass();
+ if (matchIndexTypes.contains(indexType)) {
+ matchingIndexes.add(index);
+ }
+ }
+ return matchingIndexes;
+ }
+
+ private NodeProcessor getDefaultProcessor() {
+ return new NodeProcessor() {
+ @Override
+      public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ return null;
+ }
+ };
+ }
+
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
index 937a7b3..1451363 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
@@ -20,6 +20,7 @@ package org.apache.hadoop.hive.ql.parse;
import java.io.Serializable;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@@ -33,7 +34,9 @@ import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.hooks.LineageInfo;
+import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
@@ -91,6 +94,9 @@ public class ParseContext {
// a map-reduce job
private boolean hasNonPartCols;
+  private HashSet<ReadEntity> semanticInputs;
+  private List<Task<? extends Serializable>> rootTasks;
+
public ParseContext() {
}
@@ -126,6 +132,8 @@ public class ParseContext {
* list of map join operators with no reducer
* @param opToSamplePruner
* operator to sample pruner map
+   * @param semanticInputs
+   * @param rootTasks
*/
public ParseContext(
HiveConf conf,
@@ -143,7 +151,8 @@ public class ParseContext {
      UnionProcContext uCtx, List<AbstractMapJoinOperator<? extends MapJoinDesc>> listMapJoinOpsNoReducer,
      Map<GroupByOperator, Set<String>> groupOpToInputTables,
      Map<String, PrunedPartitionList> prunedPartitions,
-      HashMap<TableScanOperator, sampleDesc> opToSamplePruner) {
+      HashMap<TableScanOperator, sampleDesc> opToSamplePruner,
+      HashSet<ReadEntity> semanticInputs, List<Task<? extends Serializable>> rootTasks) {
this.conf = conf;
this.qb = qb;
this.ast = ast;
@@ -166,6 +175,8 @@ public class ParseContext {
this.groupOpToInputTables = groupOpToInputTables;
this.prunedPartitions = prunedPartitions;
this.opToSamplePruner = opToSamplePruner;
+ this.semanticInputs = semanticInputs;
+ this.rootTasks = rootTasks;
}
/**
@@ -486,4 +497,14 @@ public class ParseContext {
public void setMapJoinContext(Map mapJoinContext) {
this.mapJoinContext = mapJoinContext;
}
+
+  public HashSet<ReadEntity> getSemanticInputs() {
+ return semanticInputs;
+ }
+
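+  // Swap the original root task for the index sub-query's root tasks so the
+  // rewritten plan runs in its place.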
+  public void replaceRootTask(Task<? extends Serializable> rootTask,
+      List<? extends Task<? extends Serializable>> tasks) {
+ this.rootTasks.remove(rootTask);
+ this.rootTasks.addAll(tasks);
+ }
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index f0aca84..8cb1dd0 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -258,11 +258,12 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
}
public ParseContext getParseContext() {
- return new ParseContext(conf, qb, ast, opToPartPruner, opToPartList, topOps,
+ ParseContext pctx = new ParseContext(conf, qb, ast, opToPartPruner, opToPartList, topOps,
topSelOps, opParseCtx, joinContext, topToTable, loadTableWork,
loadFileWork, ctx, idToTableNameMap, destTableId, uCtx,
listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions,
- opToSamplePruner);
+ opToSamplePruner, inputs, rootTasks);
+ return pctx;
}
@SuppressWarnings("nls")
@@ -6623,7 +6624,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
opToPartList, topOps, topSelOps, opParseCtx, joinContext, topToTable,
loadTableWork, loadFileWork, ctx, idToTableNameMap, destTableId, uCtx,
listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions,
- opToSamplePruner);
+        opToSamplePruner, inputs, rootTasks);
Optimizer optm = new Optimizer();
optm.setPctx(pCtx);
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java
index 73391e9..6e661e4 100644
--- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java
@@ -70,6 +70,7 @@ public class MapredWork implements Serializable {
private MapredLocalWork mapLocalWork;
private String inputformat;
+ private String indexIntermediateFile;
private boolean gatheringStats;
private String tmpHDFSFileURI;
@@ -363,10 +364,18 @@ public class MapredWork implements Serializable {
return inputformat;
}
+ public String getIndexIntermediateFile() {
+ return indexIntermediateFile;
+ }
+
public void setInputformat(String inputformat) {
this.inputformat = inputformat;
}
+ public void setIndexIntermediateFile(String fileName) {
+ this.indexIntermediateFile = fileName;
+ }
+
public void setGatheringStats(boolean gatherStats) {
this.gatheringStats = gatherStats;
}
diff --git ql/src/test/queries/clientpositive/index_opt_where.q ql/src/test/queries/clientpositive/index_opt_where.q
new file mode 100644
index 0000000..0b92520
--- /dev/null
+++ ql/src/test/queries/clientpositive/index_opt_where.q
@@ -0,0 +1,22 @@
+-- try the query without indexing, with manual indexing, and with automatic indexing
+SELECT * FROM src WHERE key > 80 AND key < 100 ORDER BY key;
+
+CREATE INDEX src_index ON TABLE src(key) as 'COMPACT' WITH DEFERRED REBUILD;
+ALTER INDEX src_index ON src REBUILD;
+
+SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
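+-- manually materialize the index query results, then point the compact index input format at them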
+INSERT OVERWRITE DIRECTORY "/tmp/index_where" SELECT `_bucketname` , `_offsets` FROM default__src_src_index__ WHERE key > 80 AND key < 100;
+SET hive.index.compact.file=/tmp/index_where;
+SET hive.optimize.autoindex=false;
+SET hive.input.format=org.apache.hadoop.hive.ql.index.compact.HiveCompactIndexInputFormat;
+
+EXPLAIN SELECT key, value FROM src WHERE key > 80 AND key < 100 ORDER BY key;
+SELECT key, value FROM src WHERE key > 80 AND key < 100 ORDER BY key;
+
+SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+SET hive.optimize.autoindex=true;
+
+-- test automatic usage of index in query
+SELECT * FROM src WHERE key > 80 AND key < 100 ORDER BY key;
+
+DROP INDEX src_index on src;
\ No newline at end of file
diff --git ql/src/test/queries/clientpositive/index_opt_where_partitioned.q ql/src/test/queries/clientpositive/index_opt_where_partitioned.q
new file mode 100644
index 0000000..736841c
--- /dev/null
+++ ql/src/test/queries/clientpositive/index_opt_where_partitioned.q
@@ -0,0 +1,11 @@
+-- test automatic use of index on table with partitions
+CREATE INDEX src_part_index ON TABLE srcpart(key) as 'COMPACT' WITH DEFERRED REBUILD;
+ALTER INDEX src_part_index ON srcpart REBUILD;
+
+SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+SET hive.optimize.autoindex=true;
+
+EXPLAIN SELECT key, value FROM srcpart WHERE key=86 ORDER BY key;
+SELECT key, value FROM srcpart WHERE key=86 ORDER BY key;
+
+DROP INDEX src_part_index ON srcpart;
diff --git ql/src/test/queries/clientpositive/index_opt_where_simple.q ql/src/test/queries/clientpositive/index_opt_where_simple.q
new file mode 100644
index 0000000..1612af5
--- /dev/null
+++ ql/src/test/queries/clientpositive/index_opt_where_simple.q
@@ -0,0 +1,26 @@
+-- test automatic use of index and different file formats
+CREATE INDEX src_index ON TABLE src(key) as 'COMPACT' WITH DEFERRED REBUILD;
+ALTER INDEX src_index ON src REBUILD;
+
+SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+SET hive.optimize.autoindex=false;
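+-- first exercise the manual path: write the index query results to a directory and point hive.index.compact.file at them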
+INSERT OVERWRITE DIRECTORY "/tmp/index_result_where1" SELECT `_bucketname` , `_offsets` FROM default__src_src_index__ WHERE key=86;
+SET hive.index.compact.file=/tmp/index_result_where1;
+SET hive.input.format=org.apache.hadoop.hive.ql.index.compact.HiveCompactIndexInputFormat;
+
+EXPLAIN SELECT key, value FROM src WHERE key=86 ORDER BY key;
+SELECT key, value FROM src WHERE key=86 ORDER BY key;
+
+SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+SET hive.optimize.autoindex=true;
+
+EXPLAIN SELECT key, value FROM src WHERE key=86 ORDER BY key;
+SELECT key, value FROM src WHERE key=86 ORDER BY key;
+
+SET hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+SET hive.optimize.autoindex=true;
+
+EXPLAIN SELECT key, value FROM src WHERE key=86 ORDER BY key;
+SELECT key, value FROM src WHERE key=86 ORDER BY key;
+
+DROP INDEX src_index on src;
\ No newline at end of file
diff --git ql/src/test/results/clientpositive/index_opt_where.q.out ql/src/test/results/clientpositive/index_opt_where.q.out
new file mode 100644
index 0000000..e7f2a8e
--- /dev/null
+++ ql/src/test/results/clientpositive/index_opt_where.q.out
@@ -0,0 +1,187 @@
+PREHOOK: query: -- try the query without indexing, with manual indexing, and with automatic indexing
+SELECT * FROM src WHERE key > 80 AND key < 100 ORDER BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/tmp/rmelick/hive_2011-03-31_20-46-05_776_3701467786693766961/-mr-10000
+POSTHOOK: query: -- try the query without indexing, with manual indexing, and with automatic indexing
+SELECT * FROM src WHERE key > 80 AND key < 100 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/tmp/rmelick/hive_2011-03-31_20-46-05_776_3701467786693766961/-mr-10000
+82 val_82
+83 val_83
+83 val_83
+84 val_84
+84 val_84
+85 val_85
+86 val_86
+87 val_87
+90 val_90
+90 val_90
+90 val_90
+92 val_92
+95 val_95
+95 val_95
+96 val_96
+97 val_97
+97 val_97
+98 val_98
+98 val_98
+PREHOOK: query: CREATE INDEX src_index ON TABLE src(key) as 'COMPACT' WITH DEFERRED REBUILD
+PREHOOK: type: CREATEINDEX
+POSTHOOK: query: CREATE INDEX src_index ON TABLE src(key) as 'COMPACT' WITH DEFERRED REBUILD
+POSTHOOK: type: CREATEINDEX
+PREHOOK: query: ALTER INDEX src_index ON src REBUILD
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@default__src_src_index__
+POSTHOOK: query: ALTER INDEX src_index ON src REBUILD
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@default__src_src_index__
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: INSERT OVERWRITE DIRECTORY "/tmp/index_where" SELECT `_bucketname` , `_offsets` FROM default__src_src_index__ WHERE key > 80 AND key < 100
+PREHOOK: type: QUERY
+PREHOOK: Input: default@default__src_src_index__
+PREHOOK: Output: /tmp/index_where
+POSTHOOK: query: INSERT OVERWRITE DIRECTORY "/tmp/index_where" SELECT `_bucketname` , `_offsets` FROM default__src_src_index__ WHERE key > 80 AND key < 100
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@default__src_src_index__
+POSTHOOK: Output: /tmp/index_where
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: EXPLAIN SELECT key, value FROM src WHERE key > 80 AND key < 100 ORDER BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key, value FROM src WHERE key > 80 AND key < 100 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (AND (> (TOK_TABLE_OR_COL key) 80) (< (TOK_TABLE_OR_COL key) 100))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ src
+ TableScan
+ alias: src
+ Filter Operator
+ predicate:
+ expr: ((key > 80) and (key < 100))
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: ((key > 80) and (key < 100))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, value FROM src WHERE key > 80 AND key < 100 ORDER BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/tmp/rmelick/hive_2011-03-31_20-46-25_021_8458385140495465322/-mr-10000
+POSTHOOK: query: SELECT key, value FROM src WHERE key > 80 AND key < 100 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/tmp/rmelick/hive_2011-03-31_20-46-25_021_8458385140495465322/-mr-10000
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+82 val_82
+83 val_83
+83 val_83
+84 val_84
+84 val_84
+85 val_85
+86 val_86
+87 val_87
+90 val_90
+90 val_90
+90 val_90
+92 val_92
+95 val_95
+95 val_95
+96 val_96
+97 val_97
+97 val_97
+98 val_98
+98 val_98
+PREHOOK: query: -- test automatic usage of index in query
+SELECT * FROM src WHERE key > 80 AND key < 100 ORDER BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@default__src_src_index__
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/tmp/rmelick/hive_2011-03-31_20-46-28_200_3904715072658165263/-mr-10000
+POSTHOOK: query: -- test automatic usage of index in query
+SELECT * FROM src WHERE key > 80 AND key < 100 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@default__src_src_index__
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/tmp/rmelick/hive_2011-03-31_20-46-28_200_3904715072658165263/-mr-10000
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+82 val_82
+83 val_83
+83 val_83
+84 val_84
+84 val_84
+85 val_85
+86 val_86
+87 val_87
+90 val_90
+90 val_90
+90 val_90
+92 val_92
+95 val_95
+95 val_95
+96 val_96
+97 val_97
+97 val_97
+98 val_98
+98 val_98
+PREHOOK: query: DROP INDEX src_index on src
+PREHOOK: type: DROPINDEX
+POSTHOOK: query: DROP INDEX src_index on src
+POSTHOOK: type: DROPINDEX
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
diff --git ql/src/test/results/clientpositive/index_opt_where_partitioned.q.out ql/src/test/results/clientpositive/index_opt_where_partitioned.q.out
new file mode 100644
index 0000000..3483f9d
--- /dev/null
+++ ql/src/test/results/clientpositive/index_opt_where_partitioned.q.out
@@ -0,0 +1,185 @@
+PREHOOK: query: -- test automatic use of index on table with partitions
+CREATE INDEX src_part_index ON TABLE srcpart(key) as 'COMPACT' WITH DEFERRED REBUILD
+PREHOOK: type: CREATEINDEX
+POSTHOOK: query: -- test automatic use of index on table with partitions
+CREATE INDEX src_part_index ON TABLE srcpart(key) as 'COMPACT' WITH DEFERRED REBUILD
+POSTHOOK: type: CREATEINDEX
+PREHOOK: query: ALTER INDEX src_part_index ON srcpart REBUILD
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+PREHOOK: Output: default@default__srcpart_src_part_index__@ds=2008-04-08/hr=11
+PREHOOK: Output: default@default__srcpart_src_part_index__@ds=2008-04-08/hr=12
+PREHOOK: Output: default@default__srcpart_src_part_index__@ds=2008-04-09/hr=11
+PREHOOK: Output: default@default__srcpart_src_part_index__@ds=2008-04-09/hr=12
+POSTHOOK: query: ALTER INDEX src_part_index ON srcpart REBUILD
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+POSTHOOK: Output: default@default__srcpart_src_part_index__@ds=2008-04-08/hr=11
+POSTHOOK: Output: default@default__srcpart_src_part_index__@ds=2008-04-08/hr=12
+POSTHOOK: Output: default@default__srcpart_src_part_index__@ds=2008-04-09/hr=11
+POSTHOOK: Output: default@default__srcpart_src_part_index__@ds=2008-04-09/hr=12
+POSTHOOK: Lineage: default__srcpart_src_part_index__ PARTITION(ds=2008-04-09,hr=12)._bucketname SIMPLE [(srcpart)srcpart.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__srcpart_src_part_index__ PARTITION(ds=2008-04-09,hr=12)._offsets EXPRESSION [(srcpart)srcpart.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__srcpart_src_part_index__ PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: EXPLAIN SELECT key, value FROM srcpart WHERE key=86 ORDER BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key, value FROM srcpart WHERE key=86 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__srcpart_src_part_index__ PARTITION(ds=2008-04-09,hr=12)._bucketname SIMPLE [(srcpart)srcpart.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__srcpart_src_part_index__ PARTITION(ds=2008-04-09,hr=12)._offsets EXPRESSION [(srcpart)srcpart.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__srcpart_src_part_index__ PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME srcpart))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 86)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+ Stage-1 depends on stages: Stage-0 , consists of Stage-2, Stage-5
+ Stage-2
+ Stage-6 depends on stages: Stage-2, Stage-5
+ Stage-7 depends on stages: Stage-6
+ Stage-5
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__srcpart_src_part_index__
+ TableScan
+ alias: default__srcpart_src_part_index__
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: _bucketname
+ type: string
+ expr: _offsets
+ type: array
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-1
+ Conditional Operator
+
+ Stage: Stage-2
+ Move Operator
+ files:
+ hdfs directory: true
+ destination: file:/home/rmelick/hive/build/ql/scratchdir/hive_2011-04-06_21-31-20_130_4597666835451892250/-ext-10000
+
+ Stage: Stage-6
+ Move Operator
+ files:
+ hdfs directory: true
+ destination: file:/tmp/rmelick/hive_2011-04-06_21-31-19_891_8095743340770125817/-mr-10002
+
+ Stage: Stage-7
+ Map Reduce
+ Alias -> Map Operator Tree:
+ srcpart
+ TableScan
+ alias: srcpart
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+ file:/home/rmelick/hive/build/ql/scratchdir/hive_2011-04-06_21-31-20_130_4597666835451892250/-ext-10001
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, value FROM srcpart WHERE key=86 ORDER BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@default__srcpart_src_part_index__@ds=2008-04-08/hr=11
+PREHOOK: Input: default@default__srcpart_src_part_index__@ds=2008-04-08/hr=12
+PREHOOK: Input: default@default__srcpart_src_part_index__@ds=2008-04-09/hr=11
+PREHOOK: Input: default@default__srcpart_src_part_index__@ds=2008-04-09/hr=12
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+PREHOOK: Output: file:/tmp/rmelick/hive_2011-04-06_21-31-20_279_8676548334368581351/-mr-10000
+POSTHOOK: query: SELECT key, value FROM srcpart WHERE key=86 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@default__srcpart_src_part_index__@ds=2008-04-08/hr=11
+POSTHOOK: Input: default@default__srcpart_src_part_index__@ds=2008-04-08/hr=12
+POSTHOOK: Input: default@default__srcpart_src_part_index__@ds=2008-04-09/hr=11
+POSTHOOK: Input: default@default__srcpart_src_part_index__@ds=2008-04-09/hr=12
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+POSTHOOK: Output: file:/tmp/rmelick/hive_2011-04-06_21-31-20_279_8676548334368581351/-mr-10000
+POSTHOOK: Lineage: default__srcpart_src_part_index__ PARTITION(ds=2008-04-09,hr=12)._bucketname SIMPLE [(srcpart)srcpart.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__srcpart_src_part_index__ PARTITION(ds=2008-04-09,hr=12)._offsets EXPRESSION [(srcpart)srcpart.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__srcpart_src_part_index__ PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
+86 val_86
+86 val_86
+86 val_86
+86 val_86
+PREHOOK: query: DROP INDEX src_part_index ON srcpart
+PREHOOK: type: DROPINDEX
+POSTHOOK: query: DROP INDEX src_part_index ON srcpart
+POSTHOOK: type: DROPINDEX
+POSTHOOK: Lineage: default__srcpart_src_part_index__ PARTITION(ds=2008-04-09,hr=12)._bucketname SIMPLE [(srcpart)srcpart.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__srcpart_src_part_index__ PARTITION(ds=2008-04-09,hr=12)._offsets EXPRESSION [(srcpart)srcpart.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__srcpart_src_part_index__ PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
diff --git ql/src/test/results/clientpositive/index_opt_where_simple.q.out ql/src/test/results/clientpositive/index_opt_where_simple.q.out
new file mode 100644
index 0000000..cd41c9d
--- /dev/null
+++ ql/src/test/results/clientpositive/index_opt_where_simple.q.out
@@ -0,0 +1,376 @@
+PREHOOK: query: -- test automatic use of index
+CREATE INDEX src_index ON TABLE src(key) as 'COMPACT' WITH DEFERRED REBUILD
+PREHOOK: type: CREATEINDEX
+POSTHOOK: query: -- test automatic use of index
+CREATE INDEX src_index ON TABLE src(key) as 'COMPACT' WITH DEFERRED REBUILD
+POSTHOOK: type: CREATEINDEX
+PREHOOK: query: ALTER INDEX src_index ON src REBUILD
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@default__src_src_index__
+POSTHOOK: query: ALTER INDEX src_index ON src REBUILD
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@default__src_src_index__
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: INSERT OVERWRITE DIRECTORY "/tmp/index_result_where1" SELECT `_bucketname` , `_offsets` FROM default__src_src_index__ WHERE key=86
+PREHOOK: type: QUERY
+PREHOOK: Input: default@default__src_src_index__
+PREHOOK: Output: /tmp/index_result_where1
+POSTHOOK: query: INSERT OVERWRITE DIRECTORY "/tmp/index_result_where1" SELECT `_bucketname` , `_offsets` FROM default__src_src_index__ WHERE key=86
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@default__src_src_index__
+POSTHOOK: Output: /tmp/index_result_where1
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: EXPLAIN SELECT key, value FROM src WHERE key=86 ORDER BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key, value FROM src WHERE key=86 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 86)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ src
+ TableScan
+ alias: src
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, value FROM src WHERE key=86 ORDER BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/tmp/rmelick/hive_2011-03-31_20-41-56_159_8008023160364399783/-mr-10000
+POSTHOOK: query: SELECT key, value FROM src WHERE key=86 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/tmp/rmelick/hive_2011-03-31_20-41-56_159_8008023160364399783/-mr-10000
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+86 val_86
+PREHOOK: query: EXPLAIN SELECT key, value FROM src WHERE key=86 ORDER BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key, value FROM src WHERE key=86 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 86)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+ Stage-1 depends on stages: Stage-0 , consists of Stage-2, Stage-5
+ Stage-2
+ Stage-6 depends on stages: Stage-2, Stage-5
+ Stage-7 depends on stages: Stage-6
+ Stage-5
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__src_src_index__
+ TableScan
+ alias: default__src_src_index__
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: _bucketname
+ type: string
+ expr: _offsets
+ type: array
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-1
+ Conditional Operator
+
+ Stage: Stage-2
+ Move Operator
+ files:
+ hdfs directory: true
+ destination: file:/home/rmelick/hive/build/ql/scratchdir/hive_2011-03-31_20-41-59_421_4392048238038392412/-ext-10000
+
+ Stage: Stage-6
+ Move Operator
+ files:
+ hdfs directory: true
+ destination: file:/tmp/rmelick/hive_2011-03-31_20-41-59_343_389981196577767521/-mr-10002
+
+ Stage: Stage-7
+ Map Reduce
+ Alias -> Map Operator Tree:
+ src
+ TableScan
+ alias: src
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+ file:/home/rmelick/hive/build/ql/scratchdir/hive_2011-03-31_20-41-59_421_4392048238038392412/-ext-10001
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, value FROM src WHERE key=86 ORDER BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@default__src_src_index__
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/tmp/rmelick/hive_2011-03-31_20-41-59_506_1420619192864641384/-mr-10000
+POSTHOOK: query: SELECT key, value FROM src WHERE key=86 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@default__src_src_index__
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/tmp/rmelick/hive_2011-03-31_20-41-59_506_1420619192864641384/-mr-10000
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+86 val_86
+PREHOOK: query: EXPLAIN SELECT key, value FROM src WHERE key=86 ORDER BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key, value FROM src WHERE key=86 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 86)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+ Stage-1 depends on stages: Stage-0 , consists of Stage-2, Stage-5
+ Stage-2
+ Stage-6 depends on stages: Stage-2, Stage-5
+ Stage-7 depends on stages: Stage-6
+ Stage-5
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__src_src_index__
+ TableScan
+ alias: default__src_src_index__
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: _bucketname
+ type: string
+ expr: _offsets
+ type: array
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-1
+ Conditional Operator
+
+ Stage: Stage-2
+ Move Operator
+ files:
+ hdfs directory: true
+ destination: file:/home/rmelick/hive/build/ql/scratchdir/hive_2011-03-31_20-42-05_443_5539312497585389519/-ext-10000
+
+ Stage: Stage-6
+ Move Operator
+ files:
+ hdfs directory: true
+ destination: file:/tmp/rmelick/hive_2011-03-31_20-42-05_373_2840006505831319976/-mr-10002
+
+ Stage: Stage-7
+ Map Reduce
+ Alias -> Map Operator Tree:
+ src
+ TableScan
+ alias: src
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (key = 86)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+ file:/home/rmelick/hive/build/ql/scratchdir/hive_2011-03-31_20-42-05_443_5539312497585389519/-ext-10001
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, value FROM src WHERE key=86 ORDER BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@default__src_src_index__
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/tmp/rmelick/hive_2011-03-31_20-42-05_532_5550442449992372402/-mr-10000
+POSTHOOK: query: SELECT key, value FROM src WHERE key=86 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@default__src_src_index__
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/tmp/rmelick/hive_2011-03-31_20-42-05_532_5550442449992372402/-mr-10000
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+86 val_86
+PREHOOK: query: DROP INDEX src_index on src
+PREHOOK: type: DROPINDEX
+POSTHOOK: query: DROP INDEX src_index on src
+POSTHOOK: type: DROPINDEX
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]