From f469d7d4f5da4577ddf1ce1fc784bb8daa9a8c65 Mon Sep 17 00:00:00 2001
From: Syed Albiz
Date: Tue, 2 Aug 2011 16:04:47 -0700
Subject: [PATCH 1/1] check index staleness before letting it through

diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java
index 8295687..dcdfb9e 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java
@@ -31,6 +31,9 @@ import java.util.Stack;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.ContentSummary;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.hive.metastore.api.Index;
 import org.apache.hadoop.hive.ql.exec.FilterOperator;
 import org.apache.hadoop.hive.ql.exec.MapRedTask;
@@ -68,10 +71,12 @@ public class IndexWhereProcessor implements NodeProcessor {
 
   private static final Log LOG = LogFactory.getLog(IndexWhereProcessor.class.getName());
   private final Map<Table, List<Index>> indexes;
+  private Map<Index, Table> indexToIndexTable;
 
   public IndexWhereProcessor(Map<Table, List<Index>> indexes) {
     super();
     this.indexes = indexes;
+    this.indexToIndexTable = new HashMap<Index, Table>();
   }
 
   @Override
@@ -243,7 +248,7 @@ public class IndexWhereProcessor implements NodeProcessor {
 
     for (Partition part : queryPartitions) {
       List<Table> sourceIndexTables = getIndexTables(hive, part);
-      if (!containsPartition(hive, sourceIndexTables, part)) {
+      if (!containsPartition(hive, part)) {
         return null; // problem if it doesn't contain the partition
       }
     }
@@ -252,6 +257,23 @@ public class IndexWhereProcessor implements NodeProcessor {
   }
 
   /**
+   * return index tables associated with a given base table
+   */
+  private List<Table> getIndexTables(Hive hive, Table table) throws
+    HiveException {
+    List<Table> indexTables = new ArrayList<Table>();
+    if (indexes == null || indexes.get(table) == null) {
+      return indexTables;
+    }
+    for (Index index : indexes.get(table)) {
+      Table indexTable = hive.getTable(index.getIndexTableName());
+      indexToIndexTable.put(index, indexTable);
+      indexTables.add(indexTable);
+    }
+    return indexTables;
+  }
+
+  /**
    * return index tables associated with the base table of the partition
    */
   private List<Table> getIndexTables(Hive hive, Partition part) throws HiveException {
@@ -261,32 +283,99 @@ public class IndexWhereProcessor implements NodeProcessor {
       return indexTables;
     }
     for (Index index : indexes.get(partitionedTable)) {
-      indexTables.add(hive.getTable(index.getIndexTableName()));
+      Table indexTable = hive.getTable(index.getIndexTableName());
+      indexToIndexTable.put(index, indexTable);
+      indexTables.add(indexTable);
     }
     return indexTables;
   }
 
   /**
-   * check that every index table contains the given partition
+   * check that every index table contains the given partition and is fresh
    */
-  private boolean containsPartition(Hive hive, List<Table> indexTables, Partition part)
+  private boolean containsPartition(Hive hive, Partition part)
     throws HiveException {
     HashMap<String, String> partSpec = part.getSpec();
-    if (partSpec.isEmpty()) {
-      return true; // empty specs come from non-partitioned tables
+    if (indexes == null || indexes.get(part.getTable()) == null) {
+      return false;
     }
-    if (indexTables == null || indexTables.size() == 0) {
-      return false;
+    if (partSpec.isEmpty()) {
+      // empty specs come from non-partitioned tables
+      return isIndexTableFresh(hive, indexes.get(part.getTable()), part.getTable());
     }
-    for (Table indexTable : indexTables) {
+    for (Index index : indexes.get(part.getTable())) {
+      Table indexTable = indexToIndexTable.get(index);
       // get partitions that match the spec
       List<Partition> matchingPartitions = hive.getPartitions(indexTable, partSpec);
       if (matchingPartitions == null || matchingPartitions.size() == 0) {
         LOG.info("Index table " + indexTable + "did not contain built partition that matched " + partSpec);
         return false;
+      } else if (!isIndexPartitionFresh(hive, index, part)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Check that the index partitions on a partitioned table exist and are fresh
+   */
+  private boolean isIndexPartitionFresh(Hive hive, Index index,
+      Partition part) throws HiveException {
+    LOG.info("checking index staleness...");
+    try {
+      FileSystem partFs = part.getPartitionPath().getFileSystem(hive.getConf());
+      FileStatus partFss = partFs.getFileStatus(part.getPartitionPath());
+      String ts = index.getParameters().get(part.getSpec().toString());
+      if (ts == null) {
+        return false;
+      }
+      long indexTs = Long.parseLong(ts);
+      LOG.info(partFss.getModificationTime());
+      LOG.info(ts);
+      if (partFss.getModificationTime() > indexTs) {
+        LOG.info("index is stale on the partitions that matched " + part.getSpec());
+        return false;
+      }
+    } catch (IOException e) {
+      LOG.info("failed to grab timestamp info");
+      throw new HiveException(e);
+    }
+    return true;
+  }
+
+  /**
+   * Check that the indexes on the unpartitioned table exist and are fresh
+   */
+  private boolean isIndexTableFresh(Hive hive, List<Index> indexes, Table src)
+    throws HiveException {
+    // check that they exist
+    if (indexes == null || indexes.size() == 0) {
+      return false;
+    }
+    // check that they are not stale
+    for (Index index : indexes) {
+      LOG.info("checking index staleness...");
+      try {
+        FileSystem srcFs = src.getPath().getFileSystem(hive.getConf());
+        FileStatus srcFss = srcFs.getFileStatus(src.getPath());
+        String ts = index.getParameters().get("base_timestamp");
+        if (ts == null) {
+          return false;
+        }
+        long indexTs = Long.parseLong(ts);
+        LOG.info(srcFss.getModificationTime());
+        LOG.info(ts);
+        if (srcFss.getModificationTime() > indexTs) {
+          LOG.info("index is stale");
+          return false;
+        }
+      } catch (IOException e) {
+        LOG.info("failed to grab timestamp info");
+        throw new HiveException(e);
       }
     }
     return true;
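Note on the freshness test above: the check is deliberately conservative. A
minimal, self-contained sketch of the comparison (class and method names here
are illustrative, not part of the patch):

    import java.util.Map;

    public class StalenessCheckSketch {
      /**
       * @param indexParams index properties, as in Index.getParameters()
       * @param key         "base_timestamp", or a partition spec string such as "{foo=bar}"
       * @param dataModTime FileStatus.getModificationTime() of the indexed data
       */
      static boolean isFresh(Map<String, String> indexParams, String key, long dataModTime) {
        String ts = indexParams.get(key);  // recorded when the index was last rebuilt
        if (ts == null) {
          return false;                    // no recorded rebuild: treat the index as stale
        }
        return dataModTime <= Long.parseLong(ts);
      }
    }

Returning false for a missing or older timestamp makes the optimizer fall back
to scanning the base table, which is always correct, rather than reading a
possibly stale index.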
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java
index 3054f76..90795c2 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java
@@ -42,6 +42,8 @@ import org.antlr.runtime.tree.Tree;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
 import org.apache.hadoop.hive.metastore.Warehouse;
@@ -779,6 +781,7 @@ public class DDLSemanticAnalyzer extends BaseSemanticAnalyzer {
 
     storageFormat.fillDefaultStorageFormat(shared);
 
+
     CreateIndexDesc crtIndexDesc = new CreateIndexDesc(tableName, indexName,
         indexedCols, indexTableName, deferredRebuild, storageFormat.inputFormat,
         storageFormat.outputFormat, storageFormat.storageHandler, typeName, location, idxProps, tblProps,
@@ -814,12 +817,52 @@ public class DDLSemanticAnalyzer extends BaseSemanticAnalyzer {
     String baseTableName = unescapeIdentifier(ast.getChild(0).getText());
     String indexName = unescapeIdentifier(ast.getChild(1).getText());
     HashMap<String, String> partSpec = null;
+    Map<Map<String, String>, Long> basePartTs = new HashMap<Map<String, String>, Long>();
+    Map<String, String> props = new HashMap<String, String>();
     Tree part = ast.getChild(2);
     if (part != null) {
       partSpec = extractPartitionSpecs(part);
     }
+    AlterIndexDesc alterIdxDesc = new AlterIndexDesc(AlterIndexTypes.ADDPROPS);
+    try {
+      Table baseTbl = db.getTable(db.getCurrentDatabase(), baseTableName);
+      if (baseTbl.isPartitioned()) {
+        List<Partition> baseParts;
+        if (part != null) {
+          baseParts = db.getPartitions(baseTbl, partSpec);
+        } else {
+          baseParts = db.getPartitions(baseTbl);
+        }
+        if (baseParts != null) {
+          for (Partition p : baseParts) {
+            FileSystem fs = p.getPartitionPath().getFileSystem(db.getConf());
+            FileStatus fss = fs.getFileStatus(p.getPartitionPath());
+            basePartTs.put(p.getSpec(), fss.getModificationTime());
+          }
+        }
+      } else {
+        FileSystem fs = baseTbl.getPath().getFileSystem(db.getConf());
+        FileStatus fss = fs.getFileStatus(baseTbl.getPath());
+        basePartTs.put(null, fss.getModificationTime());
+      }
+      for (Map<String, String> spec : basePartTs.keySet()) {
+        if (spec != null) {
+          props.put(spec.toString(), basePartTs.get(spec).toString());
+        } else {
+          props.put("base_timestamp", basePartTs.get(null).toString());
+        }
+      }
+      alterIdxDesc.setProps(props);
+    } catch (Exception e) {
+      // best-effort: unreadable timestamps leave props unset, so the index reads as stale
+    }
+    alterIdxDesc.setIndexName(indexName);
+    alterIdxDesc.setBaseTableName(baseTableName);
+    alterIdxDesc.setDbName(db.getCurrentDatabase());
+
     List<Task<?>> indexBuilder = getIndexBuilderMapRed(baseTableName, indexName, partSpec);
     rootTasks.addAll(indexBuilder);
+    rootTasks.add(TaskFactory.get(new DDLWork(alterIdxDesc), conf));
   }
 
   private void analyzeAlterIndexProps(ASTNode ast)
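The handshake between the analyzer above and IndexWhereProcessor depends on the
property keys matching exactly: the rebuild path stores one modification time
per partition, keyed by the spec's toString() rendering (or under
"base_timestamp" for unpartitioned tables), and the optimizer later looks the
value up with the same rendering. A small sketch of what gets stored (the
timestamp value is hypothetical):

    import java.util.HashMap;
    import java.util.LinkedHashMap;
    import java.util.Map;

    public class RebuildTimestampSketch {
      public static void main(String[] args) {
        Map<String, String> props = new HashMap<String, String>();

        // partitioned table: one entry per partition; a LinkedHashMap keeps key
        // order stable, so the same spec renders identically at rebuild time
        // and at query time
        Map<String, String> spec = new LinkedHashMap<String, String>();
        spec.put("foo", "bar");
        props.put(spec.toString(), Long.toString(1312326287000L)); // "{foo=bar}" -> mod time

        // unpartitioned table: a single well-known key
        props.put("base_timestamp", Long.toString(1312326287000L));

        System.out.println(props);
      }
    }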
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/AlterIndexDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/AlterIndexDesc.java
index cf20b89..f0dfd00 100644
--- ql/src/java/org/apache/hadoop/hive/ql/plan/AlterIndexDesc.java
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/AlterIndexDesc.java
@@ -20,7 +20,7 @@ package org.apache.hadoop.hive.ql.plan;
 
 import java.io.Serializable;
 import java.util.ArrayList;
-import java.util.HashMap;
+import java.util.Map;
 import java.util.List;
 
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
@@ -37,7 +37,7 @@ public class AlterIndexDesc extends DDLDesc implements Serializable {
   private String indexName;
   private String baseTable;
   private String dbName;
-  private HashMap<String, String> props;
+  private Map<String, String> props;
 
   /**
    * alterIndexTypes.
@@ -121,7 +121,7 @@ public class AlterIndexDesc extends DDLDesc implements Serializable {
    * @return the props
    */
   @Explain(displayName = "properties")
-  public HashMap<String, String> getProps() {
+  public Map<String, String> getProps() {
     return props;
   }
 
@@ -129,7 +129,7 @@ public class AlterIndexDesc extends DDLDesc implements Serializable {
    * @param props
    *          the props to set
    */
-  public void setProps(HashMap<String, String> props) {
+  public void setProps(Map<String, String> props) {
     this.props = props;
   }
 }
diff --git ql/src/test/queries/clientpositive/index_stale.q ql/src/test/queries/clientpositive/index_stale.q
new file mode 100644
index 0000000..82e15b9
--- /dev/null
+++ ql/src/test/queries/clientpositive/index_stale.q
@@ -0,0 +1,20 @@
+-- test that stale indexes are not used
+
+CREATE TABLE temp(key STRING, val STRING) STORED AS TEXTFILE;
+INSERT OVERWRITE TABLE temp SELECT * FROM src WHERE key < 50;
+
+-- Build an index on temp.
+CREATE INDEX temp_index ON TABLE temp(key) as 'COMPACT' WITH DEFERRED REBUILD;
+ALTER INDEX temp_index ON temp REBUILD;
+
+SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+SET hive.optimize.index.filter=true;
+SET hive.optimize.index.filter.compact.minsize=0;
+
+-- overwrite temp table so index is out of date
+INSERT OVERWRITE TABLE temp SELECT * FROM src;
+
+-- should return correct results bypassing index
+EXPLAIN SELECT * FROM temp WHERE key = 86;
+SELECT * FROM temp WHERE key = 86;
+DROP table temp;
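Both tests assert the fallback behavior rather than an index rewrite: after the
base table is overwritten, EXPLAIN must show a plain TableScan on temp instead
of a plan that reads the index table. In IndexWhereProcessor terms this is the
"return null" path, sketched here with illustrative names:

    public class RewriteDecisionSketch {
      // A rewrite candidate is produced only when every relevant index is
      // present and fresh; a null result leaves the original full-scan plan
      // untouched (mirrors "return null; // problem if it doesn't contain
      // the partition" above).
      static <T> T chooseRewrite(T indexRewrite, boolean allIndexesFresh) {
        return allIndexesFresh ? indexRewrite : null;
      }
    }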
diff --git ql/src/test/queries/clientpositive/index_stale_partitioned.q ql/src/test/queries/clientpositive/index_stale_partitioned.q
new file mode 100644
index 0000000..e7cfeff
--- /dev/null
+++ ql/src/test/queries/clientpositive/index_stale_partitioned.q
@@ -0,0 +1,26 @@
+-- test that stale indexes are not used on partitioned tables
+
+-- Create temp, and populate it with some values in src.
+CREATE TABLE temp(key STRING, val STRING) PARTITIONED BY (foo string) STORED AS TEXTFILE;
+ALTER TABLE temp ADD PARTITION (foo = 'bar');
+INSERT OVERWRITE TABLE temp PARTITION (foo = 'bar') SELECT * FROM src WHERE key < 50;
+
+-- Build an index on temp.
+CREATE INDEX temp_index ON TABLE temp(key) as 'COMPACT' WITH DEFERRED REBUILD;
+ALTER INDEX temp_index ON temp PARTITION (foo = 'bar') REBUILD;
+
+SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+SET hive.optimize.index.filter=true;
+SET hive.optimize.index.filter.compact.minsize=0;
+
+-- overwrite temp table so index is out of date
+INSERT OVERWRITE TABLE temp PARTITION (foo = 'bar') SELECT * FROM src;
+
+-- query should not return any values
+SELECT * FROM default__temp_temp_index__ WHERE key = 86 AND foo='bar';
+EXPLAIN SELECT * FROM temp WHERE key = 86 AND foo = 'bar';
+SELECT * FROM temp WHERE key = 86 AND foo = 'bar';
+
+SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+SET hive.optimize.index.filter=false;
+DROP table temp;
diff --git ql/src/test/results/clientpositive/index_stale.q.out ql/src/test/results/clientpositive/index_stale.q.out
new file mode 100644
index 0000000..c97ea05
--- /dev/null
+++ ql/src/test/results/clientpositive/index_stale.q.out
@@ -0,0 +1,141 @@
+PREHOOK: query: -- test that stale indexes are not used
+
+CREATE TABLE temp(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- test that stale indexes are not used
+
+CREATE TABLE temp(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@temp
+PREHOOK: query: INSERT OVERWRITE TABLE temp SELECT * FROM src WHERE key < 50
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@temp
+POSTHOOK: query: INSERT OVERWRITE TABLE temp SELECT * FROM src WHERE key < 50
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@temp
+POSTHOOK: Lineage: temp.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: -- Build an index on temp.
+CREATE INDEX temp_index ON TABLE temp(key) as 'COMPACT' WITH DEFERRED REBUILD
+PREHOOK: type: CREATEINDEX
+POSTHOOK: query: -- Build an index on temp.
+CREATE INDEX temp_index ON TABLE temp(key) as 'COMPACT' WITH DEFERRED REBUILD
+POSTHOOK: type: CREATEINDEX
+POSTHOOK: Lineage: temp.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: ALTER INDEX temp_index ON temp REBUILD
+PREHOOK: type: QUERY
+PREHOOK: Input: default@temp
+PREHOOK: Output: default@default__temp_temp_index__
+POSTHOOK: query: ALTER INDEX temp_index ON temp REBUILD
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@temp
+POSTHOOK: Output: default@default__temp_temp_index__
+POSTHOOK: Lineage: default__temp_temp_index__._bucketname SIMPLE [(temp)temp.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__._offsets EXPRESSION [(temp)temp.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__.key SIMPLE [(temp)temp.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: temp.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: -- overwrite temp table so index is out of date
+INSERT OVERWRITE TABLE temp SELECT * FROM src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@temp
+POSTHOOK: query: -- overwrite temp table so index is out of date
+INSERT OVERWRITE TABLE temp SELECT * FROM src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@temp
+POSTHOOK: Lineage: default__temp_temp_index__._bucketname SIMPLE [(temp)temp.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__._offsets EXPRESSION [(temp)temp.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__.key SIMPLE [(temp)temp.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: temp.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: -- should return correct results bypassing index
+EXPLAIN SELECT * FROM temp WHERE key = 86
+PREHOOK: type: QUERY
+POSTHOOK: query: -- should return correct results bypassing index
+EXPLAIN SELECT * FROM temp WHERE key = 86
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__temp_temp_index__._bucketname SIMPLE [(temp)temp.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__._offsets EXPRESSION [(temp)temp.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__.key SIMPLE [(temp)temp.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: temp.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME temp))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 86))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        temp
+          TableScan
+            alias: temp
+            filterExpr:
+                expr: (key = 86)
+                type: boolean
+            Filter Operator
+              predicate:
+                  expr: (key = 86)
+                  type: boolean
+              Select Operator
+                expressions:
+                      expr: key
+                      type: string
+                      expr: val
+                      type: string
+                outputColumnNames: _col0, _col1
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT * FROM temp WHERE key = 86
+PREHOOK: type: QUERY
+PREHOOK: Input: default@temp
+PREHOOK: Output: file:/var/folders/5V/5V4Zq77qGD4aSK9m8V3frVsFdRU/-Tmp-/salbiz/hive_2011-08-02_15-49-52_408_6380125594660274506/-mr-10000
+POSTHOOK: query: SELECT * FROM temp WHERE key = 86
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@temp
+POSTHOOK: Output: file:/var/folders/5V/5V4Zq77qGD4aSK9m8V3frVsFdRU/-Tmp-/salbiz/hive_2011-08-02_15-49-52_408_6380125594660274506/-mr-10000
+POSTHOOK: Lineage: default__temp_temp_index__._bucketname SIMPLE [(temp)temp.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__._offsets EXPRESSION [(temp)temp.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__.key SIMPLE [(temp)temp.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: temp.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+86	val_86
+PREHOOK: query: DROP table temp
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@temp
+PREHOOK: Output: default@temp
+POSTHOOK: query: DROP table temp
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@temp
+POSTHOOK: Output: default@temp
+POSTHOOK: Lineage: default__temp_temp_index__._bucketname SIMPLE [(temp)temp.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__._offsets EXPRESSION [(temp)temp.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__.key SIMPLE [(temp)temp.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: temp.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: temp.val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
diff --git ql/src/test/results/clientpositive/index_stale_partitioned.q.out ql/src/test/results/clientpositive/index_stale_partitioned.q.out
new file mode 100644
index 0000000..20f6760
--- /dev/null
+++ ql/src/test/results/clientpositive/index_stale_partitioned.q.out
@@ -0,0 +1,167 @@
+PREHOOK: query: -- test that stale indexes are not used on partitioned tables
+
+-- Create temp, and populate it with some values in src.
+CREATE TABLE temp(key STRING, val STRING) PARTITIONED BY (foo string) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- test that stale indexes are not used on partitioned tables
+
+-- Create temp, and populate it with some values in src.
+CREATE TABLE temp(key STRING, val STRING) PARTITIONED BY (foo string) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@temp
+PREHOOK: query: ALTER TABLE temp ADD PARTITION (foo = 'bar')
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Input: default@temp
+POSTHOOK: query: ALTER TABLE temp ADD PARTITION (foo = 'bar')
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Input: default@temp
+POSTHOOK: Output: default@temp@foo=bar
+PREHOOK: query: INSERT OVERWRITE TABLE temp PARTITION (foo = 'bar') SELECT * FROM src WHERE key < 50
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@temp@foo=bar
+POSTHOOK: query: INSERT OVERWRITE TABLE temp PARTITION (foo = 'bar') SELECT * FROM src WHERE key < 50
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@temp@foo=bar
+POSTHOOK: Lineage: temp PARTITION(foo=bar).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: -- Build an index on temp.
+CREATE INDEX temp_index ON TABLE temp(key) as 'COMPACT' WITH DEFERRED REBUILD
+PREHOOK: type: CREATEINDEX
+POSTHOOK: query: -- Build an index on temp.
+CREATE INDEX temp_index ON TABLE temp(key) as 'COMPACT' WITH DEFERRED REBUILD
+POSTHOOK: type: CREATEINDEX
+POSTHOOK: Lineage: temp PARTITION(foo=bar).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: ALTER INDEX temp_index ON temp PARTITION (foo = 'bar') REBUILD
+PREHOOK: type: QUERY
+PREHOOK: Input: default@temp@foo=bar
+PREHOOK: Output: default@default__temp_temp_index__@foo=bar
+POSTHOOK: query: ALTER INDEX temp_index ON temp PARTITION (foo = 'bar') REBUILD
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@temp@foo=bar
+POSTHOOK: Output: default@default__temp_temp_index__@foo=bar
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar)._bucketname SIMPLE [(temp)temp.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar)._offsets EXPRESSION [(temp)temp.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar).key SIMPLE [(temp)temp.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: -- overwrite temp table so index is out of date
+INSERT OVERWRITE TABLE temp PARTITION (foo = 'bar') SELECT * FROM src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@temp@foo=bar
+POSTHOOK: query: -- overwrite temp table so index is out of date
+INSERT OVERWRITE TABLE temp PARTITION (foo = 'bar') SELECT * FROM src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@temp@foo=bar
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar)._bucketname SIMPLE [(temp)temp.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar)._offsets EXPRESSION [(temp)temp.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar).key SIMPLE [(temp)temp.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: -- query should not return any values
+SELECT * FROM default__temp_temp_index__ WHERE key = 86 AND foo='bar'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@default__temp_temp_index__@foo=bar
+PREHOOK: Output: file:/var/folders/5V/5V4Zq77qGD4aSK9m8V3frVsFdRU/-Tmp-/salbiz/hive_2011-08-02_16-08-10_164_2202577787586533848/-mr-10000
+POSTHOOK: query: -- query should not return any values
+SELECT * FROM default__temp_temp_index__ WHERE key = 86 AND foo='bar'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@default__temp_temp_index__@foo=bar
+POSTHOOK: Output: file:/var/folders/5V/5V4Zq77qGD4aSK9m8V3frVsFdRU/-Tmp-/salbiz/hive_2011-08-02_16-08-10_164_2202577787586533848/-mr-10000
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar)._bucketname SIMPLE [(temp)temp.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar)._offsets EXPRESSION [(temp)temp.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar).key SIMPLE [(temp)temp.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: EXPLAIN SELECT * FROM temp WHERE key = 86 AND foo = 'bar'
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT * FROM temp WHERE key = 86 AND foo = 'bar'
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar)._bucketname SIMPLE [(temp)temp.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar)._offsets EXPRESSION [(temp)temp.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar).key SIMPLE [(temp)temp.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME temp))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (AND (= (TOK_TABLE_OR_COL key) 86) (= (TOK_TABLE_OR_COL foo) 'bar')))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        temp
+          TableScan
+            alias: temp
+            filterExpr:
+                expr: ((key = 86) and (foo = 'bar'))
+                type: boolean
+            Filter Operator
+              predicate:
+                  expr: (key = 86)
+                  type: boolean
+              Select Operator
+                expressions:
+                      expr: key
+                      type: string
+                      expr: val
+                      type: string
+                      expr: foo
+                      type: string
+                outputColumnNames: _col0, _col1, _col2
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT * FROM temp WHERE key = 86 AND foo = 'bar'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@temp@foo=bar
+PREHOOK: Output: file:/var/folders/5V/5V4Zq77qGD4aSK9m8V3frVsFdRU/-Tmp-/salbiz/hive_2011-08-02_16-08-16_084_5388505361351495807/-mr-10000
+POSTHOOK: query: SELECT * FROM temp WHERE key = 86 AND foo = 'bar'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@temp@foo=bar
+POSTHOOK: Output: file:/var/folders/5V/5V4Zq77qGD4aSK9m8V3frVsFdRU/-Tmp-/salbiz/hive_2011-08-02_16-08-16_084_5388505361351495807/-mr-10000
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar)._bucketname SIMPLE [(temp)temp.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar)._offsets EXPRESSION [(temp)temp.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar).key SIMPLE [(temp)temp.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+86	val_86	bar
+PREHOOK: query: DROP table temp
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@temp
+PREHOOK: Output: default@temp
+POSTHOOK: query: DROP table temp
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@temp
+POSTHOOK: Output: default@temp
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar)._bucketname SIMPLE [(temp)temp.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar)._offsets EXPRESSION [(temp)temp.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__temp_temp_index__ PARTITION(foo=bar).key SIMPLE [(temp)temp.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: temp PARTITION(foo=bar).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
-- 
1.7.4.4