diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
index b0f124b..9b7878b 100644
--- ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
+++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
@@ -77,6 +77,7 @@
 import org.apache.hadoop.hive.metastore.api.Role;
 import org.apache.hadoop.hive.metastore.api.SerDeInfo;
 import org.apache.hadoop.hive.metastore.api.SkewedInfo;
+import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.index.HiveIndexHandler;
@@ -677,8 +678,9 @@ public void createIndex(String tableName, String indexName, String indexHandlerC
         }
       }

-      org.apache.hadoop.hive.metastore.api.StorageDescriptor storageDescriptor = baseTbl.getSd().deepCopy();
-      SerDeInfo serdeInfo = storageDescriptor.getSerdeInfo();
+      StorageDescriptor baseTblSd = baseTbl.getSd();
+
+      SerDeInfo serdeInfo = new SerDeInfo();
       if(serde != null) {
         serdeInfo.setSerializationLib(serde);
       } else {
@@ -717,18 +719,8 @@ public void createIndex(String tableName, String indexName, String indexHandlerC
         }
       }

-      storageDescriptor.setLocation(null);
-      if (location != null) {
-        storageDescriptor.setLocation(location);
-      }
-      storageDescriptor.setInputFormat(inputFormat);
-      storageDescriptor.setOutputFormat(outputFormat);
-
-      Map<String, String> params = new HashMap<String, String>();
-
       List<FieldSchema> indexTblCols = new ArrayList<FieldSchema>();
       List<Order> sortCols = new ArrayList<Order>();
-      storageDescriptor.setBucketCols(null);
       int k = 0;
       Table metaBaseTbl = new Table(baseTbl);
       for (int i = 0; i < metaBaseTbl.getCols().size(); i++) {
@@ -744,9 +736,6 @@ public void createIndex(String tableName, String indexName, String indexHandlerC
             "Check the index columns, they should appear in the table being indexed.");
       }

-      storageDescriptor.setCols(indexTblCols);
-      storageDescriptor.setSortCols(sortCols);
-
       int time = (int) (System.currentTimeMillis() / 1000);
       org.apache.hadoop.hive.metastore.api.Table tt = null;
       HiveIndexHandler indexHandler = HiveUtils.getIndexHandler(this.getConf(), indexHandlerClass);
@@ -767,8 +756,20 @@ public void createIndex(String tableName, String indexName, String indexHandlerC
         throw new RuntimeException("Please specify deferred rebuild using \" WITH DEFERRED REBUILD \".");
       }

+      StorageDescriptor indexSd = new StorageDescriptor(
+          indexTblCols,
+          location,
+          inputFormat,
+          outputFormat,
+          false/*compressed - not used*/,
+          0/*numBuckets*/,
+          serdeInfo,
+          null/*bucketCols*/,
+          sortCols,
+          null/*parameters*/);
+
       Index indexDesc = new Index(indexName, indexHandlerClass, dbName, tableName, time, time, indexTblName,
-          storageDescriptor, params, deferredRebuild);
+          indexSd, new HashMap<String, String>(), deferredRebuild);
       if (indexComment != null) {
         indexDesc.getParameters().put("comment", indexComment);
       }
diff --git ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHive.java ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHive.java
index d0cbed6..8368f73 100755
--- ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHive.java
+++ ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHive.java
@@ -38,6 +38,7 @@
 import org.apache.hadoop.hive.metastore.api.Index;
 import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
+import org.apache.hadoop.hive.metastore.api.SkewedInfo;
 import org.apache.hadoop.hive.ql.index.HiveIndex;
 import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
 import org.apache.hadoop.hive.ql.session.SessionState;
@@ -384,11 +385,24 @@ public void testPartition() throws Throwable {
   }

   /**
+   * Tests creating a simple index on a skewed table.
+   *
+   * @throws Throwable
+   */
+  public void testIndexOnSkewedTable() throws Throwable {
+    testIndex(true);
+  }
+
+  /**
    * Tests creating a simple index on a simple table.
    *
    * @throws Throwable
    */
-  public void testIndex() throws Throwable {
+  public void testIndexOnSimpleTable() throws Throwable {
+    testIndex(false);
+  }
+
+  private void testIndex(boolean fSkewTable) throws Throwable {
     try{
       // create a simple table
       String tableName = "table_for_testindex";
@@ -412,6 +426,27 @@ public void testIndex() throws Throwable {
       tbl.setOutputFormatClass(HiveIgnoreKeyTextOutputFormat.class);
       tbl.setInputFormatClass(SequenceFileInputFormat.class);

+      if (fSkewTable) {
+        try {
+          List<String> skewedColNames = new ArrayList<String>();
+          List<String> skewedColValues = new ArrayList<String>();
+          List<List<String>> skewedValues = new ArrayList<List<String>>();
+
+          skewedColNames.add("col2");
+          skewedColValues.add("CC");
+          skewedColValues.add("CH");
+          skewedValues.add(skewedColValues);
+
+          SkewedInfo skewedInfo = new SkewedInfo();
+          skewedInfo.setSkewedColNames(skewedColNames);
+          skewedInfo.setSkewedColValues(skewedValues);
+          tbl.setSkewedInfo(skewedInfo);
+        } catch(HiveException e) {
+          e.printStackTrace();
+          assertTrue("Unable to set skewinfo in table: " + tableName, false);
+        }
+      }
+
       // create table
       try {
         hm.createTable(tbl);
diff --git ql/src/test/queries/clientpositive/index_skewtable.q ql/src/test/queries/clientpositive/index_skewtable.q
new file mode 100644
index 0000000..2382b79
--- /dev/null
+++ ql/src/test/queries/clientpositive/index_skewtable.q
@@ -0,0 +1,22 @@
+-- Test creating an index on skewed table
+
+-- Create a skew table
+CREATE TABLE kv(key STRING, value STRING) SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE kv;
+
+-- Create and build an index
+CREATE INDEX kv_index ON TABLE kv(value) AS 'COMPACT' WITH DEFERRED REBUILD;
+DESCRIBE FORMATTED default__kv_kv_index__;
+ALTER INDEX kv_index ON kv REBUILD;
+
+SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+SET hive.optimize.index.filter=true;
+SET hive.optimize.index.filter.compact.minsize=0;
+
+-- Run a query that uses the index
+EXPLAIN SELECT * FROM kv WHERE value > '15' ORDER BY value;
+SELECT * FROM kv WHERE value > '15' ORDER BY value;
+
+DROP INDEX kv_index ON kv;
+DROP TABLE kv;
\ No newline at end of file
diff --git ql/src/test/results/clientpositive/index_skewtable.q.out ql/src/test/results/clientpositive/index_skewtable.q.out
new file mode 100644
index 0000000..be91c82
--- /dev/null
+++ ql/src/test/results/clientpositive/index_skewtable.q.out
@@ -0,0 +1,242 @@
+PREHOOK: query: -- Test creating an index on skewed table
+
+-- Create a skew table
+CREATE TABLE kv(key STRING, value STRING) SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- Test creating an index on skewed table
+
+-- Create a skew table
+CREATE TABLE kv(key STRING, value STRING) SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@kv
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE kv
+PREHOOK: type: LOAD
+PREHOOK: Output: default@kv
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE kv
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@kv
+PREHOOK: query: -- Create and build an index
+CREATE INDEX kv_index ON TABLE kv(value) AS 'COMPACT' WITH DEFERRED REBUILD
+PREHOOK: type: CREATEINDEX
+POSTHOOK: query: -- Create and build an index
+CREATE INDEX kv_index ON TABLE kv(value) AS 'COMPACT' WITH DEFERRED REBUILD
+POSTHOOK: type: CREATEINDEX
+POSTHOOK: Output: default@default__kv_kv_index__
+PREHOOK: query: DESCRIBE FORMATTED default__kv_kv_index__
+PREHOOK: type: DESCTABLE
+POSTHOOK: query: DESCRIBE FORMATTED default__kv_kv_index__
+POSTHOOK: type: DESCTABLE
+# col_name            data_type           comment
+
+value                 string              None
+_bucketname           string
+_offsets              array<bigint>
+
+# Detailed Table Information
+Database:             default
+#### A masked pattern was here ####
+Protect Mode:         None
+Retention:            0
+#### A masked pattern was here ####
+Table Type:           INDEX_TABLE
+Table Parameters:
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library:        org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat:          org.apache.hadoop.mapred.TextInputFormat
+OutputFormat:         org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat
+Compressed:           No
+Num Buckets:          0
+Bucket Columns:       []
+Sort Columns:         [Order(col:value, order:1)]
+PREHOOK: query: ALTER INDEX kv_index ON kv REBUILD
+PREHOOK: type: ALTERINDEX_REBUILD
+PREHOOK: Input: default@kv
+PREHOOK: Output: default@default__kv_kv_index__
+POSTHOOK: query: ALTER INDEX kv_index ON kv REBUILD
+POSTHOOK: type: ALTERINDEX_REBUILD
+POSTHOOK: Input: default@kv
+POSTHOOK: Output: default@default__kv_kv_index__
+POSTHOOK: Lineage: default__kv_kv_index__._bucketname SIMPLE [(kv)kv.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__kv_kv_index__._offsets EXPRESSION [(kv)kv.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__kv_kv_index__.value SIMPLE [(kv)kv.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: -- Run a query that uses the index
+EXPLAIN SELECT * FROM kv WHERE value > '15' ORDER BY value
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Run a query that uses the index
+EXPLAIN SELECT * FROM kv WHERE value > '15' ORDER BY value
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__kv_kv_index__._bucketname SIMPLE [(kv)kv.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__kv_kv_index__._offsets EXPRESSION [(kv)kv.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__kv_kv_index__.value SIMPLE [(kv)kv.FieldSchema(name:value, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME kv))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (> (TOK_TABLE_OR_COL value) '15')) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL value)))))
+
+STAGE DEPENDENCIES:
+  Stage-3 is a root stage
+  Stage-8 depends on stages: Stage-3 , consists of Stage-5, Stage-4, Stage-6
+  Stage-5
+  Stage-2 depends on stages: Stage-5, Stage-4, Stage-7
+  Stage-1 depends on stages: Stage-2
+  Stage-4
+  Stage-6
+  Stage-7 depends on stages: Stage-6
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-3
+    Map Reduce
+      Alias -> Map Operator Tree:
+        default__kv_kv_index__
+          TableScan
+            alias: default__kv_kv_index__
+            filterExpr:
+                expr: (value > '15')
+                type: boolean
+            Filter Operator
+              predicate:
+                  expr: (value > '15')
+                  type: boolean
+              Select Operator
+                expressions:
+                      expr: _bucketname
+                      type: string
+                      expr: _offsets
+                      type: array<bigint>
+                outputColumnNames: _col0, _col1
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 1
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-8
+    Conditional Operator
+
+  Stage: Stage-5
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+  Stage: Stage-2
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        kv
+          TableScan
+            alias: kv
+            filterExpr:
+                expr: (value > '15')
+                type: boolean
+            Filter Operator
+              predicate:
+                  expr: (value > '15')
+                  type: boolean
+              Select Operator
+                expressions:
+                      expr: key
+                      type: string
+                      expr: value
+                      type: string
+                outputColumnNames: _col0, _col1
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col1
+                        type: string
+                  sort order: +
+                  tag: -1
+                  value expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: string
+      Reduce Operator Tree:
+        Extract
+          File Output Operator
+            compressed: false
+            GlobalTableId: 0
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-4
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+          TableScan
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-6
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+          TableScan
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-7
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT * FROM kv WHERE value > '15' ORDER BY value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@default__kv_kv_index__
+PREHOOK: Input: default@kv
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM kv WHERE value > '15' ORDER BY value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@default__kv_kv_index__
+POSTHOOK: Input: default@kv
+#### A masked pattern was here ####
+POSTHOOK: Lineage: default__kv_kv_index__._bucketname SIMPLE [(kv)kv.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__kv_kv_index__._offsets EXPRESSION [(kv)kv.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__kv_kv_index__.value SIMPLE [(kv)kv.FieldSchema(name:value, type:string, comment:null), ]
+8	18
+8	18
+2	22
+PREHOOK: query: DROP INDEX kv_index ON kv
+PREHOOK: type: DROPINDEX
+POSTHOOK: query: DROP INDEX kv_index ON kv
+POSTHOOK: type: DROPINDEX
+POSTHOOK: Lineage: default__kv_kv_index__._bucketname SIMPLE [(kv)kv.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__kv_kv_index__._offsets EXPRESSION [(kv)kv.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__kv_kv_index__.value SIMPLE [(kv)kv.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: DROP TABLE kv
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@kv
+PREHOOK: Output: default@kv
+POSTHOOK: query: DROP TABLE kv
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@kv
+POSTHOOK: Output: default@kv
+POSTHOOK: Lineage: default__kv_kv_index__._bucketname SIMPLE [(kv)kv.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__kv_kv_index__._offsets EXPRESSION [(kv)kv.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__kv_kv_index__.value SIMPLE [(kv)kv.FieldSchema(name:value, type:string, comment:null), ]
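
The heart of the Hive.java change above is that createIndex() no longer deep-copies the base table's StorageDescriptor (which would carry skewed-column metadata, bucketing, and other base-table storage settings into the index table); it assembles a fresh descriptor for the index table instead. Below is a minimal standalone sketch of that idea; the helper name buildIndexSd and the class IndexSdSketch are hypothetical, while the StorageDescriptor constructor argument order is taken from the patch itself.

import java.util.List;

import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;

public class IndexSdSketch {
  // Hypothetical helper: build the index table's storage descriptor from scratch
  // rather than copying the (possibly skewed) base table's descriptor.
  static StorageDescriptor buildIndexSd(List<FieldSchema> indexCols, List<Order> sortCols,
      SerDeInfo serdeInfo, String location, String inputFormat, String outputFormat) {
    // Argument order as used in the patch: cols, location, inputFormat, outputFormat,
    // compressed, numBuckets, serdeInfo, bucketCols, sortCols, parameters.
    return new StorageDescriptor(indexCols, location, inputFormat, outputFormat,
        false /* compressed - not used */, 0 /* numBuckets */, serdeInfo,
        null /* bucketCols */, sortCols, null /* parameters */);
  }
}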