From cebdae675ac90f5e5cd40bafea44ba79e12aa905 Mon Sep 17 00:00:00 2001 From: vkorukanti Date: Thu, 13 Nov 2014 14:05:31 -0800 Subject: [PATCH] HIVE-5631: Index creation fails on a table that has skew column --- .../org/apache/hadoop/hive/ql/metadata/Hive.java | 35 ++-- .../test/queries/clientpositive/index_skewtable.q | 22 +++ .../clientpositive/authorization_index.q.out | 2 - .../results/clientpositive/index_skewtable.q.out | 216 +++++++++++++++++++++ 4 files changed, 257 insertions(+), 18 deletions(-) create mode 100644 ql/src/test/queries/clientpositive/index_skewtable.q create mode 100644 ql/src/test/results/clientpositive/index_skewtable.q.out diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java index b900627..ee8d295 100644 --- ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java @@ -92,6 +92,7 @@ import org.apache.hadoop.hive.metastore.api.SetPartitionsStatsRequest; import org.apache.hadoop.hive.metastore.api.ShowCompactResponse; import org.apache.hadoop.hive.metastore.api.SkewedInfo; +import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.exec.Utilities; @@ -748,8 +749,9 @@ public void createIndex(String tableName, String indexName, String indexHandlerC throw new HiveException("Table name " + indexTblName + " already exists. Choose another name."); } - org.apache.hadoop.hive.metastore.api.StorageDescriptor storageDescriptor = baseTbl.getSd().deepCopy(); - SerDeInfo serdeInfo = storageDescriptor.getSerdeInfo(); + SerDeInfo serdeInfo = new SerDeInfo(); + serdeInfo.setName(indexTblName); + if(serde != null) { serdeInfo.setSerializationLib(serde); } else { @@ -762,6 +764,7 @@ public void createIndex(String tableName, String indexName, String indexHandlerC } } + serdeInfo.setParameters(new HashMap()); if (fieldDelim != null) { serdeInfo.getParameters().put(FIELD_DELIM, fieldDelim); serdeInfo.getParameters().put(SERIALIZATION_FORMAT, fieldDelim); @@ -788,18 +791,8 @@ public void createIndex(String tableName, String indexName, String indexHandlerC } } - storageDescriptor.setLocation(null); - if (location != null) { - storageDescriptor.setLocation(location); - } - storageDescriptor.setInputFormat(inputFormat); - storageDescriptor.setOutputFormat(outputFormat); - - Map params = new HashMap(); - List indexTblCols = new ArrayList(); List sortCols = new ArrayList(); - storageDescriptor.setBucketCols(null); int k = 0; Table metaBaseTbl = new Table(baseTbl); for (int i = 0; i < metaBaseTbl.getCols().size(); i++) { @@ -815,9 +808,6 @@ public void createIndex(String tableName, String indexName, String indexHandlerC "Check the index columns, they should appear in the table being indexed."); } - storageDescriptor.setCols(indexTblCols); - storageDescriptor.setSortCols(sortCols); - int time = (int) (System.currentTimeMillis() / 1000); org.apache.hadoop.hive.metastore.api.Table tt = null; HiveIndexHandler indexHandler = HiveUtils.getIndexHandler(this.getConf(), indexHandlerClass); @@ -851,8 +841,21 @@ public void createIndex(String tableName, String indexName, String indexHandlerC String tdname = Utilities.getDatabaseName(tableName); String ttname = Utilities.getTableName(tableName); + + StorageDescriptor indexSd = new StorageDescriptor( + indexTblCols, + location, + inputFormat, + outputFormat, + false/*compressed - not used*/, + -1/*numBuckets - default is -1 when the table has no buckets*/, + serdeInfo, + null/*bucketCols*/, + sortCols, + null/*parameters*/); + Index indexDesc = new Index(indexName, indexHandlerClass, tdname, ttname, time, time, indexTblName, - storageDescriptor, params, deferredRebuild); + indexSd, new HashMap(), deferredRebuild); if (indexComment != null) { indexDesc.getParameters().put("comment", indexComment); } diff --git ql/src/test/queries/clientpositive/index_skewtable.q ql/src/test/queries/clientpositive/index_skewtable.q new file mode 100644 index 0000000..fdf65cd --- /dev/null +++ ql/src/test/queries/clientpositive/index_skewtable.q @@ -0,0 +1,22 @@ +-- Test creating an index on skewed table + +-- Create a skew table +CREATE TABLE kv(key STRING, value STRING) SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE kv; + +-- Create and build an index +CREATE INDEX kv_index ON TABLE kv(value) AS 'COMPACT' WITH DEFERRED REBUILD; +DESCRIBE FORMATTED default__kv_kv_index__; +ALTER INDEX kv_index ON kv REBUILD; + +SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; +SET hive.optimize.index.filter=true; +SET hive.optimize.index.filter.compact.minsize=0; + +-- Run a query that uses the index +EXPLAIN SELECT * FROM kv WHERE value > '15' ORDER BY value; +SELECT * FROM kv WHERE value > '15' ORDER BY value; + +DROP INDEX kv_index ON kv; +DROP TABLE kv; diff --git ql/src/test/results/clientpositive/authorization_index.q.out ql/src/test/results/clientpositive/authorization_index.q.out index 273931a..540d11b 100644 --- ql/src/test/results/clientpositive/authorization_index.q.out +++ ql/src/test/results/clientpositive/authorization_index.q.out @@ -43,8 +43,6 @@ Compressed: No Num Buckets: -1 Bucket Columns: [] Sort Columns: [Order(col:a, order:1)] -Storage Desc Params: - serialization.format 1 PREHOOK: query: alter index t1_index on t1 rebuild PREHOOK: type: ALTERINDEX_REBUILD PREHOOK: Input: default@t1 diff --git ql/src/test/results/clientpositive/index_skewtable.q.out ql/src/test/results/clientpositive/index_skewtable.q.out new file mode 100644 index 0000000..02fd1f4 --- /dev/null +++ ql/src/test/results/clientpositive/index_skewtable.q.out @@ -0,0 +1,216 @@ +PREHOOK: query: -- Test creating an index on skewed table + +-- Create a skew table +CREATE TABLE kv(key STRING, value STRING) SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@kv +POSTHOOK: query: -- Test creating an index on skewed table + +-- Create a skew table +CREATE TABLE kv(key STRING, value STRING) SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@kv +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE kv +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@kv +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE kv +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@kv +PREHOOK: query: -- Create and build an index +CREATE INDEX kv_index ON TABLE kv(value) AS 'COMPACT' WITH DEFERRED REBUILD +PREHOOK: type: CREATEINDEX +PREHOOK: Input: default@kv +POSTHOOK: query: -- Create and build an index +CREATE INDEX kv_index ON TABLE kv(value) AS 'COMPACT' WITH DEFERRED REBUILD +POSTHOOK: type: CREATEINDEX +POSTHOOK: Input: default@kv +POSTHOOK: Output: default@default__kv_kv_index__ +PREHOOK: query: DESCRIBE FORMATTED default__kv_kv_index__ +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@default__kv_kv_index__ +POSTHOOK: query: DESCRIBE FORMATTED default__kv_kv_index__ +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@default__kv_kv_index__ +# col_name data_type comment + +value string +_bucketname string +_offsets array + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: INDEX_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [Order(col:value, order:1)] +PREHOOK: query: ALTER INDEX kv_index ON kv REBUILD +PREHOOK: type: ALTERINDEX_REBUILD +PREHOOK: Input: default@kv +PREHOOK: Output: default@default__kv_kv_index__ +POSTHOOK: query: ALTER INDEX kv_index ON kv REBUILD +POSTHOOK: type: ALTERINDEX_REBUILD +POSTHOOK: Input: default@kv +POSTHOOK: Output: default@default__kv_kv_index__ +POSTHOOK: Lineage: default__kv_kv_index__._bucketname SIMPLE [(kv)kv.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__kv_kv_index__._offsets EXPRESSION [(kv)kv.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__kv_kv_index__.value SIMPLE [(kv)kv.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: -- Run a query that uses the index +EXPLAIN SELECT * FROM kv WHERE value > '15' ORDER BY value +PREHOOK: type: QUERY +POSTHOOK: query: -- Run a query that uses the index +EXPLAIN SELECT * FROM kv WHERE value > '15' ORDER BY value +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-3 is a root stage + Stage-8 depends on stages: Stage-3 , consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-2 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-1 depends on stages: Stage-2 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-3 + Map Reduce + Map Operator Tree: + TableScan + alias: default.default__kv_kv_index__ + filterExpr: (value > '15') (type: boolean) + Filter Operator + predicate: (value > '15') (type: boolean) + Select Operator + expressions: _bucketname (type: string), _offsets (type: array) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-2 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: kv + filterExpr: (value > '15') (type: boolean) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (value > '15') (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: string) + sort order: + + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: _col0 (type: string) + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: string), KEY.reducesinkkey0 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-4 + Map Reduce + Map Operator Tree: + TableScan + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-6 + Map Reduce + Map Operator Tree: + TableScan + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT * FROM kv WHERE value > '15' ORDER BY value +PREHOOK: type: QUERY +PREHOOK: Input: default@default__kv_kv_index__ +PREHOOK: Input: default@kv +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM kv WHERE value > '15' ORDER BY value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@default__kv_kv_index__ +POSTHOOK: Input: default@kv +#### A masked pattern was here #### +8 18 +8 18 +2 22 +PREHOOK: query: DROP INDEX kv_index ON kv +PREHOOK: type: DROPINDEX +PREHOOK: Input: default@kv +POSTHOOK: query: DROP INDEX kv_index ON kv +POSTHOOK: type: DROPINDEX +POSTHOOK: Input: default@kv +PREHOOK: query: DROP TABLE kv +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@kv +PREHOOK: Output: default@kv +POSTHOOK: query: DROP TABLE kv +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@kv +POSTHOOK: Output: default@kv -- 1.8.5.2 (Apple Git-48)