Index: ql/src/test/results/clientpositive/list_bucket_dml_13.q.out
===================================================================
--- ql/src/test/results/clientpositive/list_bucket_dml_13.q.out (revision 0)
+++ ql/src/test/results/clientpositive/list_bucket_dml_13.q.out (revision 0)
@@ -0,0 +1,418 @@
+PREHOOK: query: -- Ensure the skewed value map has escaped directory names
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+-- test the case where more than one column is skewed, e.g. columns 2 and 4 in a table with 5 columns
+create table list_bucketing_mul_col (col1 String, col2 String, col3 String, col4 String, col5 string)
+    partitioned by (ds String, hr String)
+    skewed by (col2, col4) on (('466','val_466'),('287','val_287'),('82','val_82'))
+    stored as DIRECTORIES
+    STORED AS RCFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- Ensure the skewed value map has escaped directory names
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+-- test the case where more than one column is skewed, e.g. columns 2 and 4 in a table with 5 columns
+create table list_bucketing_mul_col (col1 String, col2 String, col3 String, col4 String, col5 string)
+    partitioned by (ds String, hr String)
+    skewed by (col2, col4) on (('466','val_466'),('287','val_287'),('82','val_82'))
+    stored as DIRECTORIES
+    STORED AS RCFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@list_bucketing_mul_col
+PREHOOK: query: -- list bucketing DML
+explain extended
+insert overwrite table list_bucketing_mul_col partition (ds = '2008-04-08', hr = '2013-01-23+18:00:99')
+select 1, key, 1, value, 1 from src
+PREHOOK: type: QUERY
+POSTHOOK: query: -- list bucketing DML
+explain extended
+insert overwrite table list_bucketing_mul_col partition (ds = '2008-04-08', hr = '2013-01-23+18:00:99')
+select 1, key, 1, value, 1 from src
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME list_bucketing_mul_col) (TOK_PARTSPEC (TOK_PARTVAL ds '2008-04-08') (TOK_PARTVAL hr '2013-01-23+18:00:99')))) (TOK_SELECT (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL value)) (TOK_SELEXPR 1)))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        src
+          TableScan
+            alias: src
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: 1
+                    type: int
+                    expr: key
+                    type: string
+                    expr: 1
+                    type: int
+                    expr: value
+                    type: string
+                    expr: 1
+                    type: int
+              outputColumnNames: _col0, _col1, _col2, _col3, _col4
+              Select Operator
+                expressions:
+                      expr: UDFToString(_col0)
+                      type: string
+                      expr: _col1
+                      type: string
+                      expr: UDFToString(_col2)
+                      type: string
+                      expr: _col3
+                      type: string
+                      expr: UDFToString(_col4)
+                      type: string
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 1
+#### A masked pattern was here ####
+                  NumFilesPerFileSink: 1
+                  Static Partition Specification: ds=2008-04-08/hr=2013-01-23+18%3A00%3A99/
+#### A masked pattern was here ####
+                  table:
+                      input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+                      properties:
+                        bucket_count -1
+                        columns col1,col2,col3,col4,col5
+                        columns.types string:string:string:string:string
+#### A masked pattern was here ####
+                        name default.list_bucketing_mul_col
+                        partition_columns ds/hr
+                        serialization.ddl struct list_bucketing_mul_col { string col1, string col2, string col3, string col4, string col5}
+                        serialization.format 1
+                        serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+                      serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+                      name: default.list_bucketing_mul_col
+                  TotalFiles: 1
+                  GatherStats: true
+                  MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: src
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.src
+              numFiles 1
+              numPartitions 0
+              numRows 0
+              rawDataSize 0
+              serialization.ddl struct src { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 5812
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.src
+                numFiles 1
+                numPartitions 0
+                numRows 0
+                rawDataSize 0
+                serialization.ddl struct src { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 5812
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.src
+            name: default.src
+      Truncated Path -> Alias:
+        /src [src]
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          partition:
+            ds 2008-04-08
+            hr 2013-01-23+18:00:99
+          replace: true
+#### A masked pattern was here ####
+          table:
+              input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+              output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+              properties:
+                bucket_count -1
+                columns col1,col2,col3,col4,col5
+                columns.types string:string:string:string:string
+#### A masked pattern was here ####
+                name default.list_bucketing_mul_col
+                partition_columns ds/hr
+                serialization.ddl struct list_bucketing_mul_col { string col1, string col2, string col3, string col4, string col5}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+              name: default.list_bucketing_mul_col
+#### A masked pattern was here ####
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+#### A masked pattern was here ####
+
+
+PREHOOK: query: insert overwrite table list_bucketing_mul_col partition (ds = '2008-04-08', hr = '2013-01-23+18:00:99')
+select 1, key, 1, value, 1 from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@list_bucketing_mul_col@ds=2008-04-08/hr=2013-01-23+18%3A00%3A99
+POSTHOOK: query: insert overwrite table list_bucketing_mul_col partition (ds = '2008-04-08', hr = '2013-01-23+18:00:99')
+select 1, key, 1, value, 1 from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@list_bucketing_mul_col@ds=2008-04-08/hr=2013-01-23+18%3A00%3A99
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col1 EXPRESSION []
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col2 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col3 EXPRESSION []
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col4 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col5 EXPRESSION []
+PREHOOK: query: -- check DML result
+show partitions list_bucketing_mul_col
+PREHOOK: type: SHOWPARTITIONS
+POSTHOOK: query: -- check DML result
+show partitions list_bucketing_mul_col
+POSTHOOK: type: SHOWPARTITIONS
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col1 EXPRESSION []
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col2 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col3 EXPRESSION []
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col4 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col5 EXPRESSION []
+ds=2008-04-08/hr=2013-01-23+18%3A00%3A99
+PREHOOK: query: desc formatted list_bucketing_mul_col partition (ds='2008-04-08', hr='2013-01-23+18:00:99')
+PREHOOK: type: DESCTABLE
+POSTHOOK: query: desc formatted list_bucketing_mul_col partition (ds='2008-04-08', hr='2013-01-23+18:00:99')
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col1 EXPRESSION []
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col2 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col3 EXPRESSION []
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col4 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col5 EXPRESSION []
+# col_name data_type comment
+
+col1 string None
+col2 string None
+col3 string None
+col4 string None
+col5 string None
+
+# Partition Information
+# col_name data_type comment
+
+ds string None
+hr string None
+
+# Detailed Partition Information
+Partition Value: [2008-04-08, 2013-01-23+18:00:99]
+Database: default
+Table: list_bucketing_mul_col
+#### A masked pattern was here ####
+Protect Mode: None
+#### A masked pattern was here ####
+Partition Parameters:
+  numFiles 4
+  numRows 500
+  rawDataSize 6312
+  totalSize 7094
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Stored As SubDirectories: Yes
+Skewed Columns: [col2, col4]
+Skewed Values: [[466, val_466], [287, val_287], [82, val_82]]
+#### A masked pattern was here ####
+Skewed Value to Truncated Path: {[82, val_82]=/list_bucketing_mul_col/ds=2008-04-08/hr=2013-01-23+18%3A00%3A99/col2=82/col4=val_82, [466, val_466]=/list_bucketing_mul_col/ds=2008-04-08/hr=2013-01-23+18%3A00%3A99/col2=466/col4=val_466, [287, val_287]=/list_bucketing_mul_col/ds=2008-04-08/hr=2013-01-23+18%3A00%3A99/col2=287/col4=val_287}
+Storage Desc Params:
+  serialization.format 1
+PREHOOK: query: explain extended
+select * from list_bucketing_mul_col
+where ds='2008-04-08' and hr='2013-01-23+18:00:99' and col2 = "466" and col4 = "val_466"
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended
+select * from list_bucketing_mul_col
+where ds='2008-04-08' and hr='2013-01-23+18:00:99' and col2 = "466" and col4 = "val_466"
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col1 EXPRESSION []
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col2 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col3 EXPRESSION []
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col4 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col5 EXPRESSION []
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME list_bucketing_mul_col))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (and (and (and (= (TOK_TABLE_OR_COL ds) '2008-04-08') (= (TOK_TABLE_OR_COL hr) '2013-01-23+18:00:99')) (= (TOK_TABLE_OR_COL col2) "466")) (= (TOK_TABLE_OR_COL col4) "val_466")))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        list_bucketing_mul_col
+          TableScan
+            alias: list_bucketing_mul_col
+            GatherStats: false
+            Filter Operator
+              isSamplingPred: false
+              predicate:
+                  expr: ((col2 = '466') and (col4 = 'val_466'))
+                  type: boolean
+              Select Operator
+                expressions:
+                      expr: col1
+                      type: string
+                      expr: col2
+                      type: string
+                      expr: col3
+                      type: string
+                      expr: col4
+                      type: string
+                      expr: col5
+                      type: string
+                      expr: ds
+                      type: string
+                      expr: hr
+                      type: string
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+#### A masked pattern was here ####
+                  NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                      properties:
+                        columns _col0,_col1,_col2,_col3,_col4,_col5,_col6
+                        columns.types string:string:string:string:string:string:string
+                        escape.delim \
+                        serialization.format 1
+                  TotalFiles: 1
+                  GatherStats: false
+                  MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: col4=val_466
+            input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+            output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+            partition values:
+              ds 2008-04-08
+              hr 2013-01-23+18:00:99
+            properties:
+              bucket_count -1
+              columns col1,col2,col3,col4,col5
+              columns.types string:string:string:string:string
+#### A masked pattern was here ####
+              name default.list_bucketing_mul_col
+              numFiles 4
+              numRows 500
+              partition_columns ds/hr
+              rawDataSize 6312
+              serialization.ddl struct list_bucketing_mul_col { string col1, string col2, string col3, string col4, string col5}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+              totalSize 7094
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+
+              input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+              output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+              properties:
+                bucket_count -1
+                columns col1,col2,col3,col4,col5
+                columns.types string:string:string:string:string
+#### A masked pattern was here ####
+                name default.list_bucketing_mul_col
+                numFiles 4
+                numPartitions 1
+                numRows 500
+                partition_columns ds/hr
+                rawDataSize 6312
+                serialization.ddl struct list_bucketing_mul_col { string col1, string col2, string col3, string col4, string col5}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+                totalSize 7094
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+              name: default.list_bucketing_mul_col
+            name: default.list_bucketing_mul_col
+      Truncated Path -> Alias:
+        /list_bucketing_mul_col/ds=2008-04-08/hr=2013-01-23+18%3A00%3A99/col2=466/col4=val_466 [list_bucketing_mul_col]
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: select * from list_bucketing_mul_col
+where ds='2008-04-08' and hr='2013-01-23+18:00:99' and col2 = "466" and col4 = "val_466"
+PREHOOK: type: QUERY
+PREHOOK: Input: default@list_bucketing_mul_col
+PREHOOK: Input: default@list_bucketing_mul_col@ds=2008-04-08/hr=2013-01-23+18%3A00%3A99
+#### A masked pattern was here ####
+POSTHOOK: query: select * from list_bucketing_mul_col
+where ds='2008-04-08' and hr='2013-01-23+18:00:99' and col2 = "466" and col4 = "val_466"
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@list_bucketing_mul_col
+POSTHOOK: Input: default@list_bucketing_mul_col@ds=2008-04-08/hr=2013-01-23+18%3A00%3A99
+#### A masked pattern was here ####
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col1 EXPRESSION []
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col2 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col3 EXPRESSION []
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col4 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col5 EXPRESSION []
+1 466 1 val_466 1 2008-04-08 2013-01-23+18:00:99
+1 466 1 val_466 1 2008-04-08 2013-01-23+18:00:99
+1 466 1 val_466 1 2008-04-08 2013-01-23+18:00:99
+PREHOOK: query: drop table list_bucketing_mul_col
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@list_bucketing_mul_col
+PREHOOK: Output: default@list_bucketing_mul_col
+POSTHOOK: query: drop table list_bucketing_mul_col
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@list_bucketing_mul_col
+POSTHOOK: Output: default@list_bucketing_mul_col
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col1 EXPRESSION []
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col2 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col3 EXPRESSION []
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col4 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=2013-01-23+18:00:99).col5 EXPRESSION []
Index: ql/src/test/queries/clientpositive/list_bucket_dml_13.q
===================================================================
--- ql/src/test/queries/clientpositive/list_bucket_dml_13.q (revision 0)
+++ ql/src/test/queries/clientpositive/list_bucket_dml_13.q (revision 0)
@@ -0,0 +1,36 @@
+set hive.mapred.supports.subdirectories=true;
+set mapred.input.dir.recursive=true;
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
+
+-- Ensure the skewed value map has escaped directory names
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+-- test the case where more than one column is skewed, e.g. columns 2 and 4 in a table with 5 columns
+create table list_bucketing_mul_col (col1 String, col2 String, col3 String, col4 String, col5 string)
+    partitioned by (ds String, hr String)
+    skewed by (col2, col4) on (('466','val_466'),('287','val_287'),('82','val_82'))
+    stored as DIRECTORIES
+    STORED AS RCFILE;
+
+-- list bucketing DML
+explain extended
+insert overwrite table list_bucketing_mul_col partition (ds = '2008-04-08', hr = '2013-01-23+18:00:99')
+select 1, key, 1, value, 1 from src;
+
+insert overwrite table list_bucketing_mul_col partition (ds = '2008-04-08', hr = '2013-01-23+18:00:99')
+select 1, key, 1, value, 1 from src;
+
+-- check DML result
+show partitions list_bucketing_mul_col;
+desc formatted list_bucketing_mul_col partition (ds='2008-04-08', hr='2013-01-23+18:00:99');
+
+set hive.optimize.listbucketing=true;
+explain extended
+select * from list_bucketing_mul_col
+where ds='2008-04-08' and hr='2013-01-23+18:00:99' and col2 = "466" and col4 = "val_466";
+select * from list_bucketing_mul_col
+where ds='2008-04-08' and hr='2013-01-23+18:00:99' and col2 = "466" and col4 = "val_466";
+
+drop table list_bucketing_mul_col;
Index: ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java (revision 1438236)
+++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java (working copy)
@@ -1282,7 +1282,7 @@
       }
       if ((skewedValue.size() > 0) && (skewedValue.size() == skewedInfo.getSkewedColNames().size())
           && !skewedColValueLocationMaps.containsKey(skewedValue)) {
-        skewedColValueLocationMaps.put(skewedValue, lbDirName);
+        skewedColValueLocationMaps.put(skewedValue, lbDirName);
      }
    }
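
Reviewer note on the fix: per the test comment, the skewed-value-to-location map should hold the escaped directory name. Storing lbdPath.toString() rather than the raw lbDirName achieves that, and the golden output asserts it: both the partition name and the "Skewed Value to Truncated Path" entries show hr=2013-01-23+18%3A00%3A99, with ':' percent-encoded as %3A. The standalone sketch below only illustrates that encoding; it is not Hive's FileUtils.escapePathName, and its RESERVED character set is an assumption chosen so that ':' is escaped while '+' and '-' pass through, matching the output above.

import java.util.Arrays;
import java.util.List;

public class PartitionPathEscapeDemo {
    // Assumed reserved set, for illustration only; Hive's real list differs
    // in detail. ':' is included (hence %3A below) while '+' is not.
    private static final String RESERVED = "\"#%':/=?\\{[]^";

    // Percent-encode any reserved or control character in one path component.
    static String escapePathName(String component) {
        StringBuilder sb = new StringBuilder(component.length());
        for (char c : component.toCharArray()) {
            if (c < 0x20 || RESERVED.indexOf(c) >= 0) {
                sb.append('%').append(String.format("%02X", (int) c));
            } else {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    public static void main(String[] args) {
        // The partition spec used by the test.
        List<String> cols = Arrays.asList("ds", "hr");
        List<String> vals = Arrays.asList("2008-04-08", "2013-01-23+18:00:99");
        StringBuilder path = new StringBuilder();
        for (int i = 0; i < cols.size(); i++) {
            if (i > 0) {
                path.append('/');
            }
            path.append(cols.get(i)).append('=').append(escapePathName(vals.get(i)));
        }
        // Prints: ds=2008-04-08/hr=2013-01-23+18%3A00%3A99
        System.out.println(path);
    }
}

Since lbdPath evidently refers to the list-bucketing directory as it exists on disk (already in escaped form), keying the map off it keeps lookups consistent with what the optimized query above resolves, i.e. the Truncated Path -> Alias entry ending in col2=466/col4=val_466.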