diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
index 4b10e8974e..bd2ffad868 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
@@ -174,9 +174,6 @@ protected void optimizeOperatorPlan(ParseContext pCtx, Set<ReadEntity> inputs,
     runStatsAnnotation(procCtx);
     perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Setup stats in the operator plan");
 
-    // Update bucketing version of ReduceSinkOp if needed
-    updateBucketingVersionForUpgrade(procCtx);
-
     // run Sorted dynamic partition optimization
     if(HiveConf.getBoolVar(procCtx.conf, HiveConf.ConfVars.DYNAMICPARTITIONING) &&
         HiveConf.getVar(procCtx.conf, HiveConf.ConfVars.DYNAMICPARTITIONINGMODE).equals("nonstrict") &&
@@ -229,6 +226,12 @@ protected void optimizeOperatorPlan(ParseContext pCtx, Set<ReadEntity> inputs,
       new ConstantPropagate(ConstantPropagateOption.SHORTCUT).transform(procCtx.parseContext);
     }
 
+    // ATTENTION : DO NOT, I REPEAT, DO NOT WRITE ANYTHING AFTER updateBucketingVersionForUpgrade()
+    // ANYTHING WHICH NEEDS TO BE ADDED MUST BE ADDED ABOVE
+
+    // Update bucketing version of ReduceSinkOp if needed
+    updateBucketingVersionForUpgrade(procCtx);
+
   }
 
   private void runCycleAnalysisForPartitionPruning(OptimizeTezProcContext procCtx,
@@ -1834,22 +1837,6 @@ private void updateBucketingVersionForUpgrade(OptimizeTezProcContext procCtx) {
 
     for (FileSinkOperator fsOp : fsOpsAll) {
       Operator<?> parentOfFS = fsOp.getParentOperators().get(0);
-      if (parentOfFS instanceof GroupByOperator) {
-        GroupByOperator gbyOp = (GroupByOperator) parentOfFS;
-        List<String> aggs = gbyOp.getConf().getAggregatorStrings();
-        boolean compute_stats = false;
-        for (String agg : aggs) {
-          if (agg.equalsIgnoreCase("compute_stats")) {
-            compute_stats = true;
-            break;
-          }
-        }
-        if (compute_stats) {
-          continue;
-        }
-      }
-
-      // Not compute_stats
       Set<ReduceSinkOperator> rsOps = OperatorUtils.findOperatorsUpstream(parentOfFS, ReduceSinkOperator.class);
       if (rsOps.isEmpty()) {
         continue;
diff --git a/ql/src/test/queries/clientpositive/murmur_hash_migration.q b/ql/src/test/queries/clientpositive/murmur_hash_migration.q
index 2b8da9f683..b20ce6797b 100644
--- a/ql/src/test/queries/clientpositive/murmur_hash_migration.q
+++ b/ql/src/test/queries/clientpositive/murmur_hash_migration.q
@@ -59,3 +59,24 @@ select t1.key, t1.value, t2.key, t2.value from srcbucket_mapjoin_n18 t1, srcbuck
 explain
 select t1.key, t1.value, t2.key, t2.value from tab_part_n11 t1, tab_n10 t2 where t1.key = t2.key order by t1.key, t1.value, t2.key, t2.value;
 select t1.key, t1.value, t2.key, t2.value from tab_part_n11 t1, tab_n10 t2 where t1.key = t2.key order by t1.key, t1.value, t2.key, t2.value;
+
+
+set hive.optimize.ppd=true;
+set hive.optimize.index.filter=true;
+set hive.tez.bucket.pruning=true;
+set hive.fetch.task.conversion=none;
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+
+set hive.exec.dynamic.partition.mode=nonstrict;
+
+create transactional table acid_ptn_bucket1 (a int, b int) partitioned by(ds string)
+clustered by (a) into 2 buckets stored as ORC
+TBLPROPERTIES('bucketing_version'='1', 'transactional'='true', 'transactional_properties'='default');
+
+explain extended insert into acid_ptn_bucket1 partition (ds) values(1,2,'today'),(1,3,'today'),(1,4,'yesterday'),(2,2,'yesterday'),(2,3,'today'),(2,4,'today');
+insert into acid_ptn_bucket1 partition (ds) values(1,2,'today'),(1,3,'today'),(1,4,'yesterday'),(2,2,'yesterday'),(2,3,'today'),(2,4,'today');
+alter table acid_ptn_bucket1 add columns(c int);
+insert into acid_ptn_bucket1 partition (ds) values(3,2,1000,'yesterday'),(3,3,1001,'today'),(3,4,1002,'yesterday'),(4,2,1003,'today'), (4,3,1004,'yesterday'),(4,4,1005,'today');
+select ROW__ID, * from acid_ptn_bucket1 where ROW__ID.bucketid = 536870912 and ds='today';
+select ROW__ID, * from acid_ptn_bucket1 where ROW__ID.bucketid = 536936448 and ds='today';
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/llap/dynpart_sort_opt_vectorization.q.out
index 5a2cd47381..44a2b8b645 100644
--- a/ql/src/test/results/clientpositive/llap/dynpart_sort_opt_vectorization.q.out
+++ b/ql/src/test/results/clientpositive/llap/dynpart_sort_opt_vectorization.q.out
@@ -1329,7 +1329,7 @@ Partition Parameters:
 	numFiles            	8                   
 	numRows             	32                  
 	rawDataSize         	640                 
-	totalSize           	4670                
+	totalSize           	4648                
 #### A masked pattern was here ####
 
 # Storage Information
@@ -1407,7 +1407,7 @@ Partition Parameters:
 	numFiles            	8                   
 	numRows             	32                  
 	rawDataSize         	640                 
-	totalSize           	4656                
+	totalSize           	4658                
 #### A masked pattern was here ####
 
 # Storage Information
diff --git a/ql/src/test/results/clientpositive/llap/murmur_hash_migration.q.out b/ql/src/test/results/clientpositive/llap/murmur_hash_migration.q.out
index 5343628252..7c314a38a6 100644
--- a/ql/src/test/results/clientpositive/llap/murmur_hash_migration.q.out
+++ b/ql/src/test/results/clientpositive/llap/murmur_hash_migration.q.out
@@ -648,3 +648,251 @@ POSTHOOK: Input: default@tab_part_n11@ds=2008-04-08
 417	val_417	417	val_417
 417	val_417	417	val_417
 446	val_446	446	val_446
+PREHOOK: query: create transactional table acid_ptn_bucket1 (a int, b int) partitioned by(ds string)
+clustered by (a) into 2 buckets stored as ORC
+TBLPROPERTIES('bucketing_version'='1', 'transactional'='true', 'transactional_properties'='default')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@acid_ptn_bucket1
+POSTHOOK: query: create transactional table acid_ptn_bucket1 (a int, b int) partitioned by(ds string)
+clustered by (a) into 2 buckets stored as ORC
+TBLPROPERTIES('bucketing_version'='1', 'transactional'='true', 'transactional_properties'='default')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@acid_ptn_bucket1
+PREHOOK: query: explain extended insert into acid_ptn_bucket1 partition (ds) values(1,2,'today'),(1,3,'today'),(1,4,'yesterday'),(2,2,'yesterday'),(2,3,'today'),(2,4,'today')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@acid_ptn_bucket1
+POSTHOOK: query: explain extended insert into acid_ptn_bucket1 partition (ds) values(1,2,'today'),(1,3,'today'),(1,4,'yesterday'),(2,2,'yesterday'),(2,3,'today'),(2,4,'today')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+  Stage-3 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: _dummy_table
+                  Row Limit Per Split: 1
+                  Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE
+                  GatherStats: false
+                  Select Operator
+                    expressions: array(const struct(1,2,'today'),const struct(1,3,'today'),const struct(1,4,'yesterday'),const struct(2,2,'yesterday'),const struct(2,3,'today'),const struct(2,4,'today')) (type: array<struct<col1:int,col2:int,col3:string>>)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE
+                    UDTF Operator
+                      Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE
+                      function name: inline
+                      Select Operator
+                        expressions: col1 (type: int), col2 (type: int), col3 (type: string)
+                        outputColumnNames: _col0, _col1, _col2
+                        Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          key expressions: _col2 (type: string), _bucket_number (type: string)
+                          null sort order: aa
+                          sort order: ++
+                          Map-reduce partition columns: _col2 (type: string)
+                          tag: -1
+                          value expressions: _col0 (type: int), _col1 (type: int)
+                          auto parallelism: true
+            Execution mode: llap
+            LLAP IO: no inputs
+            Path -> Alias:
+#### A masked pattern was here ####
+            Path -> Partition:
+#### A masked pattern was here ####
+                Partition
+                  base file name: dummy_path
+                  input format: org.apache.hadoop.hive.ql.io.NullRowsInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    bucket_count -1
+                    bucketing_version 2
+                    column.name.delimiter ,
+                    columns 
+                    columns.comments 
+                    columns.types 
+#### A masked pattern was here ####
+                    name _dummy_database._dummy_table
+                    serialization.ddl struct _dummy_table { }
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.NullStructSerDe
+                  serde: org.apache.hadoop.hive.serde2.NullStructSerDe
+                
+                    input format: org.apache.hadoop.hive.ql.io.NullRowsInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    properties:
+                      bucket_count -1
+                      bucketing_version 2
+                      column.name.delimiter ,
+                      columns 
+                      columns.comments 
+                      columns.types 
+#### A masked pattern was here ####
+                      name _dummy_database._dummy_table
+                      serialization.ddl struct _dummy_table { }
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.NullStructSerDe
+                    serde: org.apache.hadoop.hive.serde2.NullStructSerDe
+                    name: _dummy_database._dummy_table
+                  name: _dummy_database._dummy_table
+            Truncated Path -> Alias:
+#### A masked pattern was here ####
+        Reducer 2 
+            Execution mode: vectorized, llap
+            Needs Tagging: false
+            Reduce Operator Tree:
+              Select Operator
+                expressions: VALUE._col0 (type: int), VALUE._col1 (type: int), KEY._col2 (type: string), KEY._bucket_number (type: string)
+                outputColumnNames: _col0, _col1, _col2, _bucket_number
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 1
+#### A masked pattern was here ####
+                  Dp Sort State: PARTITION_BUCKET_SORTED
+                  NumFilesPerFileSink: 1
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+#### A masked pattern was here ####
+                  table:
+                      input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                      properties:
+                        bucket_count 2
+                        bucket_field_name a
+                        bucketing_version 1
+                        column.name.delimiter ,
+                        columns a,b
+                        columns.comments 
+                        columns.types int:int
+#### A masked pattern was here ####
+                        name default.acid_ptn_bucket1
+                        partition_columns ds
+                        partition_columns.types string
+                        serialization.ddl struct acid_ptn_bucket1 { i32 a, i32 b}
+                        serialization.format 1
+                        serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                        transactional true
+                        transactional_properties default
+#### A masked pattern was here ####
+                      serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                      name: default.acid_ptn_bucket1
+                  TotalFiles: 1
+                  Write Type: INSERT
+                  GatherStats: true
+                  MultiFileSpray: false
+
+  Stage: Stage-2
+    Dependency Collection
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          partition:
+            ds 
+          replace: false
+#### A masked pattern was here ####
+          table:
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+              output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+              properties:
+                bucket_count 2
+                bucket_field_name a
+                bucketing_version 1
+                column.name.delimiter ,
+                columns a,b
+                columns.comments 
+                columns.types int:int
+#### A masked pattern was here ####
+                name default.acid_ptn_bucket1
+                partition_columns ds
+                partition_columns.types string
+                serialization.ddl struct acid_ptn_bucket1 { i32 a, i32 b}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                transactional true
+                transactional_properties default
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              name: default.acid_ptn_bucket1
+          Write Type: INSERT
+
+  Stage: Stage-3
+    Stats Work
+      Basic Stats Work:
+#### A masked pattern was here ####
+
+PREHOOK: query: insert into acid_ptn_bucket1 partition (ds) values(1,2,'today'),(1,3,'today'),(1,4,'yesterday'),(2,2,'yesterday'),(2,3,'today'),(2,4,'today')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@acid_ptn_bucket1
+POSTHOOK: query: insert into acid_ptn_bucket1 partition (ds) values(1,2,'today'),(1,3,'today'),(1,4,'yesterday'),(2,2,'yesterday'),(2,3,'today'),(2,4,'today')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@acid_ptn_bucket1@ds=today
+POSTHOOK: Output: default@acid_ptn_bucket1@ds=yesterday
+POSTHOOK: Lineage: acid_ptn_bucket1 PARTITION(ds=today).a SCRIPT []
+POSTHOOK: Lineage: acid_ptn_bucket1 PARTITION(ds=today).b SCRIPT []
+POSTHOOK: Lineage: acid_ptn_bucket1 PARTITION(ds=yesterday).a SCRIPT []
+POSTHOOK: Lineage: acid_ptn_bucket1 PARTITION(ds=yesterday).b SCRIPT []
+PREHOOK: query: alter table acid_ptn_bucket1 add columns(c int)
+PREHOOK: type: ALTERTABLE_ADDCOLS
+PREHOOK: Input: default@acid_ptn_bucket1
+PREHOOK: Output: default@acid_ptn_bucket1
+POSTHOOK: query: alter table acid_ptn_bucket1 add columns(c int)
+POSTHOOK: type: ALTERTABLE_ADDCOLS
+POSTHOOK: Input: default@acid_ptn_bucket1
+POSTHOOK: Output: default@acid_ptn_bucket1
+PREHOOK: query: insert into acid_ptn_bucket1 partition (ds) values(3,2,1000,'yesterday'),(3,3,1001,'today'),(3,4,1002,'yesterday'),(4,2,1003,'today'), (4,3,1004,'yesterday'),(4,4,1005,'today')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@acid_ptn_bucket1
+POSTHOOK: query: insert into acid_ptn_bucket1 partition (ds) values(3,2,1000,'yesterday'),(3,3,1001,'today'),(3,4,1002,'yesterday'),(4,2,1003,'today'), (4,3,1004,'yesterday'),(4,4,1005,'today')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@acid_ptn_bucket1@ds=today
+POSTHOOK: Output: default@acid_ptn_bucket1@ds=yesterday
+POSTHOOK: Lineage: acid_ptn_bucket1 PARTITION(ds=today).a SCRIPT []
+POSTHOOK: Lineage: acid_ptn_bucket1 PARTITION(ds=today).b SCRIPT []
+POSTHOOK: Lineage: acid_ptn_bucket1 PARTITION(ds=today).c SCRIPT []
+POSTHOOK: Lineage: acid_ptn_bucket1 PARTITION(ds=yesterday).a SCRIPT []
+POSTHOOK: Lineage: acid_ptn_bucket1 PARTITION(ds=yesterday).b SCRIPT []
+POSTHOOK: Lineage: acid_ptn_bucket1 PARTITION(ds=yesterday).c SCRIPT []
+PREHOOK: query: select ROW__ID, * from acid_ptn_bucket1 where ROW__ID.bucketid = 536870912 and ds='today'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@acid_ptn_bucket1
+PREHOOK: Input: default@acid_ptn_bucket1@ds=today
+#### A masked pattern was here ####
+POSTHOOK: query: select ROW__ID, * from acid_ptn_bucket1 where ROW__ID.bucketid = 536870912 and ds='today'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@acid_ptn_bucket1
+POSTHOOK: Input: default@acid_ptn_bucket1@ds=today
+#### A masked pattern was here ####
+{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0}	4	2	1003	today
+{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1}	4	4	1005	today
+{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0}	2	3	NULL	today
+{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1}	2	4	NULL	today
+PREHOOK: query: select ROW__ID, * from acid_ptn_bucket1 where ROW__ID.bucketid = 536936448 and ds='today'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@acid_ptn_bucket1
+PREHOOK: Input: default@acid_ptn_bucket1@ds=today
+#### A masked pattern was here ####
+POSTHOOK: query: select ROW__ID, * from acid_ptn_bucket1 where ROW__ID.bucketid = 536936448 and ds='today'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@acid_ptn_bucket1
+POSTHOOK: Input: default@acid_ptn_bucket1@ds=today
+#### A masked pattern was here ####
+{"writeid":### Masked writeid ###,"bucketid":536936448,"rowid":0}	3	3	1001	today
+{"writeid":### Masked writeid ###,"bucketid":536936448,"rowid":0}	1	2	NULL	today
+{"writeid":### Masked writeid ###,"bucketid":536936448,"rowid":1}	1	3	NULL	today
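
Note on the two bucketid literals in the final queries: 536870912 and 536936448 are encoded ACID bucket properties, not plain bucket numbers. A minimal sketch of the decoding follows, assuming the V1 bit layout of org.apache.hadoop.hive.ql.io.BucketCodec (top 3 bits = codec version, 12-bit bucket id starting at bit 16, statement id in the low bits); the class below is illustrative and not part of the patch.

// Illustrative only -- not part of this patch.
// Decodes the ROW__ID.bucketid values used in the queries above, assuming
// the BucketCodec V1 layout: bits 29-31 = codec version, bits 16-27 = bucket id.
public class BucketPropertyDecode {
  static int codecVersion(int bucketProperty) {
    return bucketProperty >>> 29;           // both values above yield 1 (codec V1)
  }
  static int bucketId(int bucketProperty) {
    return (bucketProperty >>> 16) & 0xFFF; // extract the 12-bit bucket id field
  }
  public static void main(String[] args) {
    System.out.println(bucketId(536870912)); // 0 -> rows written to bucket 0
    System.out.println(bucketId(536936448)); // 1 -> rows written to bucket 1
  }
}

Under that layout the two queries select the rows of bucket 0 and bucket 1 respectively, which is why each returns rows from both the pre- and post-ALTER inserts of the ds='today' partition.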