diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index bd25bc7cad..265cffdbb1 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1709,7 +1709,7 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal "This many percentage of rows will be estimated as number of nulls in absence of statistics."), HIVESTATSAUTOGATHER("hive.stats.autogather", true, "A flag to gather statistics (only basic) automatically during the INSERT OVERWRITE command."), - HIVESTATSCOLAUTOGATHER("hive.stats.column.autogather", false, + HIVESTATSCOLAUTOGATHER("hive.stats.column.autogather", true, "A flag to gather column statistics automatically."), HIVESTATSDBCLASS("hive.stats.dbclass", "fs", new PatternSet("custom", "fs"), "The storage that stores temporary Hive statistics. In filesystem based statistics collection ('fs'), \n" + diff --git data/conf/hive-site.xml data/conf/hive-site.xml index a205b8c569..05d6e8ac0e 100644 --- data/conf/hive-site.xml +++ data/conf/hive-site.xml @@ -302,12 +302,15 @@ true - hive.llap.io.allocator.direct false + + hive.stats.column.autogather + true + hive.materializedview.rewriting diff --git ql/src/test/queries/clientpositive/autoColumnStats_3a.q ql/src/test/queries/clientpositive/autoColumnStats_3a.q new file mode 100644 index 0000000000..516d6b89b3 --- /dev/null +++ ql/src/test/queries/clientpositive/autoColumnStats_3a.q @@ -0,0 +1,34 @@ +set hive.stats.column.autogather=false; +set hive.stats.fetch.column.stats=true; +set hive.exec.dynamic.partition=true; +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.auto.convert.join=true; +set hive.join.emit.interval=2; +set hive.auto.convert.join.noconditionaltask=true; +set hive.auto.convert.join.noconditionaltask.size=10000; +set hive.auto.convert.sortmerge.join.bigtable.selection.policy = 
org.apache.hadoop.hive.ql.optimizer.TableSizeBasedBigTableSelectorForAutoSMJ; +set hive.optimize.bucketingsorting=false; + +set hive.stats.column.autogather=true; + +create table if not exists nzhang_part14 (key string, value string) + partitioned by (ds string, hr string); + + +desc formatted nzhang_part14; +analyze table nzhang_part14 partition (ds,hr) compute statistics for columns; +desc formatted nzhang_part14; + +insert into table nzhang_part14 partition(ds, hr) +select key, value, ds, hr from ( + select '1' as ds,'3' as hr,'k' as key, 'v' as value + union all + select '2' as ds,'1' as hr,'k' as key, 'v' as value +) T; + +desc formatted nzhang_part14 partition(ds='1', hr='3'); + +desc formatted nzhang_part14 partition(ds='2', hr='1'); + +desc formatted nzhang_part14; + diff --git ql/src/test/queries/clientpositive/bucket_map_join_tez2.q ql/src/test/queries/clientpositive/bucket_map_join_tez2.q index b79d89d67a..2ce859f7c6 100644 --- ql/src/test/queries/clientpositive/bucket_map_join_tez2.q +++ ql/src/test/queries/clientpositive/bucket_map_join_tez2.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.strict.checks.bucketing=false; set hive.mapred.mode=nonstrict; diff --git ql/src/test/queries/clientpositive/bucket_num_reducers.q ql/src/test/queries/clientpositive/bucket_num_reducers.q index 06f334e833..5c5008eea7 100644 --- ql/src/test/queries/clientpositive/bucket_num_reducers.q +++ ql/src/test/queries/clientpositive/bucket_num_reducers.q @@ -1,4 +1,4 @@ -; +set hive.stats.column.autogather=false; set hive.exec.mode.local.auto=false; set mapred.reduce.tasks = 10; diff --git ql/src/test/queries/clientpositive/combine1.q ql/src/test/queries/clientpositive/combine1.q index 3bcb8b19c1..b300830884 100644 --- ql/src/test/queries/clientpositive/combine1.q +++ ql/src/test/queries/clientpositive/combine1.q @@ -7,6 +7,8 @@ set mapred.max.split.size=256; set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec; +set 
hive.stats.column.autogather=false; + -- SORT_QUERY_RESULTS create table combine1_1(key string, value string) stored as textfile; diff --git ql/src/test/queries/clientpositive/correlationoptimizer5.q ql/src/test/queries/clientpositive/correlationoptimizer5.q index 45b8cb955d..002fb12e22 100644 --- ql/src/test/queries/clientpositive/correlationoptimizer5.q +++ ql/src/test/queries/clientpositive/correlationoptimizer5.q @@ -1,3 +1,5 @@ +set hive.stats.column.autogather=false; +-- Currently, a query with multiple FileSinkOperators is not supported. set hive.mapred.mode=nonstrict; CREATE TABLE T1(key INT, val STRING); LOAD DATA LOCAL INPATH '../../data/files/kv1.txt' INTO TABLE T1; diff --git ql/src/test/queries/clientpositive/encryption_insert_values.q ql/src/test/queries/clientpositive/encryption_insert_values.q index 2dd3e9ad1d..c8d1d519f3 100644 --- ql/src/test/queries/clientpositive/encryption_insert_values.q +++ ql/src/test/queries/clientpositive/encryption_insert_values.q @@ -1,4 +1,5 @@ -- SORT_QUERY_RESULTS; +set hive.stats.column.autogather=false; DROP TABLE IF EXISTS encrypted_table PURGE; CREATE TABLE encrypted_table (key INT, value STRING) LOCATION '${hiveconf:hive.metastore.warehouse.dir}/default/encrypted_table'; @@ -12,4 +13,4 @@ select * from encrypted_table; -- this checks that we've actually created temp table data under encrypted_table folder describe formatted values__tmp__table__1; -CRYPTO DELETE_KEY --keyName key_128; \ No newline at end of file +CRYPTO DELETE_KEY --keyName key_128; diff --git ql/src/test/queries/clientpositive/encryption_join_with_different_encryption_keys.q ql/src/test/queries/clientpositive/encryption_join_with_different_encryption_keys.q index 4dcea1f7ce..7159ad5995 100644 --- ql/src/test/queries/clientpositive/encryption_join_with_different_encryption_keys.q +++ ql/src/test/queries/clientpositive/encryption_join_with_different_encryption_keys.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; --SORT_QUERY_RESULTS -- 
Java JCE must be installed in order to hava a key length of 256 bits diff --git ql/src/test/queries/clientpositive/encryption_move_tbl.q ql/src/test/queries/clientpositive/encryption_move_tbl.q index 0b7771cc4a..8d865aa6e8 100644 --- ql/src/test/queries/clientpositive/encryption_move_tbl.q +++ ql/src/test/queries/clientpositive/encryption_move_tbl.q @@ -1,4 +1,5 @@ -- SORT_QUERY_RESULTS; +set hive.stats.column.autogather=false; -- we're setting this so that TestNegaiveCliDriver.vm doesn't stop processing after ALTER TABLE fails; diff --git ql/src/test/queries/clientpositive/groupby1.q ql/src/test/queries/clientpositive/groupby1.q index a8c9a8dcf8..cd3a12b44e 100755 --- ql/src/test/queries/clientpositive/groupby1.q +++ ql/src/test/queries/clientpositive/groupby1.q @@ -1,3 +1,5 @@ +-- due to testMTQueries1 +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; set hive.explain.user=false; set hive.map.aggr=false; diff --git ql/src/test/queries/clientpositive/groupby1_limit.q ql/src/test/queries/clientpositive/groupby1_limit.q index b8e389e511..6c40e19540 100644 --- ql/src/test/queries/clientpositive/groupby1_limit.q +++ ql/src/test/queries/clientpositive/groupby1_limit.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; set mapred.reduce.tasks=31; diff --git ql/src/test/queries/clientpositive/groupby_multi_single_reducer.q ql/src/test/queries/clientpositive/groupby_multi_single_reducer.q index 2b799f87eb..40976ee707 100644 --- ql/src/test/queries/clientpositive/groupby_multi_single_reducer.q +++ ql/src/test/queries/clientpositive/groupby_multi_single_reducer.q @@ -1,3 +1,6 @@ +set hive.stats.column.autogather=false; +-- due to L137 in LimitPushDownOptimization Not safe to continue for RS-GBY-GBY-LIM kind of pipelines. See HIVE-10607 for more. 
+ set hive.multigroupby.singlereducer=true; -- SORT_QUERY_RESULTS diff --git ql/src/test/queries/clientpositive/hybridgrace_hashjoin_1.q ql/src/test/queries/clientpositive/hybridgrace_hashjoin_1.q index e404dd0f54..8955b6a651 100644 --- ql/src/test/queries/clientpositive/hybridgrace_hashjoin_1.q +++ ql/src/test/queries/clientpositive/hybridgrace_hashjoin_1.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; set hive.explain.user=false; set tez.cartesian-product.max-parallelism=1; diff --git ql/src/test/queries/clientpositive/infer_bucket_sort_convert_join.q ql/src/test/queries/clientpositive/infer_bucket_sort_convert_join.q index 6809b721be..e4170283f3 100644 --- ql/src/test/queries/clientpositive/infer_bucket_sort_convert_join.q +++ ql/src/test/queries/clientpositive/infer_bucket_sort_convert_join.q @@ -1,3 +1,12 @@ +set hive.stats.column.autogather=false; +-- sounds weird: +-- on master, when auto=true, hive.mapjoin.localtask.max.memory.usage will be 0.55 as there is a gby +-- L132 of LocalMapJoinProcFactory +-- when executed in the CLI, hive.exec.submit.local.task.via.child is true and we can see the error +-- if hive.exec.submit.local.task.via.child=false is set, we can see it. +-- with patch, we just merge the tasks. hive.exec.submit.local.task.via.child=false due to pom.xml setting +-- however, even after changing it to true, it still fails. 
+ set hive.mapred.mode=nonstrict; set hive.exec.infer.bucket.sort=true; set hive.exec.infer.bucket.sort.num.buckets.power.two=true; diff --git ql/src/test/queries/clientpositive/infer_bucket_sort_reducers_power_two.q ql/src/test/queries/clientpositive/infer_bucket_sort_reducers_power_two.q index 6824c1c032..c0ddb8bce6 100644 --- ql/src/test/queries/clientpositive/infer_bucket_sort_reducers_power_two.q +++ ql/src/test/queries/clientpositive/infer_bucket_sort_reducers_power_two.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; set hive.exec.infer.bucket.sort=true; set hive.exec.infer.bucket.sort.num.buckets.power.two=true; diff --git ql/src/test/queries/clientpositive/input11_limit.q ql/src/test/queries/clientpositive/input11_limit.q index 052a72ee68..211c37adc5 100644 --- ql/src/test/queries/clientpositive/input11_limit.q +++ ql/src/test/queries/clientpositive/input11_limit.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; -- SORT_QUERY_RESULTS diff --git ql/src/test/queries/clientpositive/input14_limit.q ql/src/test/queries/clientpositive/input14_limit.q index 7316752a6d..2f6e4e47c9 100644 --- ql/src/test/queries/clientpositive/input14_limit.q +++ ql/src/test/queries/clientpositive/input14_limit.q @@ -1,3 +1,5 @@ +set hive.stats.column.autogather=false; + CREATE TABLE dest1(key INT, value STRING) STORED AS TEXTFILE; EXPLAIN diff --git ql/src/test/queries/clientpositive/join2.q ql/src/test/queries/clientpositive/join2.q index 8aedd561e2..c3c7c241e9 100644 --- ql/src/test/queries/clientpositive/join2.q +++ ql/src/test/queries/clientpositive/join2.q @@ -1,3 +1,5 @@ +-- due to testMTQueries1 +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; -- SORT_QUERY_RESULTS diff --git ql/src/test/queries/clientpositive/metadata_only_queries.q ql/src/test/queries/clientpositive/metadata_only_queries.q index 8581a46b2d..bcf320b0c5 100644 --- 
ql/src/test/queries/clientpositive/metadata_only_queries.q +++ ql/src/test/queries/clientpositive/metadata_only_queries.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; set hive.explain.user=false; set hive.compute.query.using.stats=true; diff --git ql/src/test/queries/clientpositive/metadata_only_queries_with_filters.q ql/src/test/queries/clientpositive/metadata_only_queries_with_filters.q index 1af813e3ed..692c414354 100644 --- ql/src/test/queries/clientpositive/metadata_only_queries_with_filters.q +++ ql/src/test/queries/clientpositive/metadata_only_queries_with_filters.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.stats.dbclass=fs; set hive.compute.query.using.stats=true; set hive.explain.user=false; diff --git ql/src/test/queries/clientpositive/multiMapJoin1.q ql/src/test/queries/clientpositive/multiMapJoin1.q index 5c49b4c64f..6e16af4617 100644 --- ql/src/test/queries/clientpositive/multiMapJoin1.q +++ ql/src/test/queries/clientpositive/multiMapJoin1.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; set hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.PostExecutePrinter,org.apache.hadoop.hive.ql.hooks.PrintCompletedTasksHook; diff --git ql/src/test/queries/clientpositive/orc_wide_table.q ql/src/test/queries/clientpositive/orc_wide_table.q index 422a3c24b1..d2ec3857d0 100644 --- ql/src/test/queries/clientpositive/orc_wide_table.q +++ ql/src/test/queries/clientpositive/orc_wide_table.q @@ -1,4 +1,5 @@ set hive.mapred.mode=nonstrict; +set hive.stats.column.autogather=false; drop table if exists test_txt; drop table if exists test_orc; create table test_txt( diff --git ql/src/test/queries/clientpositive/partition_coltype_literals.q ql/src/test/queries/clientpositive/partition_coltype_literals.q index eb56b1a93d..8da4876b70 100644 --- ql/src/test/queries/clientpositive/partition_coltype_literals.q +++ 
ql/src/test/queries/clientpositive/partition_coltype_literals.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.compute.query.using.stats=false; drop table if exists partcoltypenum; create table partcoltypenum (key int, value string) partitioned by (tint tinyint, sint smallint, bint bigint); diff --git ql/src/test/queries/clientpositive/row__id.q ql/src/test/queries/clientpositive/row__id.q index d9cb7b0ff6..6aaa40f68f 100644 --- ql/src/test/queries/clientpositive/row__id.q +++ ql/src/test/queries/clientpositive/row__id.q @@ -1,3 +1,5 @@ +-- tid is flaky when computing column stats +set hive.stats.column.autogather=false; set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; drop table if exists hello_acid; diff --git ql/src/test/queries/clientpositive/smb_join_partition_key.q ql/src/test/queries/clientpositive/smb_join_partition_key.q index 160bf5e36a..23027f8aa5 100644 --- ql/src/test/queries/clientpositive/smb_join_partition_key.q +++ ql/src/test/queries/clientpositive/smb_join_partition_key.q @@ -1,3 +1,5 @@ +-- Because p1 is decimal, on Derby the partition lookup uses partval = 100.0 rather than 100. As a result, the partition is not found and an exception is thrown. 
+set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; SET hive.enforce.sortmergebucketmapjoin=false; SET hive.auto.convert.sortmerge.join=true; diff --git ql/src/test/queries/clientpositive/stats15a.q ql/src/test/queries/clientpositive/stats15a.q new file mode 100644 index 0000000000..194e02c4f3 --- /dev/null +++ ql/src/test/queries/clientpositive/stats15a.q @@ -0,0 +1,25 @@ +set datanucleus.cache.collections=false; + +create table my_src (key string, value string); +insert into my_src values ('1','2'),('3','4'); + +create table my_srcpart (key string, value string) partitioned by (ds string,hr string); + +create table stats_part like my_srcpart; + +insert overwrite table stats_part partition (ds='2010-04-08', hr = '11') select key, value from my_src; +insert overwrite table stats_part partition (ds='2010-04-08', hr = '12') select key, value from my_src; + +analyze table stats_part partition(ds='2010-04-08', hr='11') compute statistics; +analyze table stats_part partition(ds='2010-04-08', hr='12') compute statistics; + +insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from my_src; + +desc formatted stats_part; +desc formatted stats_part partition (ds='2010-04-08', hr = '11'); +desc formatted stats_part partition (ds='2010-04-08', hr = '12'); + +analyze table stats_part partition(ds, hr) compute statistics; +desc formatted stats_part; + +drop table stats_part; diff --git ql/src/test/queries/clientpositive/stats_noscan_2a.q ql/src/test/queries/clientpositive/stats_noscan_2a.q new file mode 100644 index 0000000000..638f33d27a --- /dev/null +++ ql/src/test/queries/clientpositive/stats_noscan_2a.q @@ -0,0 +1,30 @@ +set hive.stats.autogather=true; + +dfs -cp ${system:hive.root}/data/files/ext_test ${system:test.tmp.dir}/analyze_external; +dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/texternal/2008-01-01; + +create external table t0 (key string,val string) location 
'pfile://${system:test.tmp.dir}/texternal/2008-01-01'; +explain insert into t0 values ('a','b'),('a','b'), ('a','b'); +insert into t0 values ('a','b'),('a','b'), ('a','b'); +drop table t0; + +-- create external table +CREATE external TABLE anaylyze_external (key string, val string) partitioned by (insertdate string) LOCATION "pfile://${system:test.tmp.dir}/texternal"; +describe formatted anaylyze_external; +explain ALTER TABLE anaylyze_external ADD PARTITION (insertdate='2008-01-01') location 'pfile://${system:test.tmp.dir}/texternal/2008-01-01'; +ALTER TABLE anaylyze_external ADD PARTITION (insertdate='2008-01-01') location 'pfile://${system:test.tmp.dir}/texternal/2008-01-01'; +describe formatted anaylyze_external PARTITION (insertdate='2008-01-01'); +explain select count(*) from anaylyze_external where insertdate='2008-01-01'; +select count(*) from anaylyze_external where insertdate='2008-01-01'; +select count(*) from anaylyze_external; + +-- analyze +analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics noscan; +describe formatted anaylyze_external PARTITION (insertdate='2008-01-01'); +analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics; +describe formatted anaylyze_external PARTITION (insertdate='2008-01-01'); +dfs -rmr ${system:test.tmp.dir}/texternal; +drop table anaylyze_external; + + + diff --git ql/src/test/queries/clientpositive/udf_round_2.q ql/src/test/queries/clientpositive/udf_round_2.q index 43988c1225..38885a97d4 100644 --- ql/src/test/queries/clientpositive/udf_round_2.q +++ ql/src/test/queries/clientpositive/udf_round_2.q @@ -1,4 +1,5 @@ set hive.fetch.task.conversion=more; +set hive.stats.column.autogather=false; -- test for NaN (not-a-number) create table tstTbl1(n double); diff --git ql/src/test/queries/clientpositive/vector_groupby_rollup1.q ql/src/test/queries/clientpositive/vector_groupby_rollup1.q index 17858ff7ec..39bc2c1283 100644 --- 
ql/src/test/queries/clientpositive/vector_groupby_rollup1.q +++ ql/src/test/queries/clientpositive/vector_groupby_rollup1.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.explain.user=false; SET hive.vectorized.execution.enabled=true; SET hive.vectorized.execution.reduce.enabled=true; diff --git ql/src/test/queries/clientpositive/vector_multi_insert.q ql/src/test/queries/clientpositive/vector_multi_insert.q index c56ee1c4aa..e6bfb96794 100644 --- ql/src/test/queries/clientpositive/vector_multi_insert.q +++ ql/src/test/queries/clientpositive/vector_multi_insert.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.explain.user=false; SET hive.vectorized.execution.enabled=true; set hive.fetch.task.conversion=none; diff --git ql/src/test/queries/clientpositive/vector_udf_character_length.q ql/src/test/queries/clientpositive/vector_udf_character_length.q index 19a5260ddc..e49a091b34 100644 --- ql/src/test/queries/clientpositive/vector_udf_character_length.q +++ ql/src/test/queries/clientpositive/vector_udf_character_length.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; SET hive.vectorized.execution.enabled=true; set hive.fetch.task.conversion=none; diff --git ql/src/test/queries/clientpositive/vector_udf_octet_length.q ql/src/test/queries/clientpositive/vector_udf_octet_length.q index 06a49852a2..af4c7c4a7f 100644 --- ql/src/test/queries/clientpositive/vector_udf_octet_length.q +++ ql/src/test/queries/clientpositive/vector_udf_octet_length.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; SET hive.vectorized.execution.enabled=true; set hive.fetch.task.conversion=none; diff --git ql/src/test/queries/clientpositive/vector_varchar_4.q ql/src/test/queries/clientpositive/vector_varchar_4.q index 80f84d8b9f..b3402d0df2 100644 --- ql/src/test/queries/clientpositive/vector_varchar_4.q +++ ql/src/test/queries/clientpositive/vector_varchar_4.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set 
hive.explain.user=false; SET hive.vectorized.execution.enabled=true; set hive.fetch.task.conversion=none; @@ -50,4 +51,4 @@ insert overwrite table varchar_lazy_binary_columnar select t, si, i, b, f, d, s -- insert overwrite table varchar_lazy_binary_columnar select t, si, i, b, f, d, s from vectortab2korc; --- select count(*) as cnt from varchar_lazy_binary_columnar group by vs order by cnt asc; \ No newline at end of file +-- select count(*) as cnt from varchar_lazy_binary_columnar group by vs order by cnt asc; diff --git ql/src/test/queries/clientpositive/vector_varchar_simple.q ql/src/test/queries/clientpositive/vector_varchar_simple.q index 6f753a748d..352ec3aebc 100644 --- ql/src/test/queries/clientpositive/vector_varchar_simple.q +++ ql/src/test/queries/clientpositive/vector_varchar_simple.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.explain.user=false; SET hive.vectorized.execution.enabled=true; set hive.fetch.task.conversion=none;