diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 39c77b3fe5..19ffb71f47 100644
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1587,6 +1587,8 @@ private static void populateLlapDaemonVarsSet(Set<String> llapDaemonVarsSetLocal
     HIVEALIAS("hive.alias", "", ""),
     HIVEMAPSIDEAGGREGATE("hive.map.aggr", true, "Whether to use map-side aggregation in Hive Group By queries"),
     HIVEGROUPBYSKEW("hive.groupby.skewindata", false, "Whether there is skew in data to optimize group by queries"),
+    HIVE_GROUPBY_SKEW_USE_RANDOM_SEED("hive.groupby.skewindata.use.random.seed", false,
+        "Whether to use a fixed seed for the rand function used to randomly distribute keys"),
     HIVEJOINEMITINTERVAL("hive.join.emit.interval", 1000,
         "How many rows in the right-most join operand Hive should buffer before emitting the join result."),
     HIVEJOINCACHESIZE("hive.join.cache.size", 25000,
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java
index 250a085084..2ed36bfb83 100644
--- ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java
@@ -875,7 +875,12 @@ public static ReduceSinkDesc getReduceSinkDesc(
       partitionCols.addAll(keyCols.subList(0, numPartitionFields));
     } else {
       // numPartitionFields = -1 means random partitioning
-      partitionCols.add(TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("rand"));
+      if (!SessionState.get().getConf().getBoolVar(HiveConf.ConfVars.HIVE_GROUPBY_SKEW_USE_RANDOM_SEED)) {
+        partitionCols.add(TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("rand"));
+      } else {
+        partitionCols.add(TypeCheckProcFactory.DefaultExprProcessor
+            .getFuncExprNodeDesc("rand", new ExprNodeConstantDesc(461984147)));
+      }
     }
 
     StringBuilder order = new StringBuilder();
diff --git ql/src/test/queries/clientpositive/groupby_skew_rand_seed.q ql/src/test/queries/clientpositive/groupby_skew_rand_seed.q
new file mode 100644
index 0000000000..c005d35181
--- /dev/null
+++ ql/src/test/queries/clientpositive/groupby_skew_rand_seed.q
@@ -0,0 +1,17 @@
+--! qt:dataset:src
+set hive.mapred.mode=nonstrict;
+set hive.map.aggr=true;
+set hive.groupby.skewindata=true;
+set mapred.reduce.tasks=31;
+set hive.groupby.skewindata.use.random.seed=true;
+
+-- SORT_QUERY_RESULTS
+
+CREATE TABLE groupby_rand_seed(key INT, value DOUBLE) STORED AS TEXTFILE;
+
+EXPLAIN
+FROM src INSERT OVERWRITE TABLE groupby_rand_seed SELECT src.key, sum(substr(src.value,5)) GROUP BY src.key;
+
+FROM src INSERT OVERWRITE TABLE groupby_rand_seed SELECT src.key, sum(substr(src.value,5)) GROUP BY src.key;
+
+SELECT groupby_rand_seed.* FROM groupby_rand_seed;
diff --git ql/src/test/results/clientpositive/groupby_skew_rand_seed.q.out ql/src/test/results/clientpositive/groupby_skew_rand_seed.q.out
new file mode 100644
index 0000000000..58309ad165
--- /dev/null
+++ ql/src/test/results/clientpositive/groupby_skew_rand_seed.q.out
@@ -0,0 +1,472 @@
+PREHOOK: query: CREATE TABLE groupby_rand_seed(key INT, value DOUBLE) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@groupby_rand_seed
+POSTHOOK: query: CREATE TABLE groupby_rand_seed(key INT, value DOUBLE) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@groupby_rand_seed
+PREHOOK: query: EXPLAIN
+FROM src INSERT OVERWRITE TABLE groupby_rand_seed SELECT src.key, sum(substr(src.value,5)) GROUP BY src.key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+FROM src INSERT OVERWRITE TABLE groupby_rand_seed SELECT src.key, sum(substr(src.value,5)) GROUP BY src.key
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+  Stage-3 depends on stages: Stage-0, Stage-4
+  Stage-4 depends on stages: Stage-2
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: src
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: key (type: string), substr(value, 5) (type: string)
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                aggregations: sum(_col1)
+                keys: _col0 (type: string)
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: string)
+                  sort order: +
+                  Map-reduce partition columns: rand(461984147) (type: double)
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col1 (type: double)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: sum(VALUE._col0)
+          keys: KEY._col0 (type: string)
+          mode: partials
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            Reduce Output Operator
+              key expressions: _col0 (type: string)
+              sort order: +
+              Map-reduce partition columns: _col0 (type: string)
+              Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+              value expressions: _col1 (type: double)
+      Execution mode: vectorized
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: sum(VALUE._col0)
+          keys: KEY._col0 (type: string)
+          mode: final
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+          Select Operator
+            expressions: UDFToInteger(_col0) (type: int), _col1 (type: double)
+            outputColumnNames: _col0, _col1
+            Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  name: default.groupby_rand_seed
+            Select Operator
+              expressions: _col0 (type: int), _col1 (type: double)
+              outputColumnNames: key, value
+              Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                aggregations: compute_stats(key, 'hll'), compute_stats(value, 'hll')
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 848 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.groupby_rand_seed
+
+  Stage: Stage-3
+    Stats Work
+      Basic Stats Work:
+      Column Stats Desc:
+          Columns: key, value
+          Column Types: int, double
+          Table: default.groupby_rand_seed
+
+  Stage: Stage-4
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            Reduce Output Operator
+              sort order: 
+              Statistics: Num rows: 1 Data size: 848 Basic stats: COMPLETE Column stats: NONE
+              value expressions: _col0 (type: struct), _col1 (type: struct)
+      Execution mode: vectorized
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
+          mode: final
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 1 Data size: 880 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 880 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+PREHOOK: query: FROM src INSERT OVERWRITE TABLE groupby_rand_seed SELECT src.key, sum(substr(src.value,5)) GROUP BY src.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@groupby_rand_seed
+POSTHOOK: query: FROM src INSERT OVERWRITE TABLE groupby_rand_seed SELECT src.key, sum(substr(src.value,5)) GROUP BY src.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@groupby_rand_seed
+POSTHOOK: Lineage: groupby_rand_seed.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: groupby_rand_seed.value EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: SELECT groupby_rand_seed.* FROM groupby_rand_seed
+PREHOOK: type: QUERY
+PREHOOK: Input: default@groupby_rand_seed
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT groupby_rand_seed.* FROM groupby_rand_seed
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@groupby_rand_seed
+#### A masked pattern was here ####
+0	0.0
+10	10.0
+100	200.0
+103	206.0
+104	208.0
+105	105.0
+11	11.0
+111	111.0
+113	226.0
+114	114.0
+116	116.0
+118	236.0
+119	357.0
+12	24.0
+120	240.0
+125	250.0
+126	126.0
+128	384.0
+129	258.0
+131	131.0
+133	133.0
+134	268.0
+136	136.0
+137	274.0
+138	552.0
+143	143.0
+145	145.0
+146	292.0
+149	298.0
+15	30.0
+150	150.0
+152	304.0
+153	153.0
+155	155.0
+156	156.0
+157	157.0
+158	158.0
+160	160.0
+162	162.0
+163	163.0
+164	328.0
+165	330.0
+166	166.0
+167	501.0
+168	168.0
+169	676.0
+17	17.0
+170	170.0
+172	344.0
+174	348.0
+175	350.0
+176	352.0
+177	177.0
+178	178.0
+179	358.0
+18	36.0
+180	180.0
+181	181.0
+183	183.0
+186	186.0
+187	561.0
+189	189.0
+19	19.0
+190	190.0
+191	382.0
+192	192.0
+193	579.0
+194	194.0
+195	390.0
+196	196.0
+197	394.0
+199	597.0
+2	2.0
+20	20.0
+200	400.0
+201	201.0
+202	202.0
+203	406.0
+205	410.0
+207	414.0
+208	624.0
+209	418.0
+213	426.0
+214	214.0
+216	432.0
+217	434.0
+218	218.0
+219	438.0
+221	442.0
+222	222.0
+223	446.0
+224	448.0
+226	226.0
+228	228.0
+229	458.0
+230	1150.0
+233	466.0
+235	235.0
+237	474.0
+238	476.0
+239	478.0
+24	48.0
+241	241.0
+242	484.0
+244	244.0
+247	247.0
+248	248.0
+249	249.0
+252	252.0
+255	510.0
+256	512.0
+257	257.0
+258	258.0
+26	52.0
+260	260.0
+262	262.0
+263	263.0
+265	530.0
+266	266.0
+27	27.0
+272	544.0
+273	819.0
+274	274.0
+275	275.0
+277	1108.0
+278	556.0
+28	28.0
+280	560.0
+281	562.0
+282	564.0
+283	283.0
+284	284.0
+285	285.0
+286	286.0
+287	287.0
+288	576.0
+289	289.0
+291	291.0
+292	292.0
+296	296.0
+298	894.0
+30	30.0
+302	302.0
+305	305.0
+306	306.0
+307	614.0
+308	308.0
+309	618.0
+310	310.0
+311	933.0
+315	315.0
+316	948.0
+317	634.0
+318	954.0
+321	642.0
+322	644.0
+323	323.0
+325	650.0
+327	981.0
+33	33.0
+331	662.0
+332	332.0
+333	666.0
+335	335.0
+336	336.0
+338	338.0
+339	339.0
+34	34.0
+341	341.0
+342	684.0
+344	688.0
+345	345.0
+348	1740.0
+35	105.0
+351	351.0
+353	706.0
+356	356.0
+360	360.0
+362	362.0
+364	364.0
+365	365.0
+366	366.0
+367	734.0
+368	368.0
+369	1107.0
+37	74.0
+373	373.0
+374	374.0
+375	375.0
+377	377.0
+378	378.0
+379	379.0
+382	764.0
+384	1152.0
+386	386.0
+389	389.0
+392	392.0
+393	393.0
+394	394.0
+395	790.0
+396	1188.0
+397	794.0
+399	798.0
+4	4.0
+400	400.0
+401	2005.0
+402	402.0
+403	1209.0
+404	808.0
+406	1624.0
+407	407.0
+409	1227.0
+41	41.0
+411	411.0
+413	826.0
+414	828.0
+417	1251.0
+418	418.0
+419	419.0
+42	84.0
+421	421.0
+424	848.0
+427	427.0
+429	858.0
+43	43.0
+430	1290.0
+431	1293.0
+432	432.0
+435	435.0
+436	436.0
+437	437.0
+438	1314.0
+439	878.0
+44	44.0
+443	443.0
+444	444.0
+446	446.0
+448	448.0
+449	449.0
+452	452.0
+453	453.0
+454	1362.0
+455	455.0
+457	457.0
+458	916.0
+459	918.0
+460	460.0
+462	924.0
+463	926.0
+466	1398.0
+467	467.0
+468	1872.0
+469	2345.0
+47	47.0
+470	470.0
+472	472.0
+475	475.0
+477	477.0
+478	956.0
+479	479.0
+480	1440.0
+481	481.0
+482	482.0
+483	483.0
+484	484.0
+485	485.0
+487	487.0
+489	1956.0
+490	490.0
+491	491.0
+492	984.0
+493	493.0
+494	494.0
+495	495.0
+496	496.0
+497	497.0
+498	1494.0
+5	15.0
+51	102.0
+53	53.0
+54	54.0
+57	57.0
+58	116.0
+64	64.0
+65	65.0
+66	66.0
+67	134.0
+69	69.0
+70	210.0
+72	144.0
+74	74.0
+76	152.0
+77	77.0
+78	78.0
+8	8.0
+80	80.0
+82	82.0
+83	166.0
+84	168.0
+85	85.0
+86	86.0
+87	87.0
+9	9.0
+90	270.0
+92	92.0
+95	190.0
+96	96.0
+97	194.0
+98	196.0
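
Reviewer note (not part of the patch): with hive.groupby.skewindata=true, the first map-reduce stage partitions rows by rand() rather than by the group-by key (visible in the Stage-1 plan above), so each run of the same query can shuffle a given row to a different reducer. The patch pins that shuffle down by seeding rand with the constant 461984147 whenever hive.groupby.skewindata.use.random.seed is set. The sketch below is a minimal standalone Java illustration of why a fixed seed yields a reproducible assignment; SeededPartitionDemo, assign(), and the bucketing hash are hypothetical stand-ins for Hive's actual partition hashing, chosen only because Hive's rand UDF behaves like a java.util.Random seeded with the given constant.

import java.util.Arrays;
import java.util.Random;

// Standalone sketch (not Hive code): with a constant seed, every "run"
// over the same input draws the same sequence of doubles and therefore
// sends each row to the same reducer; an unseeded rand() does not.
public class SeededPartitionDemo {
  private static final int NUM_REDUCERS = 31; // mirrors mapred.reduce.tasks=31 in the test

  // Draw one double per row and bucket it into [0, NUM_REDUCERS);
  // the hash here is an illustrative stand-in for Hive's partition hash.
  static int[] assign(long seed, int numRows) {
    Random rand = new Random(seed);
    int[] reducer = new int[numRows];
    for (int i = 0; i < numRows; i++) {
      reducer[i] = Math.floorMod(Double.hashCode(rand.nextDouble()), NUM_REDUCERS);
    }
    return reducer;
  }

  public static void main(String[] args) {
    // Two runs over the same 10 rows with the patch's seed agree row for row...
    System.out.println(Arrays.equals(assign(461984147L, 10), assign(461984147L, 10))); // true
    // ...while two differently-seeded (i.e. effectively unseeded) runs almost surely differ.
    System.out.println(Arrays.equals(assign(new Random().nextLong(), 10),
                                     assign(new Random().nextLong(), 10))); // very likely false
  }
}

Correctness of the aggregate is unaffected either way, since Stage-2 re-partitions the partial aggregates by _col0 (the key); the seed only fixes which reducer handles each partial aggregate in Stage-1.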