diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index fb62ae2..83ffebf 100644
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2343,6 +2343,10 @@ public void setSparkConfigUpdated(boolean isSparkConfigUpdated) {
         "When pruning is enabled, filters on bucket columns will be processed by \n" +
         "filtering the splits against a bitset of included buckets. This needs predicates \n"+
         "produced by hive.optimize.ppd and hive.optimize.index.filters."),
+    TEZ_OPTIMIZE_BUCKET_PRUNING_COMPAT(
+        "hive.tez.bucket.pruning.compat", true,
+        "When pruning is enabled, handle possibly broken inserts due to negative hashcodes.\n" +
+        "This occasionally doubles the data scan cost, but is default enabled for safety"),
     TEZ_DYNAMIC_PARTITION_PRUNING(
         "hive.tez.dynamic.partition.pruning", true,
         "When dynamic pruning is enabled, joins on partition keys will be processed by sending\n" +
diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties
index a2ccfe0..894ec66 100644
--- itests/src/test/resources/testconfiguration.properties
+++ itests/src/test/resources/testconfiguration.properties
@@ -379,6 +379,7 @@ minitez.query.files=bucket_map_join_tez1.q,\
   constprog_dpp.q,\
   dynamic_partition_pruning.q,\
   dynamic_partition_pruning_2.q,\
+  bucketpruning1.q,\
   explainuser_1.q,\
   explainuser_2.q,\
   explainuser_3.q,\
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
index c63318e..9e9beb0 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
@@ -65,6 +65,12 @@
   private static final Log LOG = LogFactory
       .getLog(FixedBucketPruningOptimizer.class.getName());
 
+  private final boolean compat;
+
+  public FixedBucketPruningOptimizer(boolean compat) {
+    this.compat = compat;
+  }
+
   public class NoopWalker implements NodeProcessor {
     @Override
     public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
@@ -229,6 +235,14 @@ protected void generatePredicate(NodeProcessorCtx procCtx,
         Object convCols[] = new Object[] {conv.convert(literal)};
         int n = ObjectInspectorUtils.getBucketNumber(convCols, new ObjectInspector[]{constOI}, ctxt.getNumBuckets());
         bs.set(n);
+        if (ctxt.isCompat()) {
+          int h = ObjectInspectorUtils.getBucketHashCode(convCols, new ObjectInspector[]{constOI});
+          // -ve hashcodes had conversion to positive done in different ways in the past
+          // abs() is now obsolete and all inserts now use & Integer.MAX_VALUE
+          // the compat mode assumes that old data could've been loaded using the other conversion
+          n = ObjectInspectorUtils.getBucketNumber(Math.abs(h), ctxt.getNumBuckets());
+          bs.set(n);
+        }
       }
       if (bs.cardinality() < ctxt.getNumBuckets()) {
         // there is a valid bucket pruning filter
@@ -252,12 +266,14 @@ private boolean addLiteral(List<Object> literals, PredicateLeaf leaf) {
   public final class FixedBucketPruningOptimizerCtxt implements NodeProcessorCtx {
 
     public final ParseContext pctx;
+    private final boolean compat;
    private int numBuckets;
    private PrunedPartitionList partitions;
    private List<String> bucketCols;
    private List<StructField> schema;
 
-    public FixedBucketPruningOptimizerCtxt(ParseContext pctx) {
+    public FixedBucketPruningOptimizerCtxt(boolean compat, ParseContext pctx) {
+      this.compat = compat;
       this.pctx = pctx;
     }
@@ -292,12 +308,17 @@ public int getNumBuckets() {
     public void setNumBuckets(int numBuckets) {
       this.numBuckets = numBuckets;
     }
+
+    // compatibility mode enabled
+    public boolean isCompat() {
+      return this.compat;
+    }
   }
 
   @Override
   public ParseContext transform(ParseContext pctx) throws SemanticException {
     // create a the context for walking operators
-    FixedBucketPruningOptimizerCtxt opPartWalkerCtx = new FixedBucketPruningOptimizerCtxt(
+    FixedBucketPruningOptimizerCtxt opPartWalkerCtx = new FixedBucketPruningOptimizerCtxt(compat,
         pctx);
 
     // Retrieve all partitions generated from partition pruner and partition
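A note on the compat branch added to generatePredicate() above: negative bucket hashcodes were historically folded into positive bucket numbers in two different ways, so the pruner must keep both candidate buckets alive. The following minimal sketch (self-contained Java; bucketMask/bucketAbs are illustrative names, not Hive's actual ObjectInspectorUtils API) shows why the two conversions disagree exactly when the hashcode is negative:

    // Sketch of the two historical hashcode-to-bucket conversions.
    // The method names are hypothetical; only the arithmetic mirrors the patch.
    public class BucketCompatSketch {
      // current insert path: clear the sign bit, then modulo
      static int bucketMask(int hashCode, int numBuckets) {
        return (hashCode & Integer.MAX_VALUE) % numBuckets;
      }

      // obsolete insert path: absolute value, then modulo
      static int bucketAbs(int hashCode, int numBuckets) {
        return Math.abs(hashCode) % numBuckets;
      }

      public static void main(String[] args) {
        // non-negative hashcodes: both paths agree, pruning stays a single bucket
        System.out.println(bucketMask(15, 16) + " " + bucketAbs(15, 16));   // 15 15
        // negative hashcodes: the paths diverge, so compat mode sets both bits
        System.out.println(bucketMask(-15, 16) + " " + bucketAbs(-15, 16)); // 1 15
      }
    }

This divergence is why the config description says compat mode "occasionally doubles the data scan cost": each negative-hash literal can contribute two buckets to the bitset instead of one.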
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
index 7ec068c..a794ae1 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
@@ -178,7 +178,9 @@ public void initialize(HiveConf hiveConf) {
         HiveConf.ConfVars.TEZ_OPTIMIZE_BUCKET_PRUNING)
         && HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTPPD)
         && HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTINDEXFILTER)) {
-      transformations.add(new FixedBucketPruningOptimizer());
+      final boolean compatMode =
+          HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.TEZ_OPTIMIZE_BUCKET_PRUNING_COMPAT);
+      transformations.add(new FixedBucketPruningOptimizer(compatMode));
     }
 
     if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.DYNAMICPARTITIONING) &&
diff --git ql/src/test/queries/clientpositive/bucketpruning1.q ql/src/test/queries/clientpositive/bucketpruning1.q
index 6c689f1..0f797f7 100644
--- ql/src/test/queries/clientpositive/bucketpruning1.q
+++ ql/src/test/queries/clientpositive/bucketpruning1.q
@@ -54,6 +54,11 @@ select * from srcbucket_pruned where (key=1 or key=2) and ds='2008-04-08';
 explain extended
 select * from srcbucket_pruned where (key=1 or key=2) and value = 'One' and ds='2008-04-08';
 
+-- compat case (-15 = 1 & 15)
+
+explain extended
+select * from srcbucket_pruned where key = -15;
+
 -- valid but irrelevant case (all buckets selected)
 
 explain extended
@@ -95,4 +100,3 @@ select * from srcbucket_unpruned where key in (3, 5);
 
 explain extended
 select * from srcbucket_unpruned where key = 1;
-
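The comment on the new test case spells out the expected arithmetic: the bucket hashcode of the int literal -15 is -15 itself, so the current conversion gives (-15 & Integer.MAX_VALUE) % 16 = 2147483633 % 16 = 1, while the legacy conversion gives Math.abs(-15) % 16 = 15. With 16 buckets, compat mode must therefore keep buckets 1 and 15, which is exactly what the expected plan output below asserts as "buckets included: [1,15,] of 16".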
diff --git ql/src/test/results/clientpositive/tez/bucketpruning1.q.out ql/src/test/results/clientpositive/tez/bucketpruning1.q.out
index 3b90687..68b516f 100644
--- ql/src/test/results/clientpositive/tez/bucketpruning1.q.out
+++ ql/src/test/results/clientpositive/tez/bucketpruning1.q.out
@@ -1237,6 +1237,92 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
+PREHOOK: query: -- compat case (-15 = 1 & 15)
+
+explain extended
+select * from srcbucket_pruned where key = -15
+PREHOOK: type: QUERY
+POSTHOOK: query: -- compat case (-15 = 1 & 15)
+
+explain extended
+select * from srcbucket_pruned where key = -15
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  
+TOK_QUERY
+   TOK_FROM
+      TOK_TABREF
+         TOK_TABNAME
+            srcbucket_pruned
+   TOK_INSERT
+      TOK_DESTINATION
+         TOK_DIR
+            TOK_TMP_FILE
+      TOK_SELECT
+         TOK_SELEXPR
+            TOK_ALLCOLREF
+      TOK_WHERE
+         =
+            TOK_TABLE_OR_COL
+               key
+            -
+               15
+
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: srcbucket_pruned
+                  filterExpr: (key = -15) (type: boolean)
+                  buckets included: [1,15,] of 16
+                  Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                  GatherStats: false
+                  Filter Operator
+                    isSamplingPred: false
+                    predicate: (key = -15) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                    Select Operator
+                      expressions: -15 (type: int), value (type: string), ds (type: string)
+                      outputColumnNames: _col0, _col1, _col2
+                      Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                      File Output Operator
+                        compressed: false
+                        GlobalTableId: 0
+#### A masked pattern was here ####
+                        NumFilesPerFileSink: 1
+                        Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+#### A masked pattern was here ####
+                        table:
+                            input format: org.apache.hadoop.mapred.TextInputFormat
+                            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                            properties:
+                              columns _col0,_col1,_col2
+                              columns.types int:string:string
+                              escape.delim \
+                              hive.serialization.extend.additional.nesting.levels true
+                              serialization.escape.crlf true
+                              serialization.format 1
+                              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                        TotalFiles: 1
+                        GatherStats: false
+                        MultiFileSpray: false
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
 PREHOOK: query: -- valid but irrelevant case (all buckets selected)
 
 explain extended
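One behavioral note, inferred from the new config definition rather than covered by this patch's tests: running the same query after set hive.tez.bucket.pruning.compat=false; should skip the abs()-derived bucket and prune the scan down to the single masked bucket, presumably "buckets included: [1,] of 16", at the risk of missing rows written by old insert code that used the abs() conversion. The flag therefore defaults to true, trading an occasionally doubled scan for correctness on legacy data.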