diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index eb0d1d7..053420d 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -410,7 +410,8 @@ minitez.query.files.shared=acid_globallimit.q,\ union_type_chk.q -minitez.query.files=acid_vectorization_missing_cols.q,\ +minitez.query.files=acid_bucket_pruning.q,\ + acid_vectorization_missing_cols.q,\ bucket_map_join_tez1.q,\ smb_cache.q,\ bucket_map_join_tez2.q,\ diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 12a929a..196c066 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -1209,6 +1209,11 @@ public static void renameOrMoveFiles(FileSystem fs, Path src, Path dst) throws I private static final Pattern PREFIXED_BUCKET_ID_REGEX = Pattern.compile("^(0*([0-9]+))_([0-9]+).*"); /** + * Matches a name starting with the literal "bucket_" and captures the bucket number (group 1). + */ + private static final Pattern STATIC_PREFIXED_BUCKET_ID_REGEX = + Pattern.compile("^bucket_([0-9]+).*"); + /** * Get the task id from the filename. It is assumed that the filename is derived from the output * of getTaskId * @@ -1648,6 +1653,15 @@ public static int getBucketIdFromFile(String bucketName) { } return Integer.parseInt(m.group(2)); } + // Check to see if the bucketName matches the pattern "bucket_([0-9]+).*" + // This can happen in ACID cases when we have splits on delta files, where the filenames + // are of the form delta_x_y/bucket_a. 
+ m = STATIC_PREFIXED_BUCKET_ID_REGEX.matcher(bucketName); + if (m.matches()) { + if (!m.group(1).isEmpty()) { + return Integer.parseInt(m.group(1)); + } + } return -1; } diff --git ql/src/test/queries/clientpositive/acid_bucket_pruning.q ql/src/test/queries/clientpositive/acid_bucket_pruning.q new file mode 100644 index 0000000..24f8de1 --- /dev/null +++ ql/src/test/queries/clientpositive/acid_bucket_pruning.q @@ -0,0 +1,21 @@ +set hive.mapred.mode=nonstrict; +set hive.optimize.ppd=true; +set hive.optimize.index.filter=true; +set hive.tez.bucket.pruning=true; +set hive.explain.user=false; +set hive.fetch.task.conversion=none; +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; + +-- Bucket pruning only works for ACID when split-update (U=D+I) has been enabled for the table. +-- For e.g., this can be done by setting 'transactional_properties' = 'default'. +-- This also means that bucket pruning will not work for ACID tables with legacy behaviour. + +CREATE TABLE acidTblDefault(a INT) CLUSTERED BY(a) INTO 16 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true', 'transactional_properties'='default'); +INSERT INTO TABLE acidTblDefault SELECT cint FROM alltypesorc WHERE cint IS NOT NULL ORDER BY cint; +INSERT INTO TABLE acidTblDefault VALUES (1); + +-- Exactly one of the buckets should be selected out of the 16 buckets +-- by the following selection query. +EXPLAIN EXTENDED +SELECT * FROM acidTblDefault WHERE a = 1; \ No newline at end of file diff --git ql/src/test/results/clientpositive/tez/acid_bucket_pruning.q.out ql/src/test/results/clientpositive/tez/acid_bucket_pruning.q.out new file mode 100644 index 0000000..e71bc12 --- /dev/null +++ ql/src/test/results/clientpositive/tez/acid_bucket_pruning.q.out @@ -0,0 +1,151 @@ +PREHOOK: query: -- Bucket pruning only works for ACID when split-update (U=D+I) has been enabled for the table. 
+-- For e.g., this can be done by setting 'transactional_properties' = 'default'. +-- This also means that bucket pruning will not work for ACID tables with legacy behaviour. + +CREATE TABLE acidTblDefault(a INT) CLUSTERED BY(a) INTO 16 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true', 'transactional_properties'='default') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acidTblDefault +POSTHOOK: query: -- Bucket pruning only works for ACID when split-update (U=D+I) has been enabled for the table. +-- For e.g., this can be done by setting 'transactional_properties' = 'default'. +-- This also means that bucket pruning will not work for ACID tables with legacy behaviour. + +CREATE TABLE acidTblDefault(a INT) CLUSTERED BY(a) INTO 16 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true', 'transactional_properties'='default') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acidTblDefault +PREHOOK: query: INSERT INTO TABLE acidTblDefault SELECT cint FROM alltypesorc WHERE cint IS NOT NULL ORDER BY cint +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +PREHOOK: Output: default@acidtbldefault +POSTHOOK: query: INSERT INTO TABLE acidTblDefault SELECT cint FROM alltypesorc WHERE cint IS NOT NULL ORDER BY cint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +POSTHOOK: Output: default@acidtbldefault +POSTHOOK: Lineage: acidtbldefault.a SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ] +PREHOOK: query: INSERT INTO TABLE acidTblDefault VALUES (1) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@acidtbldefault +POSTHOOK: query: INSERT INTO TABLE acidTblDefault VALUES (1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@acidtbldefault +POSTHOOK: Lineage: acidtbldefault.a EXPRESSION 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: -- Exactly one of the buckets should be selected out of the 16 buckets +-- by the following selection query. +EXPLAIN EXTENDED +SELECT * FROM acidTblDefault WHERE a = 1 +PREHOOK: type: QUERY +POSTHOOK: query: -- Exactly one of the buckets should be selected out of the 16 buckets +-- by the following selection query. +EXPLAIN EXTENDED +SELECT * FROM acidTblDefault WHERE a = 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: acidtbldefault + filterExpr: (a = 1) (type: boolean) + buckets included: [1,] of 16 + Statistics: Num rows: 8983 Data size: 35932 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: (a = 1) (type: boolean) + Statistics: Num rows: 4491 Data size: 17964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: 1 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 4491 Data size: 17964 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 4491 Data size: 17964 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0 + columns.types int + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.escape.crlf true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + 
GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: acidtbldefault + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count 16 + bucket_field_name a + columns a + columns.comments + columns.types int +#### A masked pattern was here #### + name default.acidtbldefault + numFiles 17 + numRows 0 + rawDataSize 0 + serialization.ddl struct acidtbldefault { i32 a} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 35932 + transactional true + transactional_properties default +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count 16 + bucket_field_name a + columns a + columns.comments + columns.types int +#### A masked pattern was here #### + name default.acidtbldefault + numFiles 17 + numRows 0 + rawDataSize 0 + serialization.ddl struct acidtbldefault { i32 a} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 35932 + transactional true + transactional_properties default +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.acidtbldefault + name: default.acidtbldefault + Truncated Path -> Alias: + /acidtbldefault [acidtbldefault] + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +