Index: build-common.xml
===================================================================
--- build-common.xml	(revision 1420918)
+++ build-common.xml	(working copy)
@@ -57,7 +57,7 @@
-
+
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java	(revision 1420918)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java	(working copy)
@@ -137,7 +137,7 @@
       addStatsTask(fsOp, mvTask, currTask, parseCtx.getConf());
     }

-    if ((mvTask != null) && !mvTask.isLocal()) {
+    if ((mvTask != null) && !mvTask.isLocal() && fsOp.getConf().canBeMerged()) {
       if (fsOp.getConf().isLinkedFileSink()) {
         // If the user has HIVEMERGEMAPREDFILES set to false, the idea was the
         // number of reducers are few, so the number of files anyway are small.
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java	(revision 1420918)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java	(working copy)
@@ -4488,9 +4488,6 @@
       ctx.setNumFiles(numFiles);
       ctx.setPartnCols(partnColsNoConvert);
       ctx.setTotalFiles(totalFiles);
-      //disable "merge mapfiles" and "merge mapred files".
-      HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVEMERGEMAPFILES, false);
-      HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVEMERGEMAPREDFILES, false);
     }
     return input;
   }
@@ -4872,12 +4869,21 @@

     RowSchema fsRS = new RowSchema(vecCol);

+    // The output files of a FileSink can be merged if they are either not being written to a table
+    // or are being written to a table which is either not bucketed or enforce bucketing is not set
+    // and the table is either not sorted or enforce sorting is not set
+    boolean canBeMerged = (dest_tab == null || !(dest_tab.getNumBuckets() > 0 &&
+        conf.getBoolVar(HiveConf.ConfVars.HIVEENFORCEBUCKETING)) ||
+        !(dest_tab.getSortCols() != null && dest_tab.getSortCols().size() > 0 &&
+        conf.getBoolVar(HiveConf.ConfVars.HIVEENFORCESORTING)));
+
     FileSinkDesc fileSinkDesc = new FileSinkDesc(
         queryTmpdir,
         table_desc,
         conf.getBoolVar(HiveConf.ConfVars.COMPRESSRESULT),
         currentTableId,
         rsCtx.isMultiFileSpray(),
+        canBeMerged,
         rsCtx.getNumFiles(),
         rsCtx.getTotalFiles(),
         rsCtx.getPartnCols(),
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java	(revision 1420918)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java	(working copy)
@@ -40,6 +40,9 @@
   private String compressCodec;
   private String compressType;
   private boolean multiFileSpray;
+  // Whether the files output by this FileSink can be merged, e.g. if they are to be put into a
+  // bucketed or sorted table/partition they cannot be merged.
+  private boolean canBeMerged;
   private int totalFiles;
   private ArrayList<ExprNodeDesc> partitionCols;
   private int numFiles;
@@ -68,14 +71,15 @@

   public FileSinkDesc(final String dirName, final TableDesc tableInfo,
       final boolean compressed, final int destTableId, final boolean multiFileSpray,
-      final int numFiles, final int totalFiles, final ArrayList<ExprNodeDesc> partitionCols,
-      final DynamicPartitionCtx dpCtx) {
+      final boolean canBeMerged, final int numFiles, final int totalFiles,
+      final ArrayList<ExprNodeDesc> partitionCols, final DynamicPartitionCtx dpCtx) {

     this.dirName = dirName;
     this.tableInfo = tableInfo;
     this.compressed = compressed;
     this.destTableId = destTableId;
     this.multiFileSpray = multiFileSpray;
+    this.canBeMerged = canBeMerged;
     this.numFiles = numFiles;
     this.totalFiles = totalFiles;
     this.partitionCols = partitionCols;
@@ -90,6 +94,7 @@
     this.compressed = compressed;
     destTableId = 0;
     this.multiFileSpray = false;
+    this.canBeMerged = false;
     this.numFiles = 1;
     this.totalFiles = 1;
     this.partitionCols = null;
@@ -98,7 +103,7 @@
   @Override
   public Object clone() throws CloneNotSupportedException {
     FileSinkDesc ret = new FileSinkDesc(dirName, tableInfo, compressed,
-        destTableId, multiFileSpray, numFiles, totalFiles,
+        destTableId, multiFileSpray, canBeMerged, numFiles, totalFiles,
         partitionCols, dpCtx);
     ret.setCompressCodec(compressCodec);
     ret.setCompressType(compressType);
@@ -184,6 +189,14 @@
     this.multiFileSpray = multiFileSpray;
   }

+  public boolean canBeMerged() {
+    return canBeMerged;
+  }
+
+  public void setCanBeMerged(boolean canBeMerged) {
+    this.canBeMerged = canBeMerged;
+  }
+
   /**
    * @return the totalFiles
    */
Index: ql/src/test/queries/clientpositive/bucket5.q
===================================================================
--- ql/src/test/queries/clientpositive/bucket5.q	(revision 0)
+++ ql/src/test/queries/clientpositive/bucket5.q	(working copy)
@@ -0,0 +1,31 @@
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+set hive.enforce.bucketing = true;
+set hive.enforce.sorting = true;
+set hive.exec.reducers.max = 1;
+set hive.merge.mapfiles = true;
+set hive.merge.mapredfiles = true;
+set mapred.reduce.tasks = 2;
+
+-- Tests that when a multi insert inserts into a bucketed table and a table which is not bucketed
+-- the bucketed table is not merged and the table which is not bucketed is
+
+CREATE TABLE bucketed_table(key INT, value STRING) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
+CREATE TABLE unbucketed_table(key INT, value STRING);
+
+EXPLAIN EXTENDED
+FROM src
+INSERT OVERWRITE TABLE bucketed_table SELECT key, value
+INSERT OVERWRITE TABLE unbucketed_table SELECT key, value cluster by key;
+
+FROM src
+INSERT OVERWRITE TABLE bucketed_table SELECT key, value
+INSERT OVERWRITE TABLE unbucketed_table SELECT key, value cluster by key;
+
+SELECT * FROM bucketed_table TABLESAMPLE (BUCKET 1 OUT OF 2) s LIMIT 10;
+SELECT * FROM bucketed_table TABLESAMPLE (BUCKET 2 OUT OF 2) s LIMIT 10;
+
+-- Should be 2 (not merged)
+SELECT COUNT(DISTINCT INPUT__FILE__NAME) FROM bucketed_table;
+
+-- Should be 1 (merged)
+SELECT COUNT(DISTINCT INPUT__FILE__NAME) FROM unbucketed_table;
Index: ql/src/test/results/clientpositive/bucket5.q.out
===================================================================
--- ql/src/test/results/clientpositive/bucket5.q.out	(revision 0)
+++ ql/src/test/results/clientpositive/bucket5.q.out	(working copy)
@@ -0,0 +1,545 @@
+PREHOOK: query: -- Tests that when a multi insert inserts into a bucketed table and a table which is not bucketed
+-- the bucketed table is not merged and the table which is not bucketed is
+
+CREATE TABLE bucketed_table(key INT, value STRING) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- Tests that when a multi insert inserts into a bucketed table and a table which is not bucketed
+-- the bucketed table is not merged and the table which is not bucketed is
+
+CREATE TABLE bucketed_table(key INT, value STRING) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucketed_table
+PREHOOK: query: CREATE TABLE unbucketed_table(key INT, value STRING)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE unbucketed_table(key INT, value STRING)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@unbucketed_table
+PREHOOK: query: EXPLAIN EXTENDED
+FROM src
+INSERT OVERWRITE TABLE bucketed_table SELECT key, value
+INSERT OVERWRITE TABLE unbucketed_table SELECT key, value cluster by key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN EXTENDED
+FROM src
+INSERT OVERWRITE TABLE bucketed_table SELECT key, value
+INSERT OVERWRITE TABLE unbucketed_table SELECT key, value cluster by key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME bucketed_table))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME unbucketed_table))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_CLUSTERBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+  Stage-2 is a root stage
+  Stage-0 depends on stages: Stage-2
+  Stage-3 depends on stages: Stage-0
+  Stage-4 depends on stages: Stage-2
+  Stage-10 depends on stages: Stage-4 , consists of Stage-7, Stage-6, Stage-8
+  Stage-7
+  Stage-1 depends on stages: Stage-7, Stage-6, Stage-9
+  Stage-5 depends on stages: Stage-1
+  Stage-6
+  Stage-8
+  Stage-9 depends on stages: Stage-8
+
+STAGE PLANS:
+  Stage: Stage-2
+    Map Reduce
+      Alias -> Map Operator Tree:
+        src
+          TableScan
+            alias: src
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+                    expr: value
+                    type: string
+              outputColumnNames: _col0, _col1
+              Reduce Output Operator
+                key expressions:
+                      expr: UDFToInteger(_col0)
+                      type: int
+                sort order: +
+                Map-reduce partition columns:
+                      expr: UDFToInteger(_col0)
+                      type: int
+                tag: -1
+                value expressions:
+                      expr: _col0
+                      type: string
+                      expr: _col1
+                      type: string
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+                    expr: value
+                    type: string
+              outputColumnNames: _col0, _col1
+              File Output Operator
+                compressed: false
+                GlobalTableId: 0
+#### A masked pattern was here ####
+                NumFilesPerFileSink: 1
+                table:
+                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                    properties:
+                      columns _col0,_col1
+                      columns.types string,string
+                      escape.delim \
+                TotalFiles: 1
+                GatherStats: false
+                MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: src
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.src
+              numFiles 1
+              numPartitions 0
+              numRows 0
+              rawDataSize 0
+              serialization.ddl struct src { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 5812
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.src
+                numFiles 1
+                numPartitions 0
+                numRows 0
+                rawDataSize 0
+                serialization.ddl struct src { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 5812
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.src
+            name: default.src
+      Reduce Operator Tree:
+        Extract
+          Select Operator
+            expressions:
+                  expr: UDFToInteger(_col0)
+                  type: int
+                  expr: _col1
+                  type: string
+            outputColumnNames: _col0, _col1
+            File Output Operator
+              compressed: false
+              GlobalTableId: 1
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    SORTBUCKETCOLSPREFIX TRUE
+                    bucket_count 2
+                    bucket_field_name key
+                    columns key,value
+                    columns.types int:string
+#### A masked pattern was here ####
+                    name default.bucketed_table
+                    serialization.ddl struct bucketed_table { i32 key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  name: default.bucketed_table
+              TotalFiles: 1
+              GatherStats: true
+              MultiFileSpray: false
+      Truncated Path -> Alias:
+        /src [src]
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+#### A masked pattern was here ####
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,value
+                columns.types int:string
+#### A masked pattern was here ####
+                name default.bucketed_table
+                serialization.ddl struct bucketed_table { i32 key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.bucketed_table
+#### A masked pattern was here ####
+
+  Stage: Stage-3
+    Stats-Aggr Operator
+#### A masked pattern was here ####
+
+  Stage: Stage-4
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+          Reduce Output Operator
+            key expressions:
+                  expr: _col0
+                  type: string
+            sort order: +
+            Map-reduce partition columns:
+                  expr: _col0
+                  type: string
+            tag: -1
+            value expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: string
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: -mr-10004
+            input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+            properties:
+              columns _col0,_col1
+              columns.types string,string
+              escape.delim \
+
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                properties:
+                  columns _col0,_col1
+                  columns.types string,string
+                  escape.delim \
+      Reduce Operator Tree:
+        Extract
+          Select Operator
+            expressions:
+                  expr: UDFToInteger(_col0)
+                  type: int
+                  expr: _col1
+                  type: string
+            outputColumnNames: _col0, _col1
+            File Output Operator
+              compressed: false
+              GlobalTableId: 2
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    bucket_count -1
+                    columns key,value
+                    columns.types int:string
+#### A masked pattern was here ####
+                    name default.unbucketed_table
+                    serialization.ddl struct unbucketed_table { i32 key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  name: default.unbucketed_table
+              TotalFiles: 1
+              GatherStats: true
+              MultiFileSpray: false
+      Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+  Stage: Stage-10
+    Conditional Operator
+
+  Stage: Stage-7
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+  Stage: Stage-1
+    Move Operator
+      tables:
+          replace: true
+#### A masked pattern was here ####
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types int:string
+#### A masked pattern was here ####
+                name default.unbucketed_table
+                serialization.ddl struct unbucketed_table { i32 key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.unbucketed_table
+#### A masked pattern was here ####
+
+  Stage: Stage-5
+    Stats-Aggr Operator
+#### A masked pattern was here ####
+
+  Stage: Stage-6
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+          File Output Operator
+            compressed: false
+            GlobalTableId: 0
+#### A masked pattern was here ####
+            NumFilesPerFileSink: 1
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                properties:
+                  bucket_count -1
+                  columns key,value
+                  columns.types int:string
+#### A masked pattern was here ####
+                  name default.unbucketed_table
+                  serialization.ddl struct unbucketed_table { i32 key, string value}
+                  serialization.format 1
+                  serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                name: default.unbucketed_table
+            TotalFiles: 1
+            GatherStats: false
+            MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: -ext-10005
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types int:string
+#### A masked pattern was here ####
+              name default.unbucketed_table
+              serialization.ddl struct unbucketed_table { i32 key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types int:string
+#### A masked pattern was here ####
+                name default.unbucketed_table
+                serialization.ddl struct unbucketed_table { i32 key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.unbucketed_table
+            name: default.unbucketed_table
+      Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+  Stage: Stage-8
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+          File Output Operator
+            compressed: false
+            GlobalTableId: 0
+#### A masked pattern was here ####
+            NumFilesPerFileSink: 1
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                properties:
+                  bucket_count -1
+                  columns key,value
+                  columns.types int:string
+#### A masked pattern was here ####
+                  name default.unbucketed_table
+                  serialization.ddl struct unbucketed_table { i32 key, string value}
+                  serialization.format 1
+                  serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                name: default.unbucketed_table
+            TotalFiles: 1
+            GatherStats: false
+            MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: -ext-10005
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types int:string
+#### A masked pattern was here ####
+              name default.unbucketed_table
+              serialization.ddl struct unbucketed_table { i32 key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types int:string
+#### A masked pattern was here ####
+                name default.unbucketed_table
+                serialization.ddl struct unbucketed_table { i32 key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.unbucketed_table
+            name: default.unbucketed_table
+      Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+  Stage: Stage-9
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+
+PREHOOK: query: FROM src
+INSERT OVERWRITE TABLE bucketed_table SELECT key, value
+INSERT OVERWRITE TABLE unbucketed_table SELECT key, value cluster by key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@bucketed_table
+PREHOOK: Output: default@unbucketed_table
+POSTHOOK: query: FROM src
+INSERT OVERWRITE TABLE bucketed_table SELECT key, value
+INSERT OVERWRITE TABLE unbucketed_table SELECT key, value cluster by key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@bucketed_table
+POSTHOOK: Output: default@unbucketed_table
+POSTHOOK: Lineage: bucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: bucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: SELECT * FROM bucketed_table TABLESAMPLE (BUCKET 1 OUT OF 2) s LIMIT 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucketed_table
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM bucketed_table TABLESAMPLE (BUCKET 1 OUT OF 2) s LIMIT 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucketed_table
+#### A masked pattern was here ####
+POSTHOOK: Lineage: bucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: bucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+0	val_0
+0	val_0
+0	val_0
+2	val_2
+4	val_4
+8	val_8
+10	val_10
+12	val_12
+12	val_12
+18	val_18
+PREHOOK: query: SELECT * FROM bucketed_table TABLESAMPLE (BUCKET 2 OUT OF 2) s LIMIT 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucketed_table
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM bucketed_table TABLESAMPLE (BUCKET 2 OUT OF 2) s LIMIT 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucketed_table
+#### A masked pattern was here ####
+POSTHOOK: Lineage: bucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: bucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+5	val_5
+5	val_5
+5	val_5
+9	val_9
+11	val_11
+15	val_15
+15	val_15
+17	val_17
+19	val_19
+27	val_27
+PREHOOK: query: -- Should be 2 (not merged)
+SELECT COUNT(DISTINCT INPUT__FILE__NAME) FROM bucketed_table
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucketed_table
+#### A masked pattern was here ####
+POSTHOOK: query: -- Should be 2 (not merged)
+SELECT COUNT(DISTINCT INPUT__FILE__NAME) FROM bucketed_table
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucketed_table
+#### A masked pattern was here ####
+POSTHOOK: Lineage: bucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: bucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+2
+PREHOOK: query: -- Should be 1 (merged)
+SELECT COUNT(DISTINCT INPUT__FILE__NAME) FROM unbucketed_table
+PREHOOK: type: QUERY
+PREHOOK: Input: default@unbucketed_table
+#### A masked pattern was here ####
+POSTHOOK: query: -- Should be 1 (merged)
+SELECT COUNT(DISTINCT INPUT__FILE__NAME) FROM unbucketed_table
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@unbucketed_table
+#### A masked pattern was here ####
+POSTHOOK: Lineage: bucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: bucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+1
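For illustration only, and not part of the patch above: a minimal, self-contained sketch of the merge-eligibility rule that the SemanticAnalyzer change encodes, mirroring the canBeMerged expression as written in the patch. The TableInfo and Settings classes below are hypothetical stand-ins for the destination-table metadata and the hive.enforce.bucketing / hive.enforce.sorting settings, not Hive classes.

```java
// Illustrative sketch; TableInfo and Settings are made-up stand-ins, not Hive APIs.
// It mirrors the canBeMerged condition added in SemanticAnalyzer: merging of the
// FileSink output stays enabled unless the destination enforces bucketing on a
// bucketed table and enforces sorting on a sorted table.
public final class MergeEligibility {

  // Hypothetical destination-table metadata.
  static final class TableInfo {
    final int numBuckets;
    final int numSortCols;
    TableInfo(int numBuckets, int numSortCols) {
      this.numBuckets = numBuckets;
      this.numSortCols = numSortCols;
    }
  }

  // Hypothetical session settings (hive.enforce.bucketing / hive.enforce.sorting).
  static final class Settings {
    final boolean enforceBucketing;
    final boolean enforceSorting;
    Settings(boolean enforceBucketing, boolean enforceSorting) {
      this.enforceBucketing = enforceBucketing;
      this.enforceSorting = enforceSorting;
    }
  }

  static boolean canBeMerged(TableInfo destTab, Settings conf) {
    if (destTab == null) {
      return true; // not writing to a table: output files are always mergeable
    }
    boolean bucketsEnforced = destTab.numBuckets > 0 && conf.enforceBucketing;
    boolean sortEnforced = destTab.numSortCols > 0 && conf.enforceSorting;
    // Same shape as the patched expression: dest_tab == null || !bucketsEnforced || !sortEnforced
    return !bucketsEnforced || !sortEnforced;
  }

  public static void main(String[] args) {
    Settings enforced = new Settings(true, true);
    // bucketed and sorted destination (like bucketed_table in bucket5.q): not merged
    System.out.println(canBeMerged(new TableInfo(2, 1), enforced)); // false
    // plain destination (like unbucketed_table): merged
    System.out.println(canBeMerged(new TableInfo(0, 0), enforced)); // true
    // not a table sink at all: merged
    System.out.println(canBeMerged(null, enforced));                // true
  }
}
```

This matches what bucket5.q checks: the bucketed, sorted destination keeps its two reducer outputs (count 2), while the plain destination's outputs are merged into one file (count 1).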