Index: ql/src/test/results/clientpositive/sample10.q.out =================================================================== --- ql/src/test/results/clientpositive/sample10.q.out (revision 0) +++ ql/src/test/results/clientpositive/sample10.q.out (revision 0) @@ -0,0 +1,441 @@ +PREHOOK: query: -- EXCLUDE_HADOOP_MAJOR_VERSIONS(0.17, 0.18, 0.19) + +create table srcpartbucket (key string, value string) partitioned by (ds string, hr string) clustered by (key) into 4 buckets +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- EXCLUDE_HADOOP_MAJOR_VERSIONS(0.17, 0.18, 0.19) + +create table srcpartbucket (key string, value string) partitioned by (ds string, hr string) clustered by (key) into 4 buckets +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@srcpartbucket +PREHOOK: query: insert overwrite table srcpartbucket partition(ds, hr) select * from srcpart where ds is not null and key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +POSTHOOK: query: insert overwrite table srcpartbucket partition(ds, hr) select * from srcpart where ds is not null and key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +POSTHOOK: Output: default@srcpartbucket@ds=2008-04-08/hr=11 +POSTHOOK: Output: default@srcpartbucket@ds=2008-04-08/hr=12 +POSTHOOK: Output: default@srcpartbucket@ds=2008-04-09/hr=11 +POSTHOOK: Output: default@srcpartbucket@ds=2008-04-09/hr=12 +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +PREHOOK: query: explain extended +select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 4 on key) where ds is not null group by ds +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 4 on key) where ds is not null group by ds +POSTHOOK: type: QUERY +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: 
Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF srcpartbucket (TOK_TABLESAMPLE 1 4 (TOK_TABLE_OR_COL key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL ds)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_WHERE (TOK_FUNCTION TOK_ISNOTNULL (TOK_TABLE_OR_COL ds))) (TOK_GROUPBY (TOK_TABLE_OR_COL ds)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + srcpartbucket + TableScan + alias: srcpartbucket + Filter Operator + isSamplingPred: false + predicate: + expr: ((((hash(key) & 2147483647) % 4) = 0) and ds is not null) + type: boolean + Filter Operator + isSamplingPred: true + predicate: + expr: (((hash(key) & 2147483647) % 4) = 0) + type: boolean + Filter Operator + isSamplingPred: false + predicate: + expr: ds is not null + type: boolean + Select Operator + expressions: + expr: ds + type: string + outputColumnNames: ds + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: ds + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: + file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-08/hr=11/attempt_local_0001_r_000000_0 [srcpartbucket] + file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-08/hr=12/attempt_local_0001_r_000000_0 [srcpartbucket] + file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-09/hr=11/attempt_local_0001_r_000000_0 [srcpartbucket] + file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-09/hr=12/attempt_local_0001_r_000000_0 [srcpartbucket] + Path -> Partition: + file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-08/hr=11/attempt_local_0001_r_000000_0 + Partition + base file name: attempt_local_0001_r_000000_0 + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + partition values: + ds 2008-04-08 + hr 11 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat + location 
file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket + name srcpartbucket + partition_columns ds/hr + serialization.ddl struct srcpartbucket { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + transient_lastDdlTime 1277145923 + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat + location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket + name srcpartbucket + partition_columns ds/hr + serialization.ddl struct srcpartbucket { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + transient_lastDdlTime 1277145923 + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: srcpartbucket + name: srcpartbucket + file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-08/hr=12/attempt_local_0001_r_000000_0 + Partition + base file name: attempt_local_0001_r_000000_0 + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + partition values: + ds 2008-04-08 + hr 12 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat + location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket + name srcpartbucket + partition_columns ds/hr + serialization.ddl struct srcpartbucket { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + transient_lastDdlTime 1277145923 + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat + location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket + name srcpartbucket + partition_columns ds/hr + serialization.ddl struct srcpartbucket { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + transient_lastDdlTime 1277145923 + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: srcpartbucket + name: srcpartbucket + file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-09/hr=11/attempt_local_0001_r_000000_0 + Partition + base file name: attempt_local_0001_r_000000_0 + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + partition values: + ds 2008-04-09 + hr 11 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string + 
file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat + location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket + name srcpartbucket + partition_columns ds/hr + serialization.ddl struct srcpartbucket { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + transient_lastDdlTime 1277145923 + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat + location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket + name srcpartbucket + partition_columns ds/hr + serialization.ddl struct srcpartbucket { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + transient_lastDdlTime 1277145923 + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: srcpartbucket + name: srcpartbucket + file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-09/hr=12/attempt_local_0001_r_000000_0 + Partition + base file name: attempt_local_0001_r_000000_0 + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + partition values: + ds 2008-04-09 + hr 12 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat + location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket + name srcpartbucket + partition_columns ds/hr + serialization.ddl struct srcpartbucket { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + transient_lastDdlTime 1277145923 + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string + file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat + file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat + location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket + name srcpartbucket + partition_columns ds/hr + serialization.ddl struct srcpartbucket { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + transient_lastDdlTime 1277145923 + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: srcpartbucket + name: srcpartbucket + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output 
Operator + compressed: false + GlobalTableId: 0 + directory: file:/data/users/nzhang/work/999/apache-hive/build/ql/scratchdir/hive_2010-06-21_11-45-29_813_4438091765019255104/10001 + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + serialization.format 1 + TotalFiles: 1 + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 4 on key) where ds is not null group by ds +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=12 +PREHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=11 +PREHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=12 +PREHOOK: Output: file:/tmp/nzhang/hive_2010-06-21_11-45-30_185_5515955290012905688/10000 +POSTHOOK: query: select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 4 on key) where ds is not null group by ds +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=12 +POSTHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=11 +POSTHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=12 +POSTHOOK: Output: file:/tmp/nzhang/hive_2010-06-21_11-45-30_185_5515955290012905688/10000 +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +2008-04-08 10 +2008-04-09 10 +PREHOOK: query: select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 2 on key) where ds is not null group by ds +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=12 +PREHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=11 +PREHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=12 +PREHOOK: Output: file:/tmp/nzhang/hive_2010-06-21_11-45-35_252_5650802559039809329/10000 +POSTHOOK: query: select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 2 on key) where ds is not null group by ds +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=12 +POSTHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=11 +POSTHOOK: Input: 
default@srcpartbucket@ds=2008-04-09/hr=12 +POSTHOOK: Output: file:/tmp/nzhang/hive_2010-06-21_11-45-35_252_5650802559039809329/10000 +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +2008-04-08 12 +2008-04-09 12 +PREHOOK: query: select * from srcpartbucket where ds is not null +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=12 +PREHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=11 +PREHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=12 +PREHOOK: Output: file:/tmp/nzhang/hive_2010-06-21_11-45-40_176_6924299231993417982/10000 +POSTHOOK: query: select * from srcpartbucket where ds is not null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=12 +POSTHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=11 +POSTHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=12 +POSTHOOK: Output: file:/tmp/nzhang/hive_2010-06-21_11-45-40_176_6924299231993417982/10000 +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ] +POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ] +0 val_0 2008-04-08 11 +4 val_4 2008-04-08 11 +8 val_8 2008-04-08 11 +0 val_0 2008-04-08 11 +0 val_0 2008-04-08 11 +5 val_5 2008-04-08 11 +5 val_5 2008-04-08 11 +2 val_2 2008-04-08 11 +5 val_5 2008-04-08 11 +9 val_9 
2008-04-08 11
+0 val_0 2008-04-08 12
+4 val_4 2008-04-08 12
+8 val_8 2008-04-08 12
+0 val_0 2008-04-08 12
+0 val_0 2008-04-08 12
+5 val_5 2008-04-08 12
+5 val_5 2008-04-08 12
+2 val_2 2008-04-08 12
+5 val_5 2008-04-08 12
+9 val_9 2008-04-08 12
+0 val_0 2008-04-09 11
+4 val_4 2008-04-09 11
+8 val_8 2008-04-09 11
+0 val_0 2008-04-09 11
+0 val_0 2008-04-09 11
+5 val_5 2008-04-09 11
+5 val_5 2008-04-09 11
+2 val_2 2008-04-09 11
+5 val_5 2008-04-09 11
+9 val_9 2008-04-09 11
+0 val_0 2008-04-09 12
+4 val_4 2008-04-09 12
+8 val_8 2008-04-09 12
+0 val_0 2008-04-09 12
+0 val_0 2008-04-09 12
+5 val_5 2008-04-09 12
+5 val_5 2008-04-09 12
+2 val_2 2008-04-09 12
+5 val_5 2008-04-09 12
+9 val_9 2008-04-09 12
+PREHOOK: query: drop table srcpartbucket
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table srcpartbucket
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: default@srcpartbucket
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
Index: ql/src/test/queries/clientpositive/sample10.q
===================================================================
--- ql/src/test/queries/clientpositive/sample10.q	(revision 0)
+++ ql/src/test/queries/clientpositive/sample10.q	(revision 0)
@@ -0,0 +1,24 @@
+
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.enforce.bucketing=true;
+set hive.exec.reducers.max=4;
+set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+set hive.default.fileformat=RCFILE;
+
+-- EXCLUDE_HADOOP_MAJOR_VERSIONS(0.17, 0.18, 0.19)
+
+create table srcpartbucket (key string, value string) partitioned by (ds string, hr string) clustered by (key) into 4 buckets;
+
+insert overwrite table srcpartbucket partition(ds, hr) select * from srcpart where ds is not null and key < 10;
+
+explain extended
+select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 4 on key) where ds is not null group by ds;
+
+select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 4 on key) where ds is not null group by ds;
+
+select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 2 on key) where ds is not null group by ds;
+
+select * from srcpartbucket where ds is not null;
+
+drop table srcpartbucket;
Index: ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java	(revision 956664)
+++ ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java	(working copy)
@@ -23,9 +23,11 @@
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.Map;
 import java.util.Queue;
+import java.util.Set;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -36,9 +38,9 @@
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.plan.PartitionDesc;
 import org.apache.hadoop.hive.ql.plan.TableDesc;
-import org.apache.hadoop.hive.shims.ShimLoader;
 import org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim;
 import org.apache.hadoop.hive.shims.HadoopShims.InputSplitShim;
+import org.apache.hadoop.hive.shims.ShimLoader;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
@@ -230,8 +232,8 @@
     // combine splits only from same tables and same partitions. Do not combine splits from multiple
     // tables or multiple partitions.
     Path[] paths = combine.getInputPathsShim(job);
+    Set<Path> poolSet = new HashSet<Path>();
     for (Path path : paths) {
-      LOG.info("CombineHiveInputSplit creating pool for " + path);
       PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, path);
       TableDesc tableDesc = part.getTableDesc();
 
@@ -283,7 +285,24 @@
         return super.getSplits(job, numSplits);
       }
 
-      combine.createPool(job, new CombineFilter(path));
+      // In the case of tablesample, the input paths are pointing to files rather than directories.
+      // We need to get the parent directory as the filtering path so that all files in the same
+      // parent directory will be grouped into one pool but not files from different parent
+      // directories. This guarantees that a split will combine all files in the same partition
+      // but won't cross multiple partitions.
+      Path filterPath = path;
+      if (!path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not a directory
+        filterPath = path.getParent();
+      }
+      if (!poolSet.contains(filterPath)) {
+        LOG.info("CombineHiveInputSplit creating pool for " + path +
+            "; using filter path " + filterPath);
+        combine.createPool(job, new CombineFilter(filterPath));
+        poolSet.add(filterPath);
+      } else {
+        LOG.info("CombineHiveInputSplit: pool is already created for " + path +
+            "; using filter path " + filterPath);
+      }
     }
     InputSplitShim[] iss = combine.getSplits(job, 1);
     for (InputSplitShim is : iss) {
@@ -389,10 +408,12 @@
     private final String pString;
 
     // store a path prefix in this TestFilter
+    // PRECONDITION: p should always be a directory
    public CombineFilter(Path p) {
      // we need to keep the path part only because the Hadoop CombineFileInputFormat will
      // pass the path part only to accept().
-      pString = p.toUri().getPath().toString() + File.separator;
+      // Append a trailing separator to the path to prevent partial matching.
+      pString = p.toUri().getPath().toString() + File.separator;
    }

    // returns true if the specified path matches the prefix stored
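
The comment block in the last hunk explains the core idea of the patch: under TABLESAMPLE the input paths point at bucket files rather than partition directories, so the parent directory must be used as the pooling key, and at most one pool is created per directory. The following is a minimal, self-contained Java sketch of that grouping behavior only; it is not the Hive code. The class and method names (PoolGroupingSketch, resolvePoolDirectories) and the use of java.nio.file in place of Hadoop's Path/FileSystem API are assumptions made purely for illustration.

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/** Illustrative sketch of the "one pool per partition directory" grouping. */
public class PoolGroupingSketch {

  /**
   * For each input path, use the path itself if it is a directory, otherwise its
   * parent directory (the tablesample case, where inputs point at bucket files).
   * Each distinct directory becomes one pool; repeated directories are skipped,
   * mirroring the poolSet check in the patch.
   */
  static Map<Path, Set<Path>> resolvePoolDirectories(List<Path> inputPaths) {
    Map<Path, Set<Path>> pools = new LinkedHashMap<>();
    for (Path path : inputPaths) {
      Path filterPath = Files.isDirectory(path) ? path : path.getParent();
      pools.computeIfAbsent(filterPath, k -> new LinkedHashSet<>()).add(path);
    }
    return pools;
  }

  public static void main(String[] args) {
    // Hypothetical bucket files from two partitions of srcpartbucket.
    List<Path> inputs = List.of(
        Paths.get("/warehouse/srcpartbucket/ds=2008-04-08/hr=11/bucket_0"),
        Paths.get("/warehouse/srcpartbucket/ds=2008-04-08/hr=11/bucket_1"),
        Paths.get("/warehouse/srcpartbucket/ds=2008-04-09/hr=12/bucket_0"));
    // Prints two pools, one per partition directory.
    resolvePoolDirectories(inputs).forEach(
        (dir, files) -> System.out.println("pool " + dir + " -> " + files));
  }
}

Running the sketch produces one pool per partition directory, which is the invariant the patch enforces: a combined split may merge files within the same partition but never across partitions.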