diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index fb3570b..3d2137b 100644
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -635,7 +635,7 @@
     // standard error allowed for ndv estimates. A lower value indicates higher accuracy and a
     // higher compute cost.
     HIVE_STATS_NDV_ERROR("hive.stats.ndv.error", (float)20.0),
-    HIVE_STATS_KEY_PREFIX_MAX_LENGTH("hive.stats.key.prefix.max.length", 200),
+    HIVE_STATS_KEY_PREFIX_MAX_LENGTH("hive.stats.key.prefix.max.length", 150),
     HIVE_STATS_KEY_PREFIX("hive.stats.key.prefix", ""), // internal usage only
 
     // Concurrency
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
index 7c78b38..85a1192 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
@@ -1114,7 +1114,7 @@ private String createKeyForStatsPublisher(String taskID, String spSpec, String f
       }
     }
     String keyPrefix = Utilities.getHashedStatsPrefix(
-        conf.getStatsAggPrefix() + spSpec + newFspKey + Path.SEPARATOR,
+        conf.getStatsAggPrefix() + spSpec + newFspKey,
         conf.getMaxStatsKeyPrefixLength());
     key = keyPrefix + storedAsDirPostFix + taskID;
     return key;
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
index e538092..ba02130 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
@@ -299,7 +299,7 @@ private void publishStats() throws HiveException {
         // In case of a partition, the key for temp storage is
         // "tableName + partitionSpecs + taskID"
         String keyPrefix = Utilities.getHashedStatsPrefix(
-            conf.getStatsAggPrefix() + pspecs + Path.SEPARATOR, conf.getMaxStatsKeyPrefixLength());
+            conf.getStatsAggPrefix() + pspecs, conf.getMaxStatsKeyPrefixLength());
         key = keyPrefix + taskID;
       }
       for(String statType : stats.get(pspecs).getStoredStats()) {
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
index b575e22..81b59e8 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
@@ -2287,11 +2287,11 @@ public static StatsPublisher getStatsPublisher(JobConf jc) {
    * @return
    */
   public static String getHashedStatsPrefix(String statsPrefix, int maxPrefixLength) {
-    String ret = statsPrefix;
+    String ret = appendPathSeparator(statsPrefix);
     if (maxPrefixLength >= 0 && statsPrefix.length() > maxPrefixLength) {
       try {
         MessageDigest digester = MessageDigest.getInstance("MD5");
-        digester.update(statsPrefix.getBytes());
+        digester.update(ret.getBytes());
         ret = new String(digester.digest()) + Path.SEPARATOR;
       } catch (NoSuchAlgorithmException e) {
         throw new RuntimeException(e);
@@ -2300,6 +2300,13 @@ public static String getHashedStatsPrefix(String statsPrefix, int maxPrefixLengt
     return ret;
   }
 
+  private static String appendPathSeparator(String path) {
+    if (!path.endsWith(Path.SEPARATOR)) {
+      path = path + Path.SEPARATOR;
+    }
+    return path;
+  }
+
   public static void setColumnNameList(JobConf jobConf, Operator op) {
     RowSchema rowSchema = op.getSchema();
     if (rowSchema == null) {
diff --git ql/src/test/queries/clientpositive/stats_list_bucket.q ql/src/test/queries/clientpositive/stats_list_bucket.q
new file mode 100644
index 0000000..5982643
--- /dev/null
+++ ql/src/test/queries/clientpositive/stats_list_bucket.q
@@ -0,0 +1,45 @@
+
+set hive.mapred.supports.subdirectories=true;
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+drop table stats_list_bucket;
+drop table stats_list_bucket_1;
+
+create table stats_list_bucket (
+  c1 string,
+  c2 string
+) partitioned by (ds string, hr string)
+skewed by (c1, c2) on (('466','val_466'),('287','val_287'),('82','val_82'))
+stored as directories
+stored as rcfile;
+
+set hive.stats.key.prefix.max.length=1;
+
+-- Make sure we use hashed IDs during stats publishing.
+-- Try partitioned table with list bucketing.
+-- The stats should show 500 rows loaded, as many rows as the src table has.
+
+insert overwrite table stats_list_bucket partition (ds = '2008-04-08', hr = '11')
+  select key, value from src;
+
+desc formatted stats_list_bucket partition (ds = '2008-04-08', hr = '11');
+
+-- Also try non-partitioned table with list bucketing.
+-- Stats should show the same number of rows.
+
+create table stats_list_bucket_1 (
+  c1 string,
+  c2 string
+)
+skewed by (c1, c2) on (('466','val_466'),('287','val_287'),('82','val_82'))
+stored as directories
+stored as rcfile;
+
+insert overwrite table stats_list_bucket_1
+  select key, value from src;
+
+desc formatted stats_list_bucket_1;
+
+drop table stats_list_bucket;
+drop table stats_list_bucket_1;
diff --git ql/src/test/results/clientpositive/stats_list_bucket.q.out ql/src/test/results/clientpositive/stats_list_bucket.q.out
new file mode 100644
index 0000000..6a319a0
--- /dev/null
+++ ql/src/test/results/clientpositive/stats_list_bucket.q.out
@@ -0,0 +1,199 @@
+PREHOOK: query: -- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+drop table stats_list_bucket
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: -- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+drop table stats_list_bucket
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table stats_list_bucket_1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table stats_list_bucket_1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table stats_list_bucket (
+  c1 string,
+  c2 string
+) partitioned by (ds string, hr string)
+skewed by (c1, c2) on (('466','val_466'),('287','val_287'),('82','val_82'))
+stored as directories
+stored as rcfile
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table stats_list_bucket (
+  c1 string,
+  c2 string
+) partitioned by (ds string, hr string)
+skewed by (c1, c2) on (('466','val_466'),('287','val_287'),('82','val_82'))
+stored as directories
+stored as rcfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@stats_list_bucket
+PREHOOK: query: -- Make sure we use hashed IDs during stats publishing.
+-- Try partitioned table with list bucketing.
+-- The stats should show 500 rows loaded, as many rows as the src table has.
+
+insert overwrite table stats_list_bucket partition (ds = '2008-04-08', hr = '11')
+  select key, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@stats_list_bucket@ds=2008-04-08/hr=11
+POSTHOOK: query: -- Make sure we use hashed IDs during stats publishing.
+-- Try partitioned table with list bucketing.
+-- The stats should show 500 rows loaded, as many rows as the src table has.
+
+insert overwrite table stats_list_bucket partition (ds = '2008-04-08', hr = '11')
+  select key, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@stats_list_bucket@ds=2008-04-08/hr=11
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c1 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: desc formatted stats_list_bucket partition (ds = '2008-04-08', hr = '11')
+PREHOOK: type: DESCTABLE
+POSTHOOK: query: desc formatted stats_list_bucket partition (ds = '2008-04-08', hr = '11')
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c1 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+# col_name                data_type             comment
+
+c1                        string                None
+c2                        string                None
+
+# Partition Information
+# col_name                data_type             comment
+
+ds                        string                None
+hr                        string                None
+
+# Detailed Partition Information
+Partition Value:          [2008-04-08, 11]
+Database:                 default
+Table:                    stats_list_bucket
+#### A masked pattern was here ####
+Protect Mode:             None
+#### A masked pattern was here ####
+Partition Parameters:
+	numFiles              4
+	numRows               500
+	rawDataSize           4812
+	totalSize             5522
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library:            org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+InputFormat:              org.apache.hadoop.hive.ql.io.RCFileInputFormat
+OutputFormat:             org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+Compressed:               No
+Num Buckets:              -1
+Bucket Columns:           []
+Sort Columns:             []
+Stored As SubDirectories: Yes
+Skewed Columns:           [c1, c2]
+Skewed Values:            [[466, val_466], [287, val_287], [82, val_82]]
+#### A masked pattern was here ####
+Skewed Value to Truncated Path: {[82, val_82]=/stats_list_bucket/ds=2008-04-08/hr=11/c1=82/c2=val_82, [466, val_466]=/stats_list_bucket/ds=2008-04-08/hr=11/c1=466/c2=val_466, [287, val_287]=/stats_list_bucket/ds=2008-04-08/hr=11/c1=287/c2=val_287}
+Storage Desc Params:
+	serialization.format  1
+PREHOOK: query: -- Also try non-partitioned table with list bucketing.
+-- Stats should show the same number of rows.
+
+create table stats_list_bucket_1 (
+  c1 string,
+  c2 string
+)
+skewed by (c1, c2) on (('466','val_466'),('287','val_287'),('82','val_82'))
+stored as directories
+stored as rcfile
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- Also try non-partitioned table with list bucketing.
+-- Stats should show the same number of rows.
+
+create table stats_list_bucket_1 (
+  c1 string,
+  c2 string
+)
+skewed by (c1, c2) on (('466','val_466'),('287','val_287'),('82','val_82'))
+stored as directories
+stored as rcfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@stats_list_bucket_1
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c1 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table stats_list_bucket_1
+  select key, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@stats_list_bucket_1
+POSTHOOK: query: insert overwrite table stats_list_bucket_1
+  select key, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@stats_list_bucket_1
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c1 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket_1.c1 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket_1.c2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: desc formatted stats_list_bucket_1
+PREHOOK: type: DESCTABLE
+POSTHOOK: query: desc formatted stats_list_bucket_1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c1 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket_1.c1 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket_1.c2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+# col_name                data_type             comment
+
+c1                        string                None
+c2                        string                None
+
+# Detailed Table Information
+Database:                 default
+#### A masked pattern was here ####
+Protect Mode:             None
+Retention:                0
+#### A masked pattern was here ####
+Table Type:               MANAGED_TABLE
+Table Parameters:
+	numFiles              4
+	numPartitions         0
+	numRows               500
+	rawDataSize           4812
+	totalSize             5522
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library:            org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+InputFormat:              org.apache.hadoop.hive.ql.io.RCFileInputFormat
+OutputFormat:             org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+Compressed:               No
+Num Buckets:              -1
+Bucket Columns:           []
+Sort Columns:             []
+Stored As SubDirectories: Yes
+Skewed Columns:           [c1, c2]
+Skewed Values:            [[466, val_466], [287, val_287], [82, val_82]]
+Storage Desc Params:
+	serialization.format  1
+PREHOOK: query: drop table stats_list_bucket
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@stats_list_bucket
+PREHOOK: Output: default@stats_list_bucket
+POSTHOOK: query: drop table stats_list_bucket
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@stats_list_bucket
+POSTHOOK: Output: default@stats_list_bucket
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c1 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket_1.c1 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket_1.c2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: drop table stats_list_bucket_1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@stats_list_bucket_1
+PREHOOK: Output: default@stats_list_bucket_1
+POSTHOOK: query: drop table stats_list_bucket_1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@stats_list_bucket_1
+POSTHOOK: Output: default@stats_list_bucket_1
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c1 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket PARTITION(ds=2008-04-08,hr=11).c2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket_1.c1 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_list_bucket_1.c2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
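
Note on the Utilities.java hunks above: the trailing Path.SEPARATOR is now appended inside getHashedStatsPrefix (via appendPathSeparator) and the separator-terminated prefix is what gets hashed, which is why FileSinkOperator and TableScanOperator no longer append the separator themselves. Below is a minimal, self-contained sketch of that behavior, not Hive code: SEPARATOR stands in for org.apache.hadoop.fs.Path.SEPARATOR ("/"), hashedStatsPrefix mirrors the patched method, and the sample prefix in main is made up for illustration.

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class StatsPrefixSketch {
  // Stand-in for org.apache.hadoop.fs.Path.SEPARATOR, which is "/".
  private static final String SEPARATOR = "/";

  // Mirrors the patched Utilities.getHashedStatsPrefix: normalize the prefix to
  // end with the separator first, then hash that normalized prefix whenever the
  // caller-supplied prefix is longer than the configured maximum length.
  static String hashedStatsPrefix(String statsPrefix, int maxPrefixLength) {
    String ret = statsPrefix.endsWith(SEPARATOR) ? statsPrefix : statsPrefix + SEPARATOR;
    if (maxPrefixLength >= 0 && statsPrefix.length() > maxPrefixLength) {
      try {
        MessageDigest digester = MessageDigest.getInstance("MD5");
        digester.update(ret.getBytes());
        ret = new String(digester.digest()) + SEPARATOR;
      } catch (NoSuchAlgorithmException e) {
        throw new RuntimeException(e);
      }
    }
    return ret;
  }

  public static void main(String[] args) {
    // Hypothetical aggregation prefix, for illustration only.
    String prefix = "default.stats_list_bucket/ds=2008-04-08/hr=11";
    // Large limit: the prefix is only normalized to end with the separator.
    System.out.println(hashedStatsPrefix(prefix, 200));
    // Tiny limit, as in the test's hive.stats.key.prefix.max.length=1: the
    // normalized prefix is replaced by its MD5 digest plus the separator.
    System.out.println(hashedStatsPrefix(prefix, 1).endsWith(SEPARATOR)); // true
  }
}

With hive.stats.key.prefix.max.length=1, as set in stats_list_bucket.q, every key prefix is hashed, so the test exercises the hashed-ID path for both the partitioned and the non-partitioned list-bucketed table while still expecting 500 rows in the published stats.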