diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java index e386717..1657acf 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java @@ -229,9 +229,11 @@ private void commit(FileSystem fs) throws HiveException { needToRename = false; } } - if (needToRename && outPaths[idx] != null && !fs.rename(outPaths[idx], finalPaths[idx])) { - throw new HiveException("Unable to rename output from: " + - outPaths[idx] + " to: " + finalPaths[idx]); + if (needToRename && outPaths[idx] != null) { + String name = finalPaths[idx].getName(); + for (int counter = 1; !fs.rename(outPaths[idx],finalPaths[idx]); counter++) { + finalPaths[idx] = new Path(finalPaths[idx].getParent(), name + "_copy_" + counter); + } } updateProgress(); } catch (IOException e) { @@ -890,20 +892,26 @@ protected String generateListBucketingDirName(Object row) { skewedValsCandidate.add(posPair.getSkewColPosition(), standObjs.get(posPair.getTblColPosition()).toString()); } - /* The row matches skewed column names. */ - if (allSkewedVals.contains(skewedValsCandidate)) { - /* matches skewed values. */ - lbDirName = FileUtils.makeListBucketingDirName(skewedCols, skewedValsCandidate); - locationMap.put(skewedValsCandidate, lbDirName); - } else { - /* create default directory. */ - lbDirName = FileUtils.makeDefaultListBucketingDirName(skewedCols, - lbCtx.getDefaultDirName()); - List defaultKey = Lists.newArrayList(lbCtx.getDefaultKey()); - if (!locationMap.containsKey(defaultKey)) { - locationMap.put(defaultKey, lbDirName); + + // Get the mapped folder first in case the location is set explicitly. + // Create the listBucketing folder only when it's not set. + lbDirName = locationMap.get(skewedValsCandidate); + if (lbDirName == null) { + if (allSkewedVals.contains(skewedValsCandidate)) { + /* matches skewed values. */ + lbDirName = FileUtils.makeListBucketingDirName(skewedCols, skewedValsCandidate); + locationMap.put(skewedValsCandidate, lbDirName); + } else { + /* create default directory. */ + lbDirName = FileUtils.makeDefaultListBucketingDirName(skewedCols, + lbCtx.getDefaultDirName()); + List defaultKey = Lists.newArrayList(lbCtx.getDefaultKey()); + if (!locationMap.containsKey(defaultKey)) { + locationMap.put(defaultKey, lbDirName); + } } } + return lbDirName; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java index da46854..1cfc9cc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java @@ -1564,17 +1564,6 @@ public Partition loadPartition(Path loadPath, Table tbl, //column stats will be inaccurate StatsSetupConst.clearColumnStatsState(newTPart.getParameters()); - // recreate the partition if it existed before - if (isSkewedStoreAsSubdir) { - org.apache.hadoop.hive.metastore.api.Partition newCreatedTpart = newTPart.getTPartition(); - SkewedInfo skewedInfo = newCreatedTpart.getSd().getSkewedInfo(); - /* Construct list bucketing location mappings from sub-directory name. */ - Map, String> skewedColValueLocationMaps = constructListBucketingLocationMap( - newPartPath, skewedInfo); - /* Add list bucketing location mappings. */ - skewedInfo.setSkewedColValueLocationMaps(skewedColValueLocationMaps); - newCreatedTpart.getSd().setSkewedInfo(skewedInfo); - } if (!this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { StatsSetupConst.setBasicStatsState(newTPart.getParameters(), StatsSetupConst.FALSE); } @@ -1634,90 +1623,9 @@ private void setStatsPropAndAlterPartition(boolean hasFollowingStatsTask, Table } /** - * Walk through sub-directory tree to construct list bucketing location map. - * - * @param fSta - * @param fSys - * @param skewedColValueLocationMaps - * @param newPartPath - * @param skewedInfo - * @throws IOException - */ -private void walkDirTree(FileStatus fSta, FileSystem fSys, - Map, String> skewedColValueLocationMaps, Path newPartPath, SkewedInfo skewedInfo) - throws IOException { - /* Base Case. It's leaf. */ - if (!fSta.isDir()) { - /* construct one location map if not exists. */ - constructOneLBLocationMap(fSta, skewedColValueLocationMaps, newPartPath, skewedInfo); - return; - } - - /* dfs. */ - FileStatus[] children = fSys.listStatus(fSta.getPath(), FileUtils.HIDDEN_FILES_PATH_FILTER); - if (children != null) { - for (FileStatus child : children) { - walkDirTree(child, fSys, skewedColValueLocationMaps, newPartPath, skewedInfo); - } - } -} - -/** - * Construct a list bucketing location map - * @param fSta - * @param skewedColValueLocationMaps - * @param newPartPath - * @param skewedInfo - */ -private void constructOneLBLocationMap(FileStatus fSta, - Map, String> skewedColValueLocationMaps, - Path newPartPath, SkewedInfo skewedInfo) { - Path lbdPath = fSta.getPath().getParent(); - List skewedValue = new ArrayList(); - String lbDirName = FileUtils.unescapePathName(lbdPath.toString()); - String partDirName = FileUtils.unescapePathName(newPartPath.toString()); - String lbDirSuffix = lbDirName.replace(partDirName, ""); - String[] dirNames = lbDirSuffix.split(Path.SEPARATOR); - for (String dirName : dirNames) { - if ((dirName != null) && (dirName.length() > 0)) { - // Construct skewed-value to location map except default directory. - // why? query logic knows default-dir structure and don't need to get from map - if (!dirName - .equalsIgnoreCase(ListBucketingPrunerUtils.HIVE_LIST_BUCKETING_DEFAULT_DIR_NAME)) { - String[] kv = dirName.split("="); - if (kv.length == 2) { - skewedValue.add(kv[1]); - } - } - } - } - if ((skewedValue.size() > 0) && (skewedValue.size() == skewedInfo.getSkewedColNames().size()) - && !skewedColValueLocationMaps.containsKey(skewedValue)) { - skewedColValueLocationMaps.put(skewedValue, lbdPath.toString()); - } -} - - /** - * Construct location map from path - * - * @param newPartPath - * @param skewedInfo - * @return - * @throws IOException - * @throws FileNotFoundException - */ - private Map, String> constructListBucketingLocationMap(Path newPartPath, - SkewedInfo skewedInfo) throws IOException, FileNotFoundException { - Map, String> skewedColValueLocationMaps = new HashMap, String>(); - FileSystem fSys = newPartPath.getFileSystem(conf); - walkDirTree(fSys.getFileStatus(newPartPath), fSys, skewedColValueLocationMaps, newPartPath, - skewedInfo); - return skewedColValueLocationMaps; - } - - /** - * Get the valid partitions from the path - * @param numDP number of dynamic partitions + * Given a source directory name of the load path, load all dynamically generated partitions + * into the specified table and return a list of strings that represent the dynamic partition + * paths. * @param loadPath * @return Set of valid partitions * @throws HiveException @@ -1924,20 +1832,6 @@ public void loadTable(Path loadPath, String tableName, boolean replace, boolean //column stats will be inaccurate StatsSetupConst.clearColumnStatsState(tbl.getParameters()); - try { - if (isSkewedStoreAsSubdir) { - SkewedInfo skewedInfo = tbl.getSkewedInfo(); - // Construct list bucketing location mappings from sub-directory name. - Map, String> skewedColValueLocationMaps = constructListBucketingLocationMap( - tbl.getPath(), skewedInfo); - // Add list bucketing location mappings. - skewedInfo.setSkewedColValueLocationMaps(skewedColValueLocationMaps); - } - } catch (IOException e) { - LOG.error(StringUtils.stringifyException(e)); - throw new HiveException(e); - } - EnvironmentContext environmentContext = null; if (hasFollowingStatsTask) { environmentContext = new EnvironmentContext(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/formatting/MetaDataFormatUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/formatting/MetaDataFormatUtils.java index ba4f6a7..e62e2b5 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/formatting/MetaDataFormatUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/formatting/MetaDataFormatUtils.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.metadata.formatting; import org.apache.commons.lang.StringEscapeUtils; +import org.apache.hadoop.hive.common.FileUtils; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; @@ -406,6 +407,13 @@ private static void getStorageDescriptorInfo(StringBuilder tableInfo, Map, String> skewedColMap = new TreeMap<>(new VectorComparator()); skewedColMap.putAll(storageDesc.getSkewedInfo().getSkewedColValueLocationMaps()); + // Add the skewed location mappings which are not set explicitly so HMS doesn't have them + for(List skewedColValue : skewedColValues) { + if (!skewedColMap.containsKey(skewedColValue)) { + skewedColMap.put(skewedColValue, storageDesc.getLocation() + "/" + + FileUtils.makeListBucketingDirName(skewedColNames, skewedColValue)); + } + } if ((skewedColMap!=null) && (skewedColMap.size() > 0)) { formatOutput("Skewed Value to Path:", skewedColMap.toString(), tableInfo); diff --git a/ql/src/test/queries/clientpositive/create_alter_list_bucketing_table1.q b/ql/src/test/queries/clientpositive/create_alter_list_bucketing_table1.q index bf89e8f..ea6a6ca 100644 --- a/ql/src/test/queries/clientpositive/create_alter_list_bucketing_table1.q +++ b/ql/src/test/queries/clientpositive/create_alter_list_bucketing_table1.q @@ -29,7 +29,14 @@ alter table stored_as_dirs_single SKEWED BY (key) ON ('1','5','6') stored as DIRECTORIES; describe formatted stored_as_dirs_single; --- 7. create table like +-- 7. alter skew location +insert into stored_as_dirs_single values('1', 'value1'), ('3', 'value3'); +describe formatted stored_as_dirs_single; +alter table stored_as_dirs_single set skewed location('1'='file:${system:test.tmp.dir}/stored_as_dirs_single/key=new1'); +insert into stored_as_dirs_single values('1', 'value1'), ('3', 'value3'); +describe formatted stored_as_dirs_single; + +-- 8. create table like create table stored_as_dirs_single_like like stored_as_dirs_single; describe formatted stored_as_dirs_single_like; diff --git a/ql/src/test/results/clientpositive/create_alter_list_bucketing_table1.q.out b/ql/src/test/results/clientpositive/create_alter_list_bucketing_table1.q.out index 216d3be..f628d1d 100644 --- a/ql/src/test/results/clientpositive/create_alter_list_bucketing_table1.q.out +++ b/ql/src/test/results/clientpositive/create_alter_list_bucketing_table1.q.out @@ -53,6 +53,8 @@ Sort Columns: [] Stored As SubDirectories: Yes Skewed Columns: [col1, col2] Skewed Values: [[s1, 1], [s13, 13], [s3, 3], [s78, 78]] +#### A masked pattern was here #### +Skewed Value to Truncated Path: {[s1, 1]=/stored_as_dirs_multiple/col1=s1/col2=1, [s13, 13]=/stored_as_dirs_multiple/col1=s13/col2=13, [s3, 3]=/stored_as_dirs_multiple/col1=s3/col2=3, [s78, 78]=/stored_as_dirs_multiple/col1=s78/col2=78} Storage Desc Params: serialization.format 1 PREHOOK: query: -- 2. turn off stored as directories but table is still a skewed table @@ -101,6 +103,8 @@ Bucket Columns: [] Sort Columns: [] Skewed Columns: [col1, col2] Skewed Values: [[s1, 1], [s13, 13], [s3, 3], [s78, 78]] +#### A masked pattern was here #### +Skewed Value to Truncated Path: {[s1, 1]=/stored_as_dirs_multiple/col1=s1/col2=1, [s13, 13]=/stored_as_dirs_multiple/col1=s13/col2=13, [s3, 3]=/stored_as_dirs_multiple/col1=s3/col2=3, [s78, 78]=/stored_as_dirs_multiple/col1=s78/col2=78} Storage Desc Params: serialization.format 1 PREHOOK: query: -- 3. turn off skewed @@ -205,6 +209,8 @@ Sort Columns: [] Stored As SubDirectories: Yes Skewed Columns: [key] Skewed Values: [[1], [5], [6]] +#### A masked pattern was here #### +Skewed Value to Truncated Path: {[1]=/stored_as_dirs_single/key=1, [5]=/stored_as_dirs_single/key=5, [6]=/stored_as_dirs_single/key=6} Storage Desc Params: serialization.format 1 PREHOOK: query: -- 5. turn off skewed should turn off stored as directories too @@ -300,14 +306,126 @@ Sort Columns: [] Stored As SubDirectories: Yes Skewed Columns: [key] Skewed Values: [[1], [5], [6]] +#### A masked pattern was here #### +Skewed Value to Truncated Path: {[1]=/stored_as_dirs_single/key=1, [5]=/stored_as_dirs_single/key=5, [6]=/stored_as_dirs_single/key=6} +Storage Desc Params: + serialization.format 1 +PREHOOK: query: -- 7. alter skew location +insert into stored_as_dirs_single values('1', 'value1'), ('3', 'value3') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@stored_as_dirs_single +POSTHOOK: query: -- 7. alter skew location +insert into stored_as_dirs_single values('1', 'value1'), ('3', 'value3') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@stored_as_dirs_single +POSTHOOK: Lineage: stored_as_dirs_single.key SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: stored_as_dirs_single.value SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: describe formatted stored_as_dirs_single +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@stored_as_dirs_single +POSTHOOK: query: describe formatted stored_as_dirs_single +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@stored_as_dirs_single +# col_name data_type comment + +key string +value string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + numFiles 2 + numRows 0 + rawDataSize 0 + totalSize 18 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Stored As SubDirectories: Yes +Skewed Columns: [key] +Skewed Values: [[1], [5], [6]] +#### A masked pattern was here #### +Skewed Value to Truncated Path: {[1]=/stored_as_dirs_single/key=1, [5]=/stored_as_dirs_single/key=5, [6]=/stored_as_dirs_single/key=6} +Storage Desc Params: + serialization.format 1 +#### A masked pattern was here #### +PREHOOK: type: ALTERTBLPART_SKEWED_LOCATION +PREHOOK: Input: default@stored_as_dirs_single +PREHOOK: Output: default@stored_as_dirs_single +#### A masked pattern was here #### +POSTHOOK: type: ALTERTBLPART_SKEWED_LOCATION +POSTHOOK: Input: default@stored_as_dirs_single +POSTHOOK: Output: default@stored_as_dirs_single +#### A masked pattern was here #### +PREHOOK: query: insert into stored_as_dirs_single values('1', 'value1'), ('3', 'value3') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@stored_as_dirs_single +POSTHOOK: query: insert into stored_as_dirs_single values('1', 'value1'), ('3', 'value3') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__2 +POSTHOOK: Output: default@stored_as_dirs_single +POSTHOOK: Lineage: stored_as_dirs_single.key SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: stored_as_dirs_single.value SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: describe formatted stored_as_dirs_single +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@stored_as_dirs_single +POSTHOOK: query: describe formatted stored_as_dirs_single +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@stored_as_dirs_single +# col_name data_type comment + +key string +value string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + numFiles 3 + numRows 0 + rawDataSize 0 + totalSize 27 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Stored As SubDirectories: Yes +Skewed Columns: [key] +Skewed Values: [[1], [5], [6]] +#### A masked pattern was here #### Storage Desc Params: serialization.format 1 -PREHOOK: query: -- 7. create table like +PREHOOK: query: -- 8. create table like create table stored_as_dirs_single_like like stored_as_dirs_single PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@stored_as_dirs_single_like -POSTHOOK: query: -- 7. create table like +POSTHOOK: query: -- 8. create table like create table stored_as_dirs_single_like like stored_as_dirs_single POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default @@ -348,6 +466,7 @@ Sort Columns: [] Stored As SubDirectories: Yes Skewed Columns: [key] Skewed Values: [[1], [5], [6]] +#### A masked pattern was here #### Storage Desc Params: serialization.format 1 PREHOOK: query: -- cleanup