commit 77e6e7b16494a72751cf96356ca8f3dc31acbb3b Author: Misha Dmitriev Date: Tue Feb 14 13:40:36 2017 -0800 Intern strings in various critical places to reduce memory consumption. Also, where maps in several places are created from other maps, use the original map's size for the new map. This is to avoid the situation when a map with default capacity (typically 16) is created to hold just 2-3 entries, and the rest of the internal 16-entry array is wasted. diff --git a/common/src/java/org/apache/hadoop/hive/common/StringInternUtils.java b/common/src/java/org/apache/hadoop/hive/common/StringInternUtils.java new file mode 100644 index 0000000000000000000000000000000000000000..a0d50741ee793137fad9f9d2e7a159b96beab076 --- /dev/null +++ b/common/src/java/org/apache/hadoop/hive/common/StringInternUtils.java @@ -0,0 +1,131 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.common; + +import org.apache.hadoop.fs.Path; + +import java.lang.reflect.Field; +import java.net.URI; +import java.util.List; +import java.util.ListIterator; + +/** + * Collection of utilities for string interning, common across Hive. + * We use the standard String.intern() call, that performs very well + * (no problems with PermGen overflowing, etc.) starting from JDK 7. + */ +public class StringInternUtils { + + // When a URI instance is initialized, it creates a bunch of private String + // fields, never bothering about their possible duplication. It would be + // best if we could tell URI constructor to intern these strings right away. + // Without this option, we can only use reflection to "fix" strings in these + // fields after a URI has been created. + private static Class uriClass = URI.class; + private static Field stringField, schemeField, authorityField, hostField, pathField, + fragmentField, schemeSpecificPartField; + + static { + try { + stringField = uriClass.getDeclaredField("string"); + schemeField = uriClass.getDeclaredField("scheme"); + authorityField = uriClass.getDeclaredField("authority"); + hostField = uriClass.getDeclaredField("host"); + pathField = uriClass.getDeclaredField("path"); + fragmentField = uriClass.getDeclaredField("fragment"); + schemeSpecificPartField = uriClass.getDeclaredField("schemeSpecificPart"); + } catch (NoSuchFieldException e) { + throw new RuntimeException(e); + } + + // Note that the calls below will throw an exception if a Java SecurityManager + // is installed and configured to forbid invoking setAccessible(). In practice + // this is not a problem in Hive. 
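+ // setAccessible(true) is invoked once per cached Field here, so the reflective
+ // get()/set() calls in internStringsInUri() below skip Java access checks on every URI.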
+ stringField.setAccessible(true); + schemeField.setAccessible(true); + authorityField.setAccessible(true); + hostField.setAccessible(true); + pathField.setAccessible(true); + fragmentField.setAccessible(true); + schemeSpecificPartField.setAccessible(true); + } + + public static URI internStringsInUri(URI uri) { + if (uri == null) return null; + try { + String string = (String) stringField.get(uri); + if (string != null) stringField.set(uri, string.intern()); + String scheme = (String) schemeField.get(uri); + if (scheme != null) schemeField.set(uri, scheme.intern()); + String authority = (String) authorityField.get(uri); + if (authority != null) authorityField.set(uri, authority.intern()); + String host = (String) hostField.get(uri); + if (host != null) hostField.set(uri, host.intern()); + String path = (String) pathField.get(uri); + if (path != null) pathField.set(uri, path.intern()); + String fragment = (String) fragmentField.get(uri); + if (fragment != null) fragmentField.set(uri, fragment.intern()); + String schemeSpecificPart = (String) schemeSpecificPartField.get(uri); + if (schemeSpecificPart != null) schemeSpecificPartField.set(uri, schemeSpecificPart.intern()); + } catch (Exception e) { + throw new RuntimeException(e); + } + return uri; + } + + public static Path internUriStringsInPath(Path path) { + if (path != null) internStringsInUri(path.toUri()); + return path; + } + + public static Path[] internUriStringsInPathArray(Path[] paths) { + if (paths != null) { + for (Path path : paths) { + internUriStringsInPath(path); + } + } + return paths; + } + + /** + * This method interns all the strings in the given list in place. That is, + * it iterates over the list, replaces each element with the interned copy + * and eventually returns the same list. 
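+ * A null list is returned unchanged; the list must support ListIterator.set(),
+ * and its elements are expected to be non-null.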
+ */ + public static List internStringsInList(List list) { + if (list != null) { + ListIterator it = list.listIterator(); + while (it.hasNext()) { + it.set(it.next().intern()); + } + } + return list; + } + + /** Interns all the strings in the given array in place, returning the same array */ + public static String[] internStringsInArray(String[] strings) { + for (int i = 0; i < strings.length; i++) { + if (strings[i] != null) { + strings[i] = strings[i].intern(); + } + } + return strings; + } + + public static String internIfNotNull(String s) { + if (s != null) s = s.intern(); + return s; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index e81cbce3e333d44a4088c10491f399e92a505293..6d06dfaada49bdff233032845d3566dd7d96352a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -43,6 +43,7 @@ import org.apache.hadoop.hive.common.HiveStatsUtils; import org.apache.hadoop.hive.common.JavaUtils; import org.apache.hadoop.hive.common.StatsSetupConst; +import org.apache.hadoop.hive.common.StringInternUtils; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.metastore.MetaStoreUtils; @@ -3016,6 +3017,7 @@ public static double getHighestSamplePercentage (MapWork work) { continue; } + StringInternUtils.internUriStringsInPath(file); pathsProcessed.add(file); if (LOG.isDebugEnabled()) { @@ -3117,7 +3119,7 @@ private static Path createEmptyFile(Path hiveScratchDir, } recWriter.close(false); - return newPath; + return StringInternUtils.internUriStringsInPath(newPath); } @SuppressWarnings("rawtypes") @@ -3140,15 +3142,13 @@ private static Path createDummyFileForEmptyPartition(Path path, JobConf job, Map boolean oneRow = partDesc.getInputFileFormatClass() == OneNullRowInputFormat.class; - Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job, - props, oneRow); + Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job, props, oneRow); if (LOG.isInfoEnabled()) { LOG.info("Changed input file " + strPath + " to empty file " + newPath + " (" + oneRow + ")"); } // update the work - String strNewPath = newPath.toString(); work.addPathToAlias(newPath, work.getPathToAliases().get(path)); work.removePathToAlias(path); @@ -3175,6 +3175,7 @@ private static Path createDummyFileForEmptyTable(JobConf job, MapWork work, Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job, props, false); + StringInternUtils.internUriStringsInPath(newPath); if (LOG.isInfoEnabled()) { LOG.info("Changed input file for alias " + alias + " to " + newPath); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/hooks/Entity.java b/ql/src/java/org/apache/hadoop/hive/ql/hooks/Entity.java index 08420664d59f28f75872c25c9f8ee42577b23451..131c1e1bb5d8b3c399ae3c0cab47651e799ec362 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/hooks/Entity.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/hooks/Entity.java @@ -327,6 +327,10 @@ public String toString() { } private String computeName() { + return doComputeName().intern(); + } + + private String doComputeName() { switch (typ) { case DATABASE: return "database:" + database.getName(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java index e91064b9c75e8adb2b36f21ff19ec0c1539b03b9..7a113bf8e5c4dd8c2c486741a5ebc7b8940e746b 
100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java @@ -35,6 +35,7 @@ import java.util.concurrent.Future; import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.hive.common.StringInternUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -340,7 +341,7 @@ public int hashCode() { // combine splits only from same tables and same partitions. Do not combine splits from multiple // tables or multiple partitions. - Path[] paths = combine.getInputPathsShim(job); + Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job)); List inpDirs = new ArrayList(); List inpFiles = new ArrayList(); @@ -660,6 +661,7 @@ private void processPaths(JobConf job, CombineFileInputFormatShim combine, Map> result = new HashMap<>(); for (Map.Entry > entry : pathToAliases.entrySet()) { Path newKey = Path.getPathWithoutSchemeAndAuthority(entry.getKey()); + StringInternUtils.internUriStringsInPath(newKey); result.put(newKey, entry.getValue()); } return result; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java index 51530ac16c92cc75d501bfcb573557754ba0c964..1cb9557a2fdc89591ff1f2458a838cda8a6f6f04 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java @@ -31,6 +31,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.Map.Entry; +import org.apache.hadoop.hive.common.StringInternUtils; import org.apache.hadoop.hive.ql.exec.SerializationUtilities; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -57,7 +58,6 @@ import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc; -import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc.VectorDeserializeType; import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc.VectorMapOperatorReadType; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; @@ -454,6 +454,7 @@ private void addSplitsForGroup(List dirs, TableScanOperator tableScan, Job } } } + StringInternUtils.internUriStringsInPathArray(dirs); return dirs; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java index 55b3b551a1dac92583b6e03b10beb8172ca93d45..b534e35689bb5a2959fa9458a52f41e5745a5cf2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java @@ -31,6 +31,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.FileUtils; +import org.apache.hadoop.hive.common.StringInternUtils; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.PartitionDesc; @@ -75,10 +76,11 @@ public void rework(HiveConf job, MapredWork work) throws IOException { // no check for the line? How to check? // if the line is invalid for any reason, the job will fail. 
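// Each path matched by the glob below is stripped of its scheme/authority, has its
// URI strings interned, and is then registered under the same partition descriptor and aliases.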
FileStatus[] matches = fileSystem.globStatus(new Path(line)); - for(FileStatus fileStatus :matches) { + for (FileStatus fileStatus : matches) { Path schemaLessPath = Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath()); - toAddPathToPart.put(schemaLessPath, partDesc); - pathToAliases.put(schemaLessPath, aliases); + StringInternUtils.internUriStringsInPath(schemaLessPath); + toAddPathToPart.put(schemaLessPath, partDesc); + pathToAliases.put(schemaLessPath, aliases); } } } finally { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/lockmgr/HiveLockObject.java b/ql/src/java/org/apache/hadoop/hive/ql/lockmgr/HiveLockObject.java index 82dc89803be9cf9e0018720eeceb90ff450bfdc8..fff03df4736bb8df03656f6cef3028063bd4fb42 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/lockmgr/HiveLockObject.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/lockmgr/HiveLockObject.java @@ -22,6 +22,7 @@ import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.builder.HashCodeBuilder; +import org.apache.hadoop.hive.common.StringInternUtils; import org.apache.hadoop.hive.metastore.MetaStoreUtils; import org.apache.hadoop.hive.ql.metadata.DummyPartition; import org.apache.hadoop.hive.ql.metadata.Hive; @@ -53,9 +54,10 @@ public HiveLockObjectData(String queryId, String lockMode, String queryStr) { this.queryId = removeDelimiter(queryId); - this.lockTime = removeDelimiter(lockTime); + this.lockTime = StringInternUtils.internIfNotNull(removeDelimiter(lockTime)); this.lockMode = removeDelimiter(lockMode); - this.queryStr = removeDelimiter(queryStr == null ? null : queryStr.trim()); + this.queryStr = StringInternUtils.internIfNotNull( + removeDelimiter(queryStr == null ? null : queryStr.trim())); } /** @@ -71,9 +73,9 @@ public HiveLockObjectData(String data) { String[] elem = data.split(":"); queryId = elem[0]; - lockTime = elem[1]; + lockTime = StringInternUtils.internIfNotNull(elem[1]); lockMode = elem[2]; - queryStr = elem[3]; + queryStr = StringInternUtils.internIfNotNull(elem[3]); if (elem.length >= 5) { clientIp = elem[4]; } @@ -178,12 +180,12 @@ public HiveLockObject() { public HiveLockObject(String path, HiveLockObjectData lockData) { this.pathNames = new String[1]; - this.pathNames[0] = path; + this.pathNames[0] = StringInternUtils.internIfNotNull(path); this.data = lockData; } public HiveLockObject(String[] paths, HiveLockObjectData lockData) { - this.pathNames = paths; + this.pathNames = StringInternUtils.internStringsInArray(paths); this.data = lockData; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java index c0edde9e92314d86482b5c46178987e79fae57fe..bff1688a2bf8e8413b5b6a9ba8855bd3810f5daf 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java @@ -27,6 +27,7 @@ import java.util.Map; import java.util.Properties; +import org.apache.hadoop.hive.common.StringInternUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.fs.FileStatus; @@ -156,7 +157,7 @@ protected void initialize(Table table, org.apache.hadoop.hive.metastore.api.Partition tPartition) throws HiveException { this.table = table; - this.tPartition = tPartition; + setTPartition(tPartition); if (table.isView()) { return; @@ -458,6 +459,7 @@ public void setTable(Table table) { */ public void setTPartition( org.apache.hadoop.hive.metastore.api.Partition partition) { + 
StringInternUtils.internStringsInList(partition.getValues()); tPartition = partition; } @@ -522,7 +524,7 @@ public void setValues(Map partSpec) throw new HiveException( "partition spec is invalid. field.getName() does not exist in input."); } - pvals.add(val); + pvals.add(val.intern()); } tPartition.setValues(pvals); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java index c6ae6f290857cfd10f1023058ede99bf4a10f057..3e771ad8e831e7d4acb03595e82ec32e9e24573b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java @@ -865,7 +865,7 @@ public boolean isIndexTable() { List fsl = getPartCols(); List tpl = tp.getValues(); - LinkedHashMap spec = new LinkedHashMap(); + LinkedHashMap spec = new LinkedHashMap(fsl.size()); for (int i = 0; i < fsl.size(); i++) { FieldSchema fs = fsl.get(i); String value = tpl.get(i); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java index 24d16812515bdfa90b4be7a295c0388fcdfe95ef..0e67ea64c719063748c2881f25ccf3c4a8a69f7b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java @@ -38,6 +38,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.BlobStorageUtils; +import org.apache.hadoop.hive.common.StringInternUtils; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.metastore.Warehouse; @@ -249,9 +250,11 @@ private static void setUnionPlan(GenMRProcContext opProcCtx, TableDesc tt_desc = tt_descLst.get(pos); MapWork mWork = plan.getMapWork(); if (mWork.getPathToAliases().get(taskTmpDir) == null) { - mWork.removePathToAlias(new Path(taskTmpDir)); - mWork.addPathToAlias(new Path(taskTmpDir),taskTmpDir); - mWork.addPathToPartitionInfo(new Path(taskTmpDir), new PartitionDesc(tt_desc, null)); + taskTmpDir = taskTmpDir.intern(); + Path taskTmpDirPath = StringInternUtils.internUriStringsInPath(new Path(taskTmpDir)); + mWork.removePathToAlias(taskTmpDirPath); + mWork.addPathToAlias(taskTmpDirPath, taskTmpDir); + mWork.addPathToPartitionInfo(taskTmpDirPath, new PartitionDesc(tt_desc, null)); mWork.getAliasToWork().put(taskTmpDir, topOperators.get(pos)); } } @@ -771,7 +774,7 @@ public static void setTaskPlan(Path path, String alias, if (topOp instanceof TableScanOperator) { try { - Utilities.addSchemaEvolutionToTableScanOperator( + Utilities.addSchemaEvolutionToTableScanOperator( (StructObjectInspector) tt_desc.getDeserializer().getObjectInspector(), (TableScanOperator) topOp); } catch (Exception e) { @@ -780,7 +783,7 @@ public static void setTaskPlan(Path path, String alias, } if (!local) { - plan.addPathToAlias(path,alias); + plan.addPathToAlias(path, alias); plan.addPathToPartitionInfo(path, new PartitionDesc(tt_desc, null)); plan.getAliasToWork().put(alias, topOp); } else { @@ -1543,16 +1546,17 @@ private static MapWork createMRWorkForMergingFiles (HiveConf conf, TableScanOperator topOp, FileSinkDesc fsDesc) { ArrayList aliases = new ArrayList(); - Path inputDir = fsDesc.getFinalDirName(); + Path inputDir = StringInternUtils.internUriStringsInPath(fsDesc.getFinalDirName()); + String inputDirStr = inputDir.toString().intern(); TableDesc tblDesc = fsDesc.getTableInfo(); - 
aliases.add(inputDir.toString()); // dummy alias: just use the input path + aliases.add(inputDirStr); // dummy alias: just use the input path // constructing the default MapredWork MapredWork cMrPlan = GenMapRedUtils.getMapRedWorkFromConf(conf); MapWork cplan = cMrPlan.getMapWork(); cplan.addPathToAlias(inputDir, aliases); cplan.addPathToPartitionInfo(inputDir, new PartitionDesc(tblDesc, null)); - cplan.getAliasToWork().put(inputDir.toString(), topOp); + cplan.getAliasToWork().put(inputDirStr, topOp); cplan.setMapperCannotSpanPartns(true); return cplan; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/GenMRSkewJoinProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/GenMRSkewJoinProcessor.java index ede4fcbe342052ad86dadebcc49da2c0f515ea98..93202c36bd822e57ea82b3f9683fc5316f3bb57e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/GenMRSkewJoinProcessor.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/GenMRSkewJoinProcessor.java @@ -26,6 +26,7 @@ import java.util.Map; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.StringInternUtils; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.ColumnInfo; import org.apache.hadoop.hive.ql.exec.ConditionalTask; @@ -261,7 +262,7 @@ public static void processSkewJoin(JoinOperator joinOp, Operator tblScan_op = parentOps[i]; ArrayList aliases = new ArrayList(); - String alias = src.toString(); + String alias = src.toString().intern(); aliases.add(alias); Path bigKeyDirPath = bigKeysDirMap.get(src); newPlan.addPathToAlias(bigKeyDirPath, aliases); @@ -389,18 +390,21 @@ public static boolean skewJoinEnabled(HiveConf conf, JoinOperator joinOp) { private static String RESULTS = "results"; static Path getBigKeysDir(Path baseDir, Byte srcTbl) { - return new Path(baseDir, skewJoinPrefix + UNDERLINE + BIGKEYS + UNDERLINE + srcTbl); + return StringInternUtils.internUriStringsInPath( + new Path(baseDir, skewJoinPrefix + UNDERLINE + BIGKEYS + UNDERLINE + srcTbl)); } static Path getBigKeysSkewJoinResultDir(Path baseDir, Byte srcTbl) { - return new Path(baseDir, skewJoinPrefix + UNDERLINE + BIGKEYS - + UNDERLINE + RESULTS + UNDERLINE + srcTbl); + return StringInternUtils.internUriStringsInPath( + new Path(baseDir, skewJoinPrefix + UNDERLINE + BIGKEYS + + UNDERLINE + RESULTS + UNDERLINE + srcTbl)); } static Path getSmallKeysDir(Path baseDir, Byte srcTblBigTbl, Byte srcTblSmallTbl) { - return new Path(baseDir, skewJoinPrefix + UNDERLINE + SMALLKEYS - + UNDERLINE + srcTblBigTbl + UNDERLINE + srcTblSmallTbl); + return StringInternUtils.internUriStringsInPath( + new Path(baseDir, skewJoinPrefix + UNDERLINE + SMALLKEYS + + UNDERLINE + srcTblBigTbl + UNDERLINE + srcTblSmallTbl)); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java index 0882ae2c6205b1636cbc92e76ef66bb70faadc76..fb7e18e73f076a4f0bddef3faa6d7db67a44a0b4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java @@ -18,10 +18,9 @@ package org.apache.hadoop.hive.ql.optimizer.physical; -import org.apache.hadoop.mapred.InputFormat; - import java.io.IOException; +import org.apache.hadoop.hive.common.StringInternUtils; import org.apache.hadoop.hive.ql.exec.Utilities; import 
org.apache.hadoop.hive.ql.io.ZeroRowsInputFormat; @@ -35,7 +34,6 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.ServiceLoader; import java.util.Map.Entry; import java.util.Stack; @@ -125,8 +123,9 @@ private void processAlias(MapWork work, Path path, ArrayList aliasesAffe // Prefix partition with something to avoid it being a hidden file. Path fakePath = new Path(NullScanFileSystem.getBase() + newPartition.getTableName() + "/part" + encode(newPartition.getPartSpec())); + StringInternUtils.internUriStringsInPath(fakePath); work.addPathToPartitionInfo(fakePath, newPartition); - work.addPathToAlias(fakePath, new ArrayList(allowed)); + work.addPathToAlias(fakePath, new ArrayList<>(allowed)); aliasesAffected.removeAll(allowed); if (aliasesAffected.isEmpty()) { work.removePathToAlias(path); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java index 68b0ad9ea63f051f16fec3652d8525f7ab07eb3f..426656962aef2871f376fe587c9210f81c48d1c4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java @@ -317,10 +317,9 @@ private void generateActualTasks(HiveConf conf, List fullPartSpec = new LinkedHashMap( - dpCtx.getPartSpec()); + LinkedHashMap fullPartSpec = new LinkedHashMap<>(dpCtx.getPartSpec()); Warehouse.makeSpecFromName(fullPartSpec, status[i].getPath()); - PartitionDesc pDesc = new PartitionDesc(tblDesc, (LinkedHashMap) fullPartSpec); + PartitionDesc pDesc = new PartitionDesc(tblDesc, fullPartSpec); return pDesc; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java index d4bdd96eaf8d179bed43b8a8c3be0d338940154a..e76cf7370c6e339785f74e58bd86c462eb870e83 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.plan; +import org.apache.hadoop.hive.common.StringInternUtils; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Utilities; @@ -169,7 +170,10 @@ public MapWork(String name) { } public void setPathToAliases(final LinkedHashMap> pathToAliases) { - this.pathToAliases = pathToAliases; + this.pathToAliases = new LinkedHashMap<>(pathToAliases.size()); + for (Map.Entry> e : pathToAliases.entrySet()) { + this.pathToAliases.put(StringInternUtils.internUriStringsInPath(e.getKey()), e.getValue()); + } } public void addPathToAlias(Path path, ArrayList aliases){ @@ -179,10 +183,10 @@ public void addPathToAlias(Path path, ArrayList aliases){ public void addPathToAlias(Path path, String newAlias){ ArrayList aliases = pathToAliases.get(path); if (aliases == null) { - aliases=new ArrayList(); + aliases = new ArrayList<>(); pathToAliases.put(path, aliases); } - aliases.add(newAlias); + aliases.add(newAlias.intern()); } @@ -391,10 +395,11 @@ public void setNumMapTasks(Integer numMapTasks) { @SuppressWarnings("nls") public void addMapWork(Path path, String alias, Operator work, PartitionDesc pd) { + StringInternUtils.internUriStringsInPath(path); ArrayList curAliases = pathToAliases.get(path); if (curAliases == null) { assert (pathToPartitionInfo.get(path) == null); - curAliases = new ArrayList(); + curAliases = new ArrayList<>(); pathToAliases.put(path, curAliases); 
pathToPartitionInfo.put(path, pd); } else { @@ -425,6 +430,7 @@ public void setInputFormatSorted(boolean inputFormatSorted) { public void resolveDynamicPartitionStoredAsSubDirsMerge(HiveConf conf, Path path, TableDesc tblDesc, ArrayList aliases, PartitionDesc partDesc) { + StringInternUtils.internUriStringsInPath(path); pathToAliases.put(path, aliases); pathToPartitionInfo.put(path, partDesc); } @@ -491,9 +497,11 @@ public void replaceRoots(Map, Operator> replacementMap) { } public void mergeAliasedInput(String alias, Path pathDir, PartitionDesc partitionInfo) { + StringInternUtils.internUriStringsInPath(pathDir); + alias = alias.intern(); ArrayList aliases = pathToAliases.get(pathDir); if (aliases == null) { - aliases = new ArrayList(Arrays.asList(alias)); + aliases = new ArrayList<>(Arrays.asList(alias)); pathToAliases.put(pathDir, aliases); pathToPartitionInfo.put(pathDir, partitionInfo); } else { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/MsckDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/MsckDesc.java index b7a7e4b7a5f8941b080c7805d224d3885885f444..68a016402f6dd52e85d59c6625010478eb0c66fe 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/MsckDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/MsckDesc.java @@ -59,8 +59,8 @@ public MsckDesc(String tableName, List> partSpecs, super(); this.tableName = tableName; this.partSpecs = new ArrayList>(partSpecs.size()); - for (int i = 0; i < partSpecs.size(); i++) { - this.partSpecs.add(new LinkedHashMap(partSpecs.get(i))); + for (Map partSpec : partSpecs) { + this.partSpecs.add(new LinkedHashMap<>(partSpec)); } this.resFile = resFile.toString(); this.repairPartitions = repairPartitions; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java index 73981e826870139a42ad881103fdb0a2ef8433a2..731ffacbb83dbe77ea63325c1b453438ab1b4d41 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java @@ -73,7 +73,7 @@ private VectorPartitionDesc vectorPartitionDesc; public void setBaseFileName(String baseFileName) { - this.baseFileName = baseFileName; + this.baseFileName = baseFileName.intern(); } public PartitionDesc() { @@ -81,12 +81,12 @@ public PartitionDesc() { public PartitionDesc(final TableDesc table, final LinkedHashMap partSpec) { this.tableDesc = table; - this.partSpec = partSpec; + setPartSpec(partSpec); } public PartitionDesc(final Partition part) throws HiveException { PartitionDescConstructorHelper(part, getTableDesc(part.getTable()), true); - if(Utilities.isInputFileFormatSelfDescribing(this)) { + if (Utilities.isInputFileFormatSelfDescribing(this)) { // if IF is self describing no need to send column info per partition, since its not used anyway. Table tbl = part.getTable(); setProperties(MetaStoreUtils.getSchemaWithoutCols(part.getTPartition().getSd(), part.getTPartition().getSd(), @@ -107,7 +107,7 @@ public PartitionDesc(final Partition part) throws HiveException { public PartitionDesc(final Partition part,final TableDesc tblDesc, boolean usePartSchemaProperties) throws HiveException { - PartitionDescConstructorHelper(part,tblDesc, usePartSchemaProperties); + PartitionDescConstructorHelper(part, tblDesc, usePartSchemaProperties); //We use partition schema properties to set the partition descriptor properties // if usePartSchemaProperties is set to true. 
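// Otherwise the properties are derived from the table-level schema instead.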
if (usePartSchemaProperties) { @@ -121,7 +121,7 @@ public PartitionDesc(final Partition part,final TableDesc tblDesc, private void PartitionDescConstructorHelper(final Partition part,final TableDesc tblDesc, boolean setInputFileFormat) throws HiveException { this.tableDesc = tblDesc; - this.partSpec = part.getSpec(); + setPartSpec(part.getSpec()); if (setInputFileFormat) { setInputFileFormatClass(part.getInputFormatClass()); } else { @@ -145,10 +145,18 @@ public void setTableDesc(TableDesc tableDesc) { } public void setPartSpec(final LinkedHashMap partSpec) { + if (partSpec != null) { + for (String key : partSpec.keySet()) { + String value = partSpec.get(key); + if (value != null) { + partSpec.put(key, value.intern()); + } + } + } this.partSpec = partSpec; } - public Class getInputFileFormatClass() { + public Class getInputFileFormatClass() { if (inputFileFormatClass == null && tableDesc != null) { setInputFileFormatClass(tableDesc.getInputFileFormatClass()); } @@ -289,8 +297,7 @@ public PartitionDesc clone() { ret.tableDesc = (TableDesc) tableDesc.clone(); // The partition spec is not present if (partSpec != null) { - ret.partSpec = new java.util.LinkedHashMap(); - ret.partSpec.putAll(partSpec); + ret.partSpec = new LinkedHashMap<>(partSpec); } if (vectorPartitionDesc != null) { ret.vectorPartitionDesc = vectorPartitionDesc.clone(); @@ -379,7 +386,7 @@ public void deriveBaseFileName(Path path) { if (path == null) { return; } - baseFileName = path.getName(); + baseFileName = path.getName().intern(); } public void intern(Interner interner) {