Speed up split-path matching in HiveInputFormat.pushProjectionsAndFilters.

When more than one path is registered in pathToAliases, the split path and all
of its ancestors (both with and without scheme/authority) are collected once
into a HashSet through the new FileUtils.populateParentPaths helper, and every
pathToAliases key is then matched with a single set lookup. With exactly one
registered path the previous FileUtils.isPathWithinSubtree comparison is kept.
The in-code comment gives the motivation: comparing paths repeatedly creates
many objects and GC pressure for tables with a large number of partitions.

TestFileUtils gains a test asserting that the set lookup and
isPathWithinSubtree agree for keys with and without a "file://" scheme.

NOTE(review): the break added to AbstractMapOperator.getNominalPath leaves the
loop right after the first assignment to nominal; the "Ambiguous input path"
check directly above it appears to become unreachable - confirm this is
intended.

NOTE(review): line breaks, indentation and generic type arguments in this
patch were reconstructed from a whitespace-collapsed copy. Hunk line counts
were re-checked, but exact whitespace and the String element type of the alias
lists should be verified against the target tree (git apply
--ignore-whitespace may be needed).

diff --git common/src/java/org/apache/hadoop/hive/common/FileUtils.java common/src/java/org/apache/hadoop/hive/common/FileUtils.java
index a8ed8af..ab3439c 100644
--- common/src/java/org/apache/hadoop/hive/common/FileUtils.java
+++ common/src/java/org/apache/hadoop/hive/common/FileUtils.java
@@ -916,6 +916,16 @@ private static boolean isPathWithinSubtree(Path path, Path subtree, int subtreeD
     return false;
   }
 
+  public static void populateParentPaths(Set<Path> parents, Path path) {
+    if (parents == null) {
+      return;
+    }
+    while(path != null) {
+      parents.add(path);
+      path = path.getParent();
+    }
+  }
+
   /**
    * Get the URI of the path. Assume to be local file system if no scheme.
    */
diff --git common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
index d82c531..5705028 100644
--- common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
+++ common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
@@ -26,6 +26,7 @@
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.Set;
 
 import org.apache.hadoop.fs.LocalFileSystem;
@@ -146,4 +147,54 @@ public void testRelativePathToAbsolutePath() throws IOException {
 
     assertEquals(unchangedPath.toString(), absolutePath.toString());
   }
+
+  @Test
+  public void testIsPathWithinSubtree() throws IOException {
+    Path splitPath = new Path("file:///user/hive/warehouse/src/data.txt");
+    Path splitPathWithNoSchema = Path.getPathWithoutSchemeAndAuthority(splitPath);
+
+    Set<Path> parents = new HashSet<>();
+    FileUtils.populateParentPaths(parents, splitPath);
+    FileUtils.populateParentPaths(parents, splitPathWithNoSchema);
+
+    Path key = new Path("/user/hive/warehouse/src");
+    verifyIsPathWithInSubTree(splitPath, key, false);
+    verifyIsPathWithInSubTree(splitPathWithNoSchema, key, true);
+    verifyIfParentsContainPath(key, parents, true);
+
+    key = new Path("/user/hive/warehouse/src_2");
+    verifyIsPathWithInSubTree(splitPath, key, false);
+    verifyIsPathWithInSubTree(splitPathWithNoSchema, key, false);
+    verifyIfParentsContainPath(key, parents, false);
+
+    key = new Path("/user/hive/warehouse/src/data.txt");
+    verifyIsPathWithInSubTree(splitPath, key, false);
+    verifyIsPathWithInSubTree(splitPathWithNoSchema, key, true);
+    verifyIfParentsContainPath(key, parents, true);
+
+    key = new Path("file:///user/hive/warehouse/src");
+    verifyIsPathWithInSubTree(splitPath, key, true);
+    verifyIsPathWithInSubTree(splitPathWithNoSchema, key, false);
+    verifyIfParentsContainPath(key, parents, true);
+
+    key = new Path("file:///user/hive/warehouse/src_2");
+    verifyIsPathWithInSubTree(splitPath, key, false);
+    verifyIsPathWithInSubTree(splitPathWithNoSchema, key, false);
+    verifyIfParentsContainPath(key, parents, false);
+
+    key = new Path("file:///user/hive/warehouse/src/data.txt");
+    verifyIsPathWithInSubTree(splitPath, key, true);
+    verifyIsPathWithInSubTree(splitPathWithNoSchema, key, false);
+    verifyIfParentsContainPath(key, parents, true);
+  }
+
+  private void verifyIsPathWithInSubTree(Path splitPath, Path key, boolean expected) {
+    boolean result = FileUtils.isPathWithinSubtree(splitPath, key);
+    assertEquals("splitPath=" + splitPath + ", key=" + key, expected, result);
+  }
+
+  private void verifyIfParentsContainPath(Path key, Set<Path> parents, boolean expected) {
+    boolean result = parents.contains(key);
+    assertEquals("key=" + key, expected, result);
+  }
 }
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java
index 6de2c18..2e0ef74 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java
@@ -114,6 +114,7 @@ protected String getNominalPath(Path fpath) {
         throw new IllegalStateException("Ambiguous input path " + fpath);
       }
       nominal = onefile;
+      break;
     }
     if (nominal == null) {
       throw new IllegalStateException("Invalid input path " + fpath);
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
index 69956ec..94fcd60 100755
--- ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
@@ -23,9 +23,11 @@
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.Map.Entry;
 
@@ -595,6 +597,8 @@ protected void pushProjectionsAndFilters(JobConf jobConf, Class inputFormatClass
     Iterator<Entry<Path, ArrayList<String>>> iterator = this.mrwork
         .getPathToAliases().entrySet().iterator();
 
+    Set<Path> splitParentPaths = null;
+    int pathsSize = this.mrwork.getPathToAliases().entrySet().size();
     while (iterator.hasNext()) {
       Entry<Path, ArrayList<String>> entry = iterator.next();
       Path key = entry.getKey();
@@ -610,7 +614,20 @@ protected void pushProjectionsAndFilters(JobConf jobConf, Class inputFormatClass
         // subdirectories. (Unlike non-native tables, prefix mixups don't seem
         // to be a potential problem here since we are always dealing with the
         // path to something deeper than the table location.)
-        match = FileUtils.isPathWithinSubtree(splitPath, key) || FileUtils.isPathWithinSubtree(splitPathWithNoSchema, key);
+        if (pathsSize > 1) {
+          // Comparing paths multiple times creates lots of objects &
+          // creates GC pressure for tables having large number of partitions.
+          // In such cases, use pre-computed paths for comparison
+          if (splitParentPaths == null) {
+            splitParentPaths = new HashSet<>();
+            FileUtils.populateParentPaths(splitParentPaths, splitPath);
+            FileUtils.populateParentPaths(splitParentPaths, splitPathWithNoSchema);
+          }
+          match = splitParentPaths.contains(key);
+        } else {
+          match = FileUtils.isPathWithinSubtree(splitPath, key)
+              || FileUtils.isPathWithinSubtree(splitPathWithNoSchema, key);
+        }
       }
       if (match) {
         ArrayList<String> list = entry.getValue();