commit e6cbbd121a097acddd07c9aecd37c838874cc9e0 Author: Vihang Karajgaonkar Date: Thu Mar 30 13:02:01 2017 -0700 HIVE-16299 : MSCK REPAIR TABLE should enforce partition key order when adding unknown partitions diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java index 84c090239df39d7ea987d561bf4ab1e852f75624..fc0554f4f2483ff01260a22b2d073e56d33ab99e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java @@ -35,6 +35,7 @@ import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; +import com.google.common.collect.Lists; import com.google.common.collect.Sets; import org.apache.hadoop.hive.common.StringInternUtils; import org.apache.hadoop.hive.metastore.api.FieldSchema; @@ -338,19 +339,14 @@ void findUnknownPartitions(Table table, Set partPaths, // remove the partition paths we know about allPartDirs.removeAll(partPaths); - Set partColNames = Sets.newHashSet(); - for(FieldSchema fSchema : table.getPartCols()) { - partColNames.add(fSchema.getName()); - } - // we should now only have the unexpected folders left for (Path partPath : allPartDirs) { FileSystem fs = partPath.getFileSystem(conf); String partitionName = getPartitionName(fs.makeQualified(tablePath), - partPath, partColNames); - LOG.debug("PartitionName: " + partitionName); + partPath, table.getPartColNames()); if (partitionName != null) { + LOG.debug("PartitionName: " + partitionName); PartitionResult pr = new PartitionResult(); pr.setPartitionName(partitionName); pr.setTableName(table.getTableName()); @@ -369,15 +365,16 @@ void findUnknownPartitions(Table table, Set partPaths, * @param partitionPath * Path of the partition. * @param partCols - * Set of partition columns from table definition + * List of partition columns from table definition * @return Partition name, for example partitiondate=2008-01-01 */ static String getPartitionName(Path tablePath, Path partitionPath, - Set partCols) { + List partCols) { String result = null; Path currPath = partitionPath; LOG.debug("tablePath:" + tablePath + ", partCols: " + partCols); + int partColIndex = partCols.size() - 1; while (currPath != null && !tablePath.equals(currPath)) { // format: partition=p_val // Add only when table partition colName matches @@ -385,19 +382,29 @@ static String getPartitionName(Path tablePath, Path partitionPath, if (parts != null && parts.length > 0) { if (parts.length != 2) { LOG.warn(currPath.getName() + " is not a valid partition name"); - return result; + return null; } String partitionName = parts[0]; - if (partCols.contains(partitionName)) { + if (partColIndex < 0) { + LOG.warn(currPath + + " has more than valid number of partition cols. Expected partitions columns are " + + partCols.toArray(new String[partCols.size()]).toString()); + return null; + } + if (partCols.get(partColIndex).equals(partitionName)) { if (result == null) { result = currPath.getName(); } else { result = currPath.getName() + Path.SEPARATOR + result; } + } else { + LOG.warn(currPath + " has invalid partition sub-directory structure"); + return null; } } currPath = currPath.getParent(); + partColIndex--; LOG.debug("currPath=" + currPath); } return result; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java index 21bc8eed06c8fcd99d1e4eee3b5e5f8a4fb812be..c6a64a24141725e67efc7af97f5a8407c60d78d4 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java @@ -431,6 +431,26 @@ public void testErrorForMissingPartitionColumn() throws AlreadyExistsException, assertTrue("Expected HiveException", exception!=null && exception instanceof HiveException); } + /** + * Tests if there exists a unknown partition directory on the FS with in-valid order of partition + * keys than what is specified in table specification. + * + * @throws AlreadyExistsException + * @throws HiveException + * @throws IOException + */ + public void testInvalidOrderForPartitionKeysOnFS() + throws AlreadyExistsException, HiveException, IOException { + Table testTable = createPartitionedTestTable(dbName, tableName, 2, 0); + // add 10 partitions on the filesystem + createInvalidPartitionDirsOnFS(testTable, 10); + CheckResult result = new CheckResult(); + checker.checkMetastore(dbName, tableName, null, result); + assertEquals(Collections. emptySet(), result.getTablesNotInMs()); + assertEquals(Collections. emptySet(), result.getTablesNotOnFs()); + assertEquals(Collections. emptySet(), result.getPartitionsNotOnFs()); + assertEquals(0, result.getPartitionsNotInMs().size()); + } /* * Test if single-threaded implementation checker throws HiveException when the there is a dummy * directory present in the nested level @@ -530,29 +550,59 @@ private Table createPartitionedTestTable(String dbName, String tableName, int nu * @param table - Table which provides the base locations and partition specs for creating the * sub-directories * @param numPartitions - Number of partitions to be created + * @param reverseOrder - If set to true creates the partition sub-directories in the reverse order + * of specified by partition keys defined for the table * @throws IOException */ - private void createPartitionsDirectoriesOnFS(Table table, int numPartitions) throws IOException { + private void createPartitionsDirectoriesOnFS(Table table, int numPartitions, boolean reverseOrder) throws IOException { String path = table.getDataLocation().toString(); fs = table.getPath().getFileSystem(hive.getConf()); int numPartKeys = table.getPartitionKeys().size(); for (int i = 0; i < numPartitions; i++) { StringBuilder partPath = new StringBuilder(path); partPath.append(Path.SEPARATOR); - for (int j = 0; j < numPartKeys; j++) { - FieldSchema field = table.getPartitionKeys().get(j); - partPath.append(field.getName()); - partPath.append('='); - partPath.append("val_"); - partPath.append(i); - if (j < (numPartKeys - 1)) { - partPath.append(Path.SEPARATOR); + if (!reverseOrder) { + for (int j = 0; j < numPartKeys; j++) { + FieldSchema field = table.getPartitionKeys().get(j); + partPath.append(field.getName()); + partPath.append('='); + partPath.append("val_"); + partPath.append(i); + if (j < (numPartKeys - 1)) { + partPath.append(Path.SEPARATOR); + } + } + } else { + for (int j = numPartKeys - 1; j >= 0; j--) { + FieldSchema field = table.getPartitionKeys().get(j); + partPath.append(field.getName()); + partPath.append('='); + partPath.append("val_"); + partPath.append(i); + if (j > 0) { + partPath.append(Path.SEPARATOR); + } } } createDirectory(partPath.toString()); } } + private void createPartitionsDirectoriesOnFS(Table table, int numPartitions) throws IOException { + createPartitionsDirectoriesOnFS(table, numPartitions, false); + } + /** + * Creates a partition directory structure on file system but with a reverse order + * of sub-directories compared to the partition keys defined in the table. Eg. if the + * partition keys defined in table are (a int, b int, c int) this method will create + * an invalid directory c=val_1/b=val_1/a=val_1 + * @param table + * @throws IOException + */ + private void createInvalidPartitionDirsOnFS(Table table, int numPartitions) throws IOException { + createPartitionsDirectoriesOnFS(table, numPartitions, true); + } + private void createFile(String partPath, String filename) throws IOException { Path part = new Path(partPath); fs.mkdirs(part);