commit 78e382dd75c0776754b0a2e6b041b769ca13cf5c Author: ArkoSharma Date: Fri Dec 11 23:06:21 2020 +0530 HIVE-24526: Get grouped locations of external table data using metatool. diff --git a/standalone-metastore/metastore-server/pom.xml b/standalone-metastore/metastore-server/pom.xml index d1a36315d7..1fb0896467 100644 --- a/standalone-metastore/metastore-server/pom.xml +++ b/standalone-metastore/metastore-server/pom.xml @@ -302,6 +302,12 @@ curator-test test + + org.apache.hive + hive-common + ${version} + compile + diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaTool.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaTool.java index 760d78df24..e7de8784a9 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaTool.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaTool.java @@ -50,6 +50,8 @@ public static void main(String[] args) { task = new MetaToolTaskExecuteJDOQLQuery(); } else if (cl.isUpdateLocation()) { task = new MetaToolTaskUpdateLocation(); + } else if (cl.isListExtTblLocs()) { + task = new MetaToolTaskListExtTblLocs(); } else { throw new IllegalArgumentException("No task was specified!"); } diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaToolCommandLine.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaToolCommandLine.java index 1223f0d623..2a20c375c3 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaToolCommandLine.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaToolCommandLine.java @@ -57,6 +57,16 @@ "serde-prop-key/table-prop-key that is specified and updates its value if found.") .create("updateLocation"); + @SuppressWarnings("static-access") + private static final Option LIST_EXT_TBL_LOCS = OptionBuilder + .withArgName("dbName> " + " > coverageList = new HashMap<>(); + private final HashMap inputLocations = new HashMap<>(); + + @Override + void execute() { + String[] loc = getCl().getListExtTblLocsParams(); + try{ + generateExternalTableInfo(loc[0], loc[1]); + } catch (IOException | TException e) { + LOG.error("Listing external table locations failed: ", e); + } + } + + private void generateExternalTableInfo(String dbPattern, String outputDir) throws TException, IOException { + Configuration conf = msConf != null ? msConf : MetastoreConf.newMetastoreConf(); + IMetaStoreClient hms = getHMS(conf); + List databases = hms.getDatabases(dbPattern); + System.out.println("Number of databases found for given pattern: " + databases.size()); + TreeSet locations = new TreeSet<>(); + for (String db : databases) { + List tables = hms.getAllTables(db); + for(String tblName : tables) { + Table t = hms.getTable(db, tblName); + if((t.getCatName() == null || t.getCatName().equalsIgnoreCase(MetaStoreUtils.getDefaultCatalog(conf))) && + TableType.EXTERNAL_TABLE.name().equalsIgnoreCase(t.getTableType())) { + String tblLocation = t.getSd().getLocation(); + inputLocations.put(tblLocation, new DataLocation(db, tblName, null)); + Path tblPath = new Path(tblLocation); + if (!isCovered(locations, tblPath)) { + locations.add(tblLocation); + } + //retreiving partition locations in batches + List partNames = hms.listPartitionNames(t.getDbName(), t.getTableName(), Short.MAX_VALUE); + int batchSize = PARTITION_BATCH_SIZE; + int numWholeBatches = partNames.size() / batchSize; + for (int i = 0; i < numWholeBatches; i++) { + List partitionList = hms.getPartitionsByNames(t.getDbName(), t.getTableName(), + partNames.subList(i * batchSize, (i + 1) * batchSize)); + for (Partition part : partitionList) { + if (part.getCatName() == null || part.getCatName().equalsIgnoreCase(MetaStoreUtils.getDefaultCatalog(conf))) { + String partLocation = part.getSd().getLocation(); + inputLocations.put(tblLocation, new DataLocation(db, tblName, null)); + Path partPath = new Path(partLocation); + boolean partitionLocOutsideTableLoc = !org.apache.hadoop.hive.common.FileUtils.isPathWithinSubtree( + partPath, tblPath); + if (partitionLocOutsideTableLoc && !isCovered(locations, partPath)) { + locations.add(partLocation); + } + } + } + } + if(numWholeBatches * batchSize < partNames.size()) { + //last partial batch + List partitionList = hms.getPartitionsByNames(t.getDbName(), t.getTableName(), + partNames.subList(numWholeBatches * batchSize, partNames.size())); + for (Partition part : partitionList) { + if (part.getCatName() == null || part.getCatName().equalsIgnoreCase(MetaStoreUtils.getDefaultCatalog(conf))) { + String partLocation = part.getSd().getLocation(); + inputLocations.put(tblLocation, new DataLocation(db, tblName, null)); + Path partPath = new Path(partLocation); + boolean partitionLocOutsideTableLoc = !org.apache.hadoop.hive.common.FileUtils.isPathWithinSubtree( + partPath, tblPath); + if (partitionLocOutsideTableLoc && !isCovered(locations, partPath)) { + locations.add(partLocation); + } + } + } + } + } + } + } + if(!locations.isEmpty()) { + removeNestedStructure(locations); + createOutputList(locations, outputDir, dbPattern); + } + else { + System.out.println("No external tables found to process."); + } + } + + /* + * Method to determine if an existing location covers the given location and record the coverage in output. + */ + private boolean isCovered(TreeSet locations, Path path) { + Path originalPath = new Path(path.toString()); + while(path != null){ + if(locations.contains(path.toString())){ + addCoverage(path, originalPath, true); + return true; + } + path = path.getParent(); + } + return false; + } + + /* + * Method to cover a child node using a parent. + * Removes the child and marks all nodes covered by the child as being covered by the parent. + */ + private void addCoverage(Path parentPath, Path childPath, boolean addChild) { + String childLoc = childPath.toString(); + String parentLoc = parentPath.toString(); + HashSet pathsUnderChild = coverageList.get(childLoc); + coverageList.remove(childLoc); + if(coverageList.get(parentLoc) == null) { + coverageList.put(parentLoc, new HashSet<>()); + } + HashSet pathsUnderParent = coverageList.get(parentLoc); + if(addChild) { + pathsUnderParent.add(childPath.toString()); + } + if(pathsUnderChild != null) { + pathsUnderParent.addAll(pathsUnderChild); + pathsUnderChild = null; + } + } + + /* + * Transforms a collection so that no element is an ancestor of another. + */ + private void removeNestedStructure(TreeSet locations) { + List locationList = new ArrayList<>(); + locationList.addAll(locations); + for(int i = 0; i < locationList.size(); i++) { + String currLoc = locationList.get(i); + Path currPath = new Path(currLoc); + for(int j = i + 1; j < locationList.size(); j++) { + String nextLoc = locationList.get(j); + Path nextPath = new Path (nextLoc); + if(org.apache.hadoop.hive.common.FileUtils.isPathWithinSubtree(nextPath, currPath)) { + addCoverage(currPath, nextPath, true); + locations.remove(nextLoc); + i = j; + } + else { + i = j - 1; + break; + } + } + } + } + + /* + * Method to write the output to the given location. + * We construct a tree out of external table - locations and use it to determine suitable directories covering all locations. + */ + private void createOutputList(TreeSet locations, String outputDir, String dbPattern) throws IOException { + ExternalTableGraphNode rootNode = constructTree(locations); + //Traverse through the tree in breadth-first manner and decide which nodes to include. + //For every node, either cover all leaves under it in + Queue queue = new LinkedList<>(); + queue.add(rootNode); + while(!queue.isEmpty()){ + ExternalTableGraphNode current = queue.remove(); + if(current.isLeaf()) { + // in this case, the leaf needs to be added to the solution, i.e. marked as being covered. + // This was done during graph construction, so we continue. + continue; + } + int nonTrivialCoverage = 0; + List childNodes = current.getChildNodes(); + for(ExternalTableGraphNode child : childNodes) { + if (child.getNumLeavesCovered() > 1) { + nonTrivialCoverage += child.getNumLeavesCovered(); + } + } + if(nonTrivialCoverage >= (current.getNumLeavesCovered() + 1) / 2) { + queue.addAll(childNodes); + } else { + addToSolution(current); + } + } + String outFileName = "externalTableLocations_" + dbPattern + "_" + System.currentTimeMillis() + ".txt"; + LOG.debug("Writing output to " + outFileName); + FileWriter fw = new FileWriter(outputDir + "/" + outFileName); + PrintWriter pw = new PrintWriter(fw); + JsonObject jsonObject = new JsonObject(); + for(String outputLocation : coverageList.keySet()) { + HashSet coveredLocations = coverageList.get(outputLocation); + String consolidatedLocations = String.join(",", coveredLocations); + if(coverageList.size() == 1 && coveredLocations.size() == 1) { + jsonObject.addProperty(consolidatedLocations, consolidatedLocations); + } + else { + jsonObject.addProperty(outputLocation, consolidatedLocations); + } + } + JSONObject js = new JSONObject(jsonObject.toString()); + pw.println(js.toString(4)); + pw.close(); + } + + private ExternalTableGraphNode constructTree(TreeSet locations) { + ExternalTableGraphNode rootNode = null; + HashMap locationGraph = new HashMap<>(); + // Every location is represented by a leaf in the tree. + // We traverse through the input locations and construct the tree. + for (String leaf : locations) { + ExternalTableGraphNode currNode = new ExternalTableGraphNode(leaf, new ArrayList<>(), true); + locationGraph.put(leaf, currNode); + if (coverageList.get(leaf) == null) { + coverageList.put(leaf, new HashSet<>()); + } + //mark the leaf as being covered as itself + coverageList.get(leaf).add(leaf); + Path parent = new Path(leaf).getParent(); + ExternalTableGraphNode parNode; + //traverse upward to the root in order to construct the graph + while (parent != null) { + String parentLoc = parent.toString(); + if (!locationGraph.containsKey(parentLoc)) { + //if parent doesn't exist in graph then create it + parNode = new ExternalTableGraphNode(parentLoc, new ArrayList<>(), false); + locationGraph.put(parentLoc, parNode); + } + else { + parNode = locationGraph.get(parentLoc); + } + if(currNode.getParent() == null) { + parNode.addChild(currNode); + currNode.setParent(parNode); + } + else { + break; + } + currNode = parNode; + parent = parent.getParent(); + } + if (parent == null && rootNode == null) { + rootNode = currNode; + rootNode.setParent(rootNode); + } + } + rootNode.updateNumLeavesCovered(); + return rootNode; + } + + private void addToSolution(ExternalTableGraphNode node) { + //since this node is in the solution, all its children should be covered using this node. + if(!node.isLeaf()) { + addCoverageRecursive(node); + } + } + + private void addCoverageRecursive(ExternalTableGraphNode node) { + for(ExternalTableGraphNode child : node.getChildNodes()) { + if(child.isLeaf()) { + addCoverage(new Path(node.getLocation()), new Path(child.getLocation()), true); + } + else { + addCoverageRecursive(child); + addCoverage(new Path(node.getLocation()), new Path(child.getLocation()), false); + } + } + } + + private static IMetaStoreClient getHMS(Configuration conf) { + UserGroupInformation loggedInUser = null; + try { + loggedInUser = UserGroupInformation.getLoginUser(); + } catch (IOException e) { + LOG.warn("Unable to get logged in user via UGI. err: {}", e.getMessage()); + } + boolean secureMode = loggedInUser != null && loggedInUser.hasKerberosCredentials(); + if (secureMode) { + MetastoreConf.setBoolVar(conf, MetastoreConf.ConfVars.USE_THRIFT_SASL, true); + } + try { + LOG.info("Creating metastore client for {}", "ExternalTableTool"); + return RetryingMetaStoreClient.getProxy(conf, true); + } catch (MetaException e) { + throw new RuntimeException("Error connecting to Hive Metastore URI: " + + MetastoreConf.get(conf, MetastoreConf.ConfVars.THRIFT_URIS.getVarname()) + ". " + e.getMessage(), e); + } + } + + + /** + * can set it from tests to test when config needs something other than default values + * For example, that acid is enabled + */ + @VisibleForTesting + static Configuration msConf = null; + + private class DataLocation { + private String dbName; + private String tblName; + private String partName; + + private DataLocation (String dbName, String tblName, String partName) { + this.dbName = dbName; + this.tblName = tblName; + this.partName = partName; + } + } + + private class ExternalTableGraphNode { + private String location; + private List childNodes; + private ExternalTableGraphNode parent; + private boolean isLeaf; + private int numLeavesCovered; + + private ExternalTableGraphNode(String location, List childNodes, boolean isLeaf) { + this.location = location; + this.childNodes = childNodes; + this.isLeaf = isLeaf; + this.parent = null; + } + + private void addChild(ExternalTableGraphNode child) { + this.childNodes.add(child); + } + + private List getChildNodes() { + return this.childNodes; + } + + private boolean isLeaf() { + return this.isLeaf; + } + + private void setNumLeavesCovered(int numLeavesCovered) { + this.numLeavesCovered = numLeavesCovered; + } + + private int getNumLeavesCovered() { + return this.numLeavesCovered; + } + + private String getLocation() { + return this.location; + } + + private void setParent(ExternalTableGraphNode node) { + this.parent = node; + } + + private ExternalTableGraphNode getParent() { + return this.parent; + } + + private void updateNumLeavesCovered() { + if(this.isLeaf) { + this.numLeavesCovered = 1; + return; + } + this.numLeavesCovered = 0; + for(ExternalTableGraphNode currChild : childNodes) { + currChild.updateNumLeavesCovered(); + this.numLeavesCovered += currChild.getNumLeavesCovered(); + } + } + } +}