From 4dabfdc21ea4ac2ef5240ed89cec89c658a45030 Mon Sep 17 00:00:00 2001
From: Sam An
Date: Mon, 30 Mar 2020 18:12:55 -0700
Subject: [PATCH] MsckPartitionExpressionProxy should filter partitions

---
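Note for reviewers: MSCK drops partitions by building a filter expression from
each partition spec (via Msck.makePartExpr, made public here) and then asking
the expression proxy to narrow the candidate partition names. For illustration
only (the exact quoting is inferred from the parsing code added below rather
than confirmed against makePartExpr itself): a spec such as
{state=CA, dt=1986-04-28} is expected to produce an expression along the lines
of

    state='CA' AND dt='1986-04-28'

and filterPartitionsByExpr keeps only the partition names, e.g.
state=CA/dt=1986-04-28, that contain every col=value pair from it.
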
 .../apache/hadoop/hive/metastore/Msck.java    |  2 +-
 .../MsckPartitionExpressionProxy.java         | 49 +++++++++++++++++++
 .../metastore/TestPartitionManagement.java    | 40 +++++++++++++++
 3 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/Msck.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/Msck.java
index fab83b65015..f4e109d1b0b 100644
--- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/Msck.java
+++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/Msck.java
@@ -394,7 +394,7 @@ public Void execute(int size) throws MetastoreException {
     }.run();
   }
 
-  private static String makePartExpr(Map<String, String> spec)
+  public static String makePartExpr(Map<String, String> spec)
     throws MetaException {
     StringBuilder suffixBuf = new StringBuilder();
     int i = 0;
diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/MsckPartitionExpressionProxy.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/MsckPartitionExpressionProxy.java
index d8428255590..11ca1fd8b9a 100644
--- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/MsckPartitionExpressionProxy.java
+++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/MsckPartitionExpressionProxy.java
@@ -18,12 +18,19 @@
  */
 
 import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.metastore.api.FileMetadataExprType;
 import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.utils.FileUtils;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.util.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 // This is added as part of moving MSCK code from ql to standalone-metastore. There is a metastore API to drop
 // partitions by name but we cannot use it because msck typically will contain partition value (year=2014). We almost
@@ -36,6 +43,8 @@
 // should use SearchArgument (storage-api) to construct the filter expression and not depend on ql, but the usecase
 // for msck is pretty simple and this specific implementation should suffice.
 public class MsckPartitionExpressionProxy implements PartitionExpressionProxy {
+  private static final Logger LOG = LoggerFactory.getLogger(MsckPartitionExpressionProxy.class);
+
   @Override
   public String convertExprToFilter(final byte[] exprBytes, final String defaultPartitionName) throws MetaException {
     return new String(exprBytes, StandardCharsets.UTF_8);
@@ -44,6 +53,46 @@ public String convertExprToFilter(final byte[] exprBytes, final String defaultPa
   @Override
   public boolean filterPartitionsByExpr(List<FieldSchema> partColumns, byte[] expr, String defaultPartitionName,
                                         List<String> partitionNames) throws MetaException {
+    String partExpr = new String(expr, StandardCharsets.UTF_8);
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Partition expr: {}", partExpr);
+    }
+    // Find in partitionNames all the names that match expr.
+    // This is the reverse of Msck.makePartExpr.
+    Set<String> partValueSet = new HashSet<>();
+    String[] parts = partExpr.split(" AND ");
+    for (String part : parts) {
+      String[] colAndValue = part.split("=");
+      String key = FileUtils.unescapePathName(colAndValue[0]);
+      // Take the value without the single quote marks: '2018-10-30' becomes 2018-10-30.
+      String value = FileUtils.unescapePathName(colAndValue[1].substring(1, colAndValue[1].length() - 1));
+      partValueSet.add(key + "=" + value);
+    }
+
+    List<String> partNamesSeq = new ArrayList<>();
+    for (String partition : partitionNames) {
+      boolean isMatch = true;
+      for (String col : partValueSet) {
+        // Given a list of partitions [year=2001/month=1, year=2002/month=2, year=2001/month=3]
+        // and an expr such as year='2001' AND month='1', a partition is added to the filtered
+        // result [year=2001/month=1] only when every col=value pair in the expr is found in it.
+        if (partition.indexOf(col) == -1) {
+          isMatch = false;
+          break;
+        }
+      }
+      if (isMatch) {
+        partNamesSeq.add(partition);
+      }
+    }
+    partitionNames.clear();
+    partitionNames.addAll(partNamesSeq);
+    LOG.info("The returned partition list is of size: {}", partitionNames.size());
+    for (String s : partitionNames) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Matched partition: {}", s);
+      }
+    }
     return false;
   }
 
diff --git a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestPartitionManagement.java b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestPartitionManagement.java
index 1961a70cd70..0813dba2524 100644
--- a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestPartitionManagement.java
+++ b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestPartitionManagement.java
@@ -24,6 +24,7 @@
 import java.io.IOException;
 import java.net.URI;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -40,6 +41,7 @@
 import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
 import org.apache.hadoop.hive.metastore.api.Catalog;
 import org.apache.hadoop.hive.metastore.api.Database;
+import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.hadoop.hive.metastore.api.Partition;
 import org.apache.hadoop.hive.metastore.api.Table;
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
@@ -654,6 +656,42 @@ public void testNoPartitionRetentionForReplTarget() throws TException, Interrupt
     assertEquals(3, partitions.size());
   }
 
+  @Test
+  public void testPartitionExprFilter() throws TException, IOException {
+    String dbName = "db10";
+    String tableName = "tbl10";
+    Map<String, Column> colMap = buildAllColumns();
+    List<String> partKeys = Lists.newArrayList("state", "dt");
+    List<String> partKeyTypes = Lists.newArrayList("string", "date");
+    List<List<String>> partVals = Lists.newArrayList(
+        Lists.newArrayList("__HIVE_DEFAULT_PARTITION__", "1990-01-01"),
+        Lists.newArrayList("CA", "1986-04-28"),
+        Lists.newArrayList("MN", "2018-11-31"));
+    createMetadata(DEFAULT_CATALOG_NAME, dbName, tableName, partKeys, partKeyTypes, partVals, colMap, false);
+    Table table = client.getTable(dbName, tableName);
+
+    table.getParameters().put(PartitionManagementTask.DISCOVER_PARTITIONS_TBLPROPERTY, "true");
+    table.getParameters().put("EXTERNAL", "true");
+    table.setTableType(TableType.EXTERNAL_TABLE.name());
+    client.alter_table(dbName, tableName, table);
+
+    List<Partition> partitions = client.listPartitions(dbName, tableName, (short) -1);
+    assertEquals(3, partitions.size());
+
+    String tableLocation = table.getSd().getLocation();
+    URI location = URI.create(tableLocation);
+    Path tablePath = new Path(location);
+    FileSystem fs = FileSystem.get(location, conf);
+    Path newPart1 = new Path(tablePath, "state=MN/dt=2018-11-31");
+    fs.delete(newPart1, true);
+
+    conf.set(MetastoreConf.ConfVars.PARTITION_MANAGEMENT_DATABASE_PATTERN.getVarname(), "*db10*");
+    conf.set(ConfVars.PARTITION_MANAGEMENT_TABLE_TYPES.getVarname(), TableType.EXTERNAL_TABLE.name());
+    runPartitionManagementTask(conf);
+    partitions = client.listPartitions(dbName, tableName, (short) -1);
+    assertEquals(2, partitions.size());
+  }
+
   private void runPartitionManagementTask(Configuration conf) {
     PartitionManagementTask task = new PartitionManagementTask();
     task.setConf(conf);
@@ -669,4 +707,6 @@ public Column(final String colName, final String colType) {
       this.colType = colType;
     }
   }
+
+
 }
-- 
2.23.0
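
For reference, a minimal standalone sketch of the matching logic the patch adds
to filterPartitionsByExpr, useful for reasoning about the behaviour without the
metastore classes on the classpath. The class name and main method are
illustrative, unescaping via FileUtils.unescapePathName is skipped, and unlike
the real method, which trims the partitionNames list in place and returns
false, this sketch returns a new list:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class PartExprFilterSketch {

  // Reverse of Msck.makePartExpr: split "col='val' AND col2='val2'" into
  // col=val tokens, then keep only the partition names containing all of them.
  static List<String> filter(String partExpr, List<String> partitionNames) {
    Set<String> partValueSet = new HashSet<>();
    for (String part : partExpr.split(" AND ")) {
      String[] colAndValue = part.split("=");
      // Strip the surrounding single quotes: '2001' becomes 2001.
      String value = colAndValue[1].substring(1, colAndValue[1].length() - 1);
      partValueSet.add(colAndValue[0] + "=" + value);
    }
    List<String> matched = new ArrayList<>();
    for (String partition : partitionNames) {
      boolean isMatch = true;
      for (String col : partValueSet) {
        if (!partition.contains(col)) {
          isMatch = false;
          break;
        }
      }
      if (isMatch) {
        matched.add(partition);
      }
    }
    return matched;
  }

  public static void main(String[] args) {
    List<String> partitions = Arrays.asList(
        "year=2001/month=1", "year=2002/month=2", "year=2001/month=3");
    // Prints [year=2001/month=1]
    System.out.println(filter("year='2001' AND month='1'", partitions));
  }
}

Run against the example from the patch's comments, this keeps only
year=2001/month=1. Note that, like the indexOf check in the patch, the plain
substring match would also accept month=1 against a name containing month=12,
so the filter relies on makePartExpr and the partition names using the same
escaping and value formats.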