commit 5031b8bbf1ca3f45ad3b725fc16a53bbc33221cc Author: Bharath Krishna Date: Wed Sep 12 23:35:59 2018 -0700 HIVE-20545: Exclude large-sized parameters from serialization of Table and Partition thrift objects in HMS notifications diff --git standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java index 30ea7f81292b0db54f4eb82468191fda38f9a0d4..4b001d0243e20dc9a119dbbe43783be6bb8ef0bf 100644 --- standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java +++ standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java @@ -512,6 +512,12 @@ public static ConfVars getMetaConf(String name) { "hive.metastore.event.message.factory", "org.apache.hadoop.hive.metastore.messaging.json.JSONMessageFactory", "Factory class for making encoding and decoding messages in the events generated."), + EVENT_NOTIFICATION_PARAMETERS_EXCLUDE_PATTERNS("metastore.notification.parameters.exclude.patterns", + "hive.metastore.notification.parameters.exclude.patterns", "", + "List of comma-separated regexes that are used to reduced the size of HMS Notification messages." + + " The regexes are matched against each key of parameters map in Table or Partition object" + + "present in HMS Notification. Any key-value pair whose key is matched with any regex will" + +" be removed from Parameters map during Serialization of Table/Partition object."), EVENT_DB_LISTENER_TTL("metastore.event.db.listener.timetolive", "hive.metastore.event.db.listener.timetolive", 86400, TimeUnit.SECONDS, "time after which events will be removed from the database listener queue"), @@ -1405,6 +1411,28 @@ public static boolean getBoolVar(Configuration conf, ConfVars var) { return val == null ? conf.getBoolean(var.hiveName, (Boolean)var.defaultVal) : Boolean.valueOf(val); } + /** + * Get values from comma-separated config, to an array after extracting individual values. + * @param conf Configuration to retrieve it from + * @param var variable to retrieve + * @return Array of String, containing each value from the comma-separated config, + * or default value if value not in config file + */ + public static String[] getTrimmedStringsVar(Configuration conf, ConfVars var) { + assert var.defaultVal.getClass() == String.class; + String[] result = conf.getTrimmedStrings(var.varname, (String[]) null); + if (result != null) { + return result; + } + if (var.hiveName != null) { + result = conf.getTrimmedStrings(var.hiveName, (String[]) null); + if (result != null) { + return result; + } + } + return org.apache.hadoop.util.StringUtils.getTrimmedStrings((String) var.getDefaultVal()); + } + /** * Set the variable as a boolean * @param conf configuration file to set it in diff --git standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreUtils.java standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreUtils.java index c681a87a1c6b10a4f9494e49a42282cf90027ad7..09ca27f0d7e4f95019041a7363795d5dfd8a7b17 100644 --- standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreUtils.java +++ standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreUtils.java @@ -56,8 +56,13 @@ import java.util.Map; import java.util.Properties; import java.util.TimeZone; +import java.util.function.Predicate; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; +import java.util.stream.Collectors; + +import static java.util.regex.Pattern.compile; public class MetaStoreUtils { /** A fixed date format to be used for hive partition column values. */ @@ -905,4 +910,41 @@ public static boolean isView(Table table) { } return TableType.VIRTUAL_VIEW.toString().equals(table.getTableType()); } + + /** + * filters a given map with predicate provided. All entries of map whose key matches with + * predicate will be removed. Expects map to be modifiable and does the operation on actual map, + * so does not return a copy of filtered map. + * @param map A map of String key-value pairs + * @param predicate Predicate with pattern to filter the map + */ + public static void filterMapKeys(Map map, Predicate predicate) { + if (map == null) { + return; + } + map.entrySet().removeIf(entry -> predicate.test(entry.getKey())); + } + + /** + * filters a given map with list of predicates. All entries of map whose key matches with any + * predicate will be removed. Expects map to be modifiable and does the operation on actual map, + * so does not return a copy of filtered map. + * @param map A map of String key-value pairs + * @param predicates List of predicates with patterns to filter the map + */ + public static void filterMapkeys(Map map, List> predicates) { + if (map == null) { + return; + } + filterMapKeys(map, predicates.stream().reduce(Predicate::or).orElse(x -> false)); + } + + /** + * Compile a list of regex patterns and collect them as Predicates. + * @param patterns List of regex patterns to be compiled + * @return a List of Predicate created by compiling the regex patterns + */ + public static List> compilePatternsToPredicates(List patterns) { + return patterns.stream().map(pattern -> compile(pattern).asPredicate()).collect(Collectors.toList()); + } } diff --git standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/messaging/MessageFactory.java standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/messaging/MessageFactory.java index 7ff168f7931f91fe17f7d38df848ba2eed33c463..58c6891d32202aa02e0294cb51a020354de59512 100644 --- standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/messaging/MessageFactory.java +++ standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/messaging/MessageFactory.java @@ -92,22 +92,23 @@ protected static final String MS_SERVICE_PRINCIPAL = MetastoreConf.getVar(conf, ConfVars.KERBEROS_PRINCIPAL, ""); + /** * Getter for MessageFactory instance. */ public static MessageFactory getInstance() { if (instance == null) { - instance = - getInstance(MetastoreConf.getVar(conf, ConfVars.EVENT_MESSAGE_FACTORY)); + instance = getInstance(MetastoreConf.getVar(conf, ConfVars.EVENT_MESSAGE_FACTORY)); } return instance; } private static MessageFactory getInstance(String className) { try { - return JavaUtils.newInstance(JavaUtils.getClass(className, MessageFactory.class)); - } - catch (MetaException e) { + MessageFactory factory = JavaUtils.newInstance(JavaUtils.getClass(className, MessageFactory.class)); + factory.init(); + return factory; + } catch (MetaException e) { throw new IllegalStateException("Could not construct MessageFactory implementation: ", e); } } @@ -133,6 +134,8 @@ public static MessageDeserializer getDeserializer(String format, // itself for discoverability? Might be worth pursuing. } + public void init() throws MetaException {} + public abstract MessageDeserializer getDeserializer(); /** diff --git standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/messaging/json/JSONMessageFactory.java standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/messaging/json/JSONMessageFactory.java index 2668b053205f48226da442ce65fcc2d7f6e76763..6aa079d29741ede6e9f32645d7f513b1a4cb0cde 100644 --- standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/messaging/json/JSONMessageFactory.java +++ standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/messaging/json/JSONMessageFactory.java @@ -19,16 +19,20 @@ package org.apache.hadoop.hive.metastore.messaging.json; +import java.util.Arrays; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.function.Predicate; +import java.util.regex.PatternSyntaxException; import javax.annotation.Nullable; import org.apache.hadoop.hive.metastore.api.Catalog; import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.Function; +import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.NotificationEvent; import org.apache.hadoop.hive.metastore.api.Partition; import org.apache.hadoop.hive.metastore.api.SQLForeignKey; @@ -37,6 +41,7 @@ import org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint; import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.api.TxnToWriteId; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; import org.apache.hadoop.hive.metastore.events.AcidWriteEvent; import org.apache.hadoop.hive.metastore.messaging.AbortTxnMessage; import org.apache.hadoop.hive.metastore.messaging.AddForeignKeyMessage; @@ -66,6 +71,7 @@ import org.apache.hadoop.hive.metastore.messaging.OpenTxnMessage; import org.apache.hadoop.hive.metastore.messaging.AcidWriteMessage; import org.apache.hadoop.hive.metastore.messaging.PartitionFiles; +import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; import org.apache.thrift.TBase; import org.apache.thrift.TDeserializer; import org.apache.thrift.TException; @@ -83,6 +89,8 @@ import com.google.common.collect.Iterators; import com.google.common.collect.Lists; +import static org.apache.hadoop.hive.metastore.utils.MetaStoreUtils.filterMapkeys; + /** * The JSON implementation of the MessageFactory. Constructs JSON implementations of each * message-type. @@ -93,6 +101,23 @@ private static JSONMessageDeserializer deserializer = new JSONMessageDeserializer(); + private static List> paramsFilter; + + @Override + public void init() throws MetaException { + super.init(); + + List excludePatterns = Arrays.asList(MetastoreConf + .getTrimmedStringsVar(conf, MetastoreConf.ConfVars.EVENT_NOTIFICATION_PARAMETERS_EXCLUDE_PATTERNS)); + try { + paramsFilter = MetaStoreUtils.compilePatternsToPredicates(excludePatterns); + } catch (PatternSyntaxException e) { + LOG.error("Regex pattern compilation failed. Verify that " + + "metastore.notification.parameters.exclude.patterns has valid patterns."); + throw new MetaException("Regex pattern compilation failed. " + e.getMessage()); + } + } + @Override public MessageDeserializer getDeserializer() { return deserializer; @@ -295,11 +320,17 @@ static String createCatalogObjJson(Catalog catObj) throws TException { } static String createTableObjJson(Table tableObj) throws TException { + //Note: The parameters of the Table object will be removed in the filter if it matches + // any pattern provided through EVENT_NOTIFICATION_PARAMETERS_EXCLUDE_PATTERNS + filterMapkeys(tableObj.getParameters(), paramsFilter); TSerializer serializer = new TSerializer(new TJSONProtocol.Factory()); return serializer.toString(tableObj, "UTF-8"); } static String createPartitionObjJson(Partition partitionObj) throws TException { + //Note: The parameters of the Partition object will be removed in the filter if it matches + // any pattern provided through EVENT_NOTIFICATION_PARAMETERS_EXCLUDE_PATTERNS + filterMapkeys(partitionObj.getParameters(), paramsFilter); TSerializer serializer = new TSerializer(new TJSONProtocol.Factory()); return serializer.toString(partitionObj, "UTF-8"); } diff --git standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/utils/TestMetaStoreServerUtils.java standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/utils/TestMetaStoreServerUtils.java index 30de1c4cfa1cf019186b10583a06da0bf5491634..b05cb54e77132ab776bec89f46ba5e28d84c9a63 100644 --- standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/utils/TestMetaStoreServerUtils.java +++ standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/utils/TestMetaStoreServerUtils.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.metastore.utils; import com.google.common.collect.ImmutableMap; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; @@ -30,30 +31,31 @@ import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.api.Partition; -import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.client.builder.DatabaseBuilder; -import org.apache.hadoop.hive.metastore.client.builder.PartitionBuilder; import org.apache.hadoop.hive.metastore.client.builder.TableBuilder; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; import org.apache.thrift.TException; import org.junit.Assert; import org.junit.Test; import org.junit.experimental.categories.Category; -import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import static java.util.regex.Pattern.compile; import static org.apache.hadoop.hive.common.StatsSetupConst.COLUMN_STATS_ACCURATE; import static org.apache.hadoop.hive.common.StatsSetupConst.FAST_STATS; import static org.apache.hadoop.hive.common.StatsSetupConst.NUM_FILES; import static org.apache.hadoop.hive.common.StatsSetupConst.NUM_ERASURE_CODED_FILES; import static org.apache.hadoop.hive.common.StatsSetupConst.STATS_GENERATED; import static org.apache.hadoop.hive.common.StatsSetupConst.TOTAL_SIZE; +import static org.apache.hadoop.hive.metastore.utils.MetaStoreUtils.filterMapkeys; import static org.hamcrest.core.Is.is; -import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; @@ -379,5 +381,84 @@ private static FileStatus getFileStatus(long fileLength, boolean isdir, int bloc new Path(pathString), false, false, isErasureCoded); } + @Test + public void testFilterMapWithPredicates() { + Map testMap = getTestParamMap(); + + List excludePatterns = Arrays.asList("lastDdl", "num"); + testMapFilter(testMap, excludePatterns); + assertFalse(testMap.containsKey("transient_lastDdlTime")); + assertFalse(testMap.containsKey("numFiles")); + assertFalse(testMap.containsKey("numFilesErasureCoded")); + assertFalse(testMap.containsKey("numRows")); + + Map expectedMap = new HashMap() {{ + put("totalSize", "1024"); + put("rawDataSize", "3243234"); + put("COLUMN_STATS_ACCURATE", "{\"BASIC_STATS\":\"true\""); + put("COLUMN_STATS_ACCURATED", "dummy"); + put("bucketing_version", "2"); + put("testBucketing_version", "2"); + }}; + + assertThat(expectedMap, is(testMap)); + + testMap = getTestParamMap(); + excludePatterns = Arrays.asList("^bucket", "ACCURATE$"); + testMapFilter(testMap, excludePatterns); + + expectedMap = new HashMap() {{ + put("totalSize", "1024"); + put("numRows", "10"); + put("rawDataSize", "3243234"); + put("COLUMN_STATS_ACCURATED", "dummy"); + put("numFiles", "2"); + put("transient_lastDdlTime", "1537487124"); + put("testBucketing_version", "2"); + put("numFilesErasureCoded", "0"); + }}; + + assertThat(expectedMap, is(testMap)); + + // test that if the config is not set in MetastoreConf, it does not filter any parameter + Configuration testConf = MetastoreConf.newMetastoreConf(); + testMap = getTestParamMap(); + excludePatterns = Arrays.asList(MetastoreConf + .getTrimmedStringsVar(testConf, MetastoreConf.ConfVars.EVENT_NOTIFICATION_PARAMETERS_EXCLUDE_PATTERNS)); + + testMapFilter(testMap, excludePatterns); + assertThat(getTestParamMap(), is(testMap)); + + + // test that if the config is set to empty String in MetastoreConf, it does not filter any parameter + testConf.setStrings(MetastoreConf.ConfVars.EVENT_NOTIFICATION_PARAMETERS_EXCLUDE_PATTERNS.getVarname(), ""); + testMap = getTestParamMap(); + excludePatterns = Arrays.asList(MetastoreConf + .getTrimmedStringsVar(testConf, MetastoreConf.ConfVars.EVENT_NOTIFICATION_PARAMETERS_EXCLUDE_PATTERNS)); + + testMapFilter(testMap, excludePatterns); + assertThat(getTestParamMap(), is(testMap)); + } + + private void testMapFilter(Map testMap, List patterns) { + List> paramsFilter = + patterns.stream().map(pattern -> compile(pattern).asPredicate()).collect(Collectors.toList()); + filterMapkeys(testMap, paramsFilter); + } + + private Map getTestParamMap() { + return new HashMap() {{ + put("totalSize", "1024"); + put("numRows", "10"); + put("rawDataSize", "3243234"); + put("COLUMN_STATS_ACCURATE", "{\"BASIC_STATS\":\"true\""); + put("COLUMN_STATS_ACCURATED", "dummy"); + put("numFiles", "2"); + put("transient_lastDdlTime", "1537487124"); + put("bucketing_version", "2"); + put("testBucketing_version", "2"); + put("numFilesErasureCoded", "0"); + }}; + } }