diff --git common/src/java/org/apache/hadoop/hive/common/FixedSizeCollection.java common/src/java/org/apache/hadoop/hive/common/FixedSizeCollection.java
new file mode 100644
index 0000000000000000000000000000000000000000..6b2e65ecb7865e84a9f7040a9505fdd87a0a4d2e
--- /dev/null
+++ common/src/java/org/apache/hadoop/hive/common/FixedSizeCollection.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.common;
+
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ForwardingList;
+import com.google.common.collect.ForwardingSet;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * A utility class that can decorate a collection to throw an exception
+ * if its size exceeds a predefined threshold.
+ */
+public class FixedSizeCollection {
+
+  public static <T> Set<T> setMaxSize(
+      final Set<T> input, final int maxSize) {
+
+    Preconditions.checkArgument(input.size() < maxSize,
+        "Current size " + input.size() + " already exceeds maximum of " + maxSize);
+    return new ForwardingSet<T>() {
+
+      @Override
+      public boolean addAll(Collection<? extends T> collection) {
+        return standardAddAll(collection);
+      }
+
+      @Override
+      public boolean add(T e) {
+        checkMaxSize();
+        return delegate().add(e);
+      }
+
+      private void checkMaxSize() {
+        if (size() >= maxSize) {
+          throw new UnsupportedOperationException("Maximum Size " + maxSize + " reached");
+        }
+      }
+
+      @Override
+      protected Set<T> delegate() {
+        return input;
+      }
+    };
+  }
+
+  public static <T> List<T> setMaxSize(
+      final List<T> input, final int maxSize) {
+
+    Preconditions.checkArgument(input.size() < maxSize,
+        "Current size " + input.size() + " already exceeds maximum of " + maxSize);
+    return new ForwardingList<T>() {
+
+      @Override
+      public boolean addAll(Collection<? extends T> collection) {
+        return standardAddAll(collection);
+      }
+
+      @Override
+      public boolean addAll(int index, Collection<? extends T> elements) {
+        return standardAddAll(index, elements);
+      }
+
+      @Override
+      public boolean add(T e) {
+        checkMaxSize();
+        return delegate().add(e);
+      }
+
+      @Override
+      public void add(final int index, final T e) {
+        checkMaxSize();
+        delegate().add(index, e);
+      }
+
+      private void checkMaxSize() {
+        if (size() >= maxSize) {
+          throw new UnsupportedOperationException("Maximum Size " + maxSize + " reached");
+        }
+      }
+
+      @Override
+      protected List<T> delegate() {
+        return input;
+      }
+    };
+  }
+}
diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index a479deb7c0c6b779277f1029009b7dfab6dcb9e3..946a26f64b924d114afa3fd92654896ef35e7ef3 100644
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -778,7 +778,9 @@ private static void populateLlapDaemonVarsSet(Set<String> llapDaemonVarsSetLocal
     METASTORE_BATCH_RETRIEVE_OBJECTS_MAX(
         "hive.metastore.batch.retrieve.table.partition.max", 1000,
         "Maximum number of objects that metastore internally retrieves in one batch."),
-
+    METASTORE_STRICT_CHECK_RESULTS_MAX(
+        "hive.metastore.strict.check.results.max", 500_000,
+        "Maximum number of results the Hive MetaStore checker may return in strict mode"),
     METASTORE_INIT_HOOKS("hive.metastore.init.hooks", "",
         "A comma separated list of hooks to be invoked at the beginning of HMSHandler initialization. \n" +
         "An init hook is specified as the name of Java class which extends org.apache.hadoop.hive.metastore.MetaStoreInitListener."),
diff --git common/src/test/org/apache/hadoop/hive/common/TestFixedSizeCollection.java common/src/test/org/apache/hadoop/hive/common/TestFixedSizeCollection.java
new file mode 100644
index 0000000000000000000000000000000000000000..cf1c8017df90e9c9a1ffcca20a1ea95587a8d621
--- /dev/null
+++ common/src/test/org/apache/hadoop/hive/common/TestFixedSizeCollection.java
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.common;
+
+
+import com.google.common.collect.Lists;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+public class TestFixedSizeCollection {
+
+  @Test
+  public void testList_SizeFixing_Add() {
+
+    List<String> l = new ArrayList<>();
+    List<String> fixedSize = FixedSizeCollection.setMaxSize(l, 5);
+
+    fixedSize.add("a");
+    fixedSize.add("a");
+    fixedSize.add("a");
+    fixedSize.add("a");
+    fixedSize.add("a");
+    try {
+      fixedSize.add("a");
+    } catch (UnsupportedOperationException e) {
+      return;
+    }
+    Assert.fail("Expected to fail after 5 elements");
+  }
+
+  @Test
+  public void testList_SizeFixing_AddAll() {
+
+    List<String> l = new ArrayList<>();
+    List<String> fixedSize = FixedSizeCollection.setMaxSize(l, 5);
+
+    fixedSize.addAll(Lists.newArrayList("a", "a", "a", "a", "a"));
+    try {
+      fixedSize.addAll(Lists.newArrayList("a"));
+    } catch (UnsupportedOperationException e) {
+      return;
+    }
+    Assert.fail("Expected to fail after 5 elements");
+  }
+
+  @Test
+  public void testList_SizeFixing_Insert() {
+
+    List<String> l = new ArrayList<>();
+    List<String> fixedSize = FixedSizeCollection.setMaxSize(l, 5);
+
+    fixedSize.add(0, "a");
+    fixedSize.add(1, "a");
+    fixedSize.add(2, "a");
+    fixedSize.add(3, "a");
+    fixedSize.add(4, "a");
+    try {
+      fixedSize.add(0, "a");
+    } catch (UnsupportedOperationException e) {
+      return;
+    }
+    Assert.fail("Expected to fail after 5 elements");
+  }
+
+  @Test
+  public void testSet_SizeFixing_Add() {
+
+    Set<String> l = new HashSet<>();
+    Set<String> fixedSize = FixedSizeCollection.setMaxSize(l, 5);
+
+    fixedSize.add("a");
+    fixedSize.add("b");
+    fixedSize.add("c");
+    fixedSize.add("d");
+    fixedSize.add("e");
+    try {
+      fixedSize.add("f");
+    } catch (UnsupportedOperationException e) {
+      return;
+    }
+    Assert.fail("Expected to fail after 5 elements");
+  }
+
+  @Test
+  public void testSet_SizeFixing_AddAll() {
+
+    Set<String> l = new HashSet<>();
+    Set<String> fixedSize = FixedSizeCollection.setMaxSize(l, 5);
+
+    fixedSize.addAll(Lists.newArrayList("a", "b", "c", "d", "e"));
+    try {
+      fixedSize.addAll(Lists.newArrayList("f"));
+    } catch (UnsupportedOperationException e) {
+      return;
+    }
+    Assert.fail("Expected to fail after 5 elements");
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testList_SizeFixing_InitEx() {
+
+    List<String> l = new ArrayList<>();
+    l.add("a");
+    FixedSizeCollection.setMaxSize(l, 1);
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testSet_SizeFixing_InitEx() {
+
+    Set<String> l = new HashSet<>();
+    l.add("a");
+    FixedSizeCollection.setMaxSize(l, 1);
+  }
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java
index 6805c17a116f5ef0febd36c59d454fa631ae0024..f4d70f3791ee181b43f3936c2a11abe164caf1ba 100644
--- ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java
+++ ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java
@@ -19,6 +19,7 @@
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
@@ -35,7 +36,11 @@
 import java.util.concurrent.ThreadPoolExecutor;
 
 import com.google.common.collect.Sets;
+import org.apache.hadoop.hive.common.FixedSizeCollection;
+import org.apache.hadoop.hive.common.StringInternUtils;
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.ql.log.PerfLogger;
+import org.apache.hadoop.hive.ql.session.SessionState;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.fs.FileStatus;
@@ -49,8 +54,6 @@
 import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
 import org.apache.hadoop.hive.ql.metadata.CheckResult.PartitionResult;
-import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
-import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
 import org.apache.thrift.TException;
 
 import com.google.common.util.concurrent.MoreExecutors;
@@ -64,6 +67,7 @@ public class HiveMetaStoreChecker {
 
   public static final Logger LOG = LoggerFactory.getLogger(HiveMetaStoreChecker.class);
+  public static final String CLASS_NAME = HiveMetaStoreChecker.class.getName();
 
   private final Hive hive;
   private final HiveConf conf;
@@ -208,19 +212,33 @@ void checkTable(String dbName, String tableName,
       return;
     }
 
-    List<Partition> parts = new ArrayList<Partition>();
+    PartitionIterable parts;
     boolean findUnknownPartitions = true;
 
    if (table.isPartitioned()) {
      if (partitions == null || partitions.isEmpty()) {
-        PrunedPartitionList prunedPartList =
-        PartitionPruner.prune(table, null, conf, toString(), null);
-        // no partitions specified, let's get all
-        parts.addAll(prunedPartList.getPartitions());
+        String mode = HiveConf.getVar(conf, ConfVars.HIVEMAPREDMODE, (String) null);
+        int batchRetrieveObjectsMax = HiveConf.getIntVar(conf, ConfVars.METASTORE_STRICT_CHECK_RESULTS_MAX);
+        if ("strict".equalsIgnoreCase(mode)) {
+          parts = new PartitionIterable(hive, table, null, conf.getIntVar(
+              HiveConf.ConfVars.METASTORE_BATCH_RETRIEVE_MAX));
+          result.setPartitionsNotInMs(
+              FixedSizeCollection.setMaxSize(result.getPartitionsNotInMs(), batchRetrieveObjectsMax));
+          result.setPartitionsNotOnFs(
+              FixedSizeCollection.setMaxSize(result.getPartitionsNotOnFs(), batchRetrieveObjectsMax));
+        } else {
+          List<Partition> loadedPartitions = new ArrayList<>();
+          PerfLogger perfLogger = SessionState.getPerfLogger();
+          perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.PARTITION_RETRIEVING);
+          loadedPartitions.addAll(hive.getAllPartitionsOf(table));
+          perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.PARTITION_RETRIEVING);
+          parts = new PartitionIterable(loadedPartitions);
+        }
      } else {
        // we're interested in specific partitions,
        // don't check for any others
        findUnknownPartitions = false;
+        List<Partition> loadedPartitions = new ArrayList<>();
        for (Map<String, String> map : partitions) {
          Partition part = hive.getPartition(table, map, false);
          if (part == null) {
@@ -229,10 +247,13 @@ void checkTable(String dbName, String tableName,
            pr.setPartitionName(Warehouse.makePartPath(map));
            result.getPartitionsNotInMs().add(pr);
          } else {
-            parts.add(part);
+            loadedPartitions.add(part);
          }
        }
+        parts = new PartitionIterable(loadedPartitions);
      }
+    } else {
+      parts = new PartitionIterable(Collections.emptyList());
    }
 
    checkTable(table, parts, findUnknownPartitions, result);
@@ -255,7 +276,7 @@ void checkTable(String dbName, String tableName,
   * @throws HiveException
   *           Could not create Partition object
   */
-  void checkTable(Table table, List<Partition> parts,
+  void checkTable(Table table, PartitionIterable parts,
      boolean findUnknownPartitions, CheckResult result) throws IOException,
      HiveException {
@@ -284,7 +305,9 @@ void checkTable(Table table, List<Partition> parts,
      }
 
      for (int i = 0; i < partition.getSpec().size(); i++) {
-        partPaths.add(partPath.makeQualified(fs));
+        Path qualifiedPath = partPath.makeQualified(fs);
+        StringInternUtils.internUriStringsInPath(qualifiedPath);
+        partPaths.add(qualifiedPath);
        partPath = partPath.getParent();
      }
    }
diff --git ql/src/test/queries/clientnegative/msck_repair_4.q ql/src/test/queries/clientnegative/msck_repair_4.q
new file mode 100644
index 0000000000000000000000000000000000000000..7c0cb6ebc2ea4917e279a729c4f79ff5c03c9dd2
--- /dev/null
+++ ql/src/test/queries/clientnegative/msck_repair_4.q
@@ -0,0 +1,16 @@
+set hive.msck.repair.batch.size=1;
+set hive.mapred.mode=strict;
+set hive.metastore.strict.check.results.max=1;
+
+DROP TABLE IF EXISTS repairtable;
+
+CREATE TABLE repairtable(col STRING) PARTITIONED BY (p1 STRING, p2 STRING);
+
+dfs ${system:test.dfs.mkdir} ${system:test.warehouse.dir}/repairtable/p1=c/p2=a/p3=b;
+dfs ${system:test.dfs.mkdir} ${system:test.warehouse.dir}/repairtable/p1=e/p2=f/p3=g;
+dfs -touchz ${system:test.warehouse.dir}/repairtable/p1=c/p2=a/p3=b/datafile;
+dfs -touchz ${system:test.warehouse.dir}/repairtable/p1=e/p2=f/p3=g/datafile;
+
+MSCK REPAIR TABLE default.repairtable;
+
+DROP TABLE default.repairtable;
diff --git ql/src/test/queries/clientpositive/msck_repair_0.q ql/src/test/queries/clientpositive/msck_repair_0.q
index ce8ef426a2a58845afc8333259d66725db416584..3329d49867ef2b302ae28eadb21f4770fff8752e 100644
--- ql/src/test/queries/clientpositive/msck_repair_0.q
+++ ql/src/test/queries/clientpositive/msck_repair_0.q
@@ -16,4 +16,8 @@ MSCK REPAIR TABLE default.repairtable;
 
 MSCK TABLE repairtable;
 
+set hive.mapred.mode=strict;
+
+MSCK REPAIR TABLE default.repairtable;
+
 DROP TABLE default.repairtable;
diff --git ql/src/test/results/clientnegative/msck_repair_4.q.out ql/src/test/results/clientnegative/msck_repair_4.q.out
new file mode 100644
index 0000000000000000000000000000000000000000..4a8c8e626f5ec406c69df20a2aed8ad56f5fd68c
--- /dev/null
+++ ql/src/test/results/clientnegative/msck_repair_4.q.out
@@ -0,0 +1,16 @@
+PREHOOK: query: DROP TABLE IF EXISTS repairtable
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS repairtable
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE repairtable(col STRING) PARTITIONED BY (p1 STRING, p2 STRING)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@repairtable
+POSTHOOK: query: CREATE TABLE repairtable(col STRING) PARTITIONED BY (p1 STRING, p2 STRING)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@repairtable
+PREHOOK: query: MSCK REPAIR TABLE default.repairtable
+PREHOOK: type: MSCK
+PREHOOK: Output: default@repairtable
+FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. Maximum Size 1 reached
diff --git ql/src/test/results/clientpositive/msck_repair_0.q.out ql/src/test/results/clientpositive/msck_repair_0.q.out
index 3f2fe75b194f1248bd5c073dd7db6b71b2ffc2ba..fe39352e7dfb5118715a8dde027a6f1fed72123f 100644
--- ql/src/test/results/clientpositive/msck_repair_0.q.out
+++ ql/src/test/results/clientpositive/msck_repair_0.q.out
@@ -37,6 +37,12 @@ PREHOOK: Output: default@repairtable
 POSTHOOK: query: MSCK TABLE repairtable
 POSTHOOK: type: MSCK
 POSTHOOK: Output: default@repairtable
+PREHOOK: query: MSCK REPAIR TABLE default.repairtable
+PREHOOK: type: MSCK
+PREHOOK: Output: default@repairtable
+POSTHOOK: query: MSCK REPAIR TABLE default.repairtable
+POSTHOOK: type: MSCK
+POSTHOOK: Output: default@repairtable
 PREHOOK: query: DROP TABLE default.repairtable
 PREHOOK: type: DROPTABLE
 PREHOOK: Input: default@repairtable
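
Not part of the patch: a minimal standalone sketch of how the FixedSizeCollection decorator introduced above behaves, assuming only what the diff shows (setMaxSize and the UnsupportedOperationException raised once the cap is hit). The class name FixedSizeCollectionDemo and the sample partition-name strings are purely illustrative.

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.common.FixedSizeCollection;

// Hypothetical driver: cap a result list at 3 entries, mirroring how the
// checker caps CheckResult collections when hive.mapred.mode=strict.
public class FixedSizeCollectionDemo {
  public static void main(String[] args) {
    List<String> results = FixedSizeCollection.setMaxSize(new ArrayList<String>(), 3);
    results.add("p1=a/p2=b");
    results.add("p1=c/p2=d");
    results.add("p1=e/p2=f");
    try {
      // Fourth add: size() >= maxSize, so the decorator refuses it.
      results.add("p1=g/p2=h");
    } catch (UnsupportedOperationException e) {
      System.out.println(e.getMessage()); // prints "Maximum Size 3 reached"
    }
  }
}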