From 8cfc48be27bd19eb563c0378d0b4bdc9976a8428 Mon Sep 17 00:00:00 2001 Date: Mon, 10 May 2021 14:35:07 +0800 Subject: [PATCH] fix fair scheduler continuous scheduling thread crashes while sorting nodes --- .../scheduler/fair/FairScheduler.java | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index 1d97983778d..b768a03f732 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -990,12 +990,21 @@ private synchronized void nodeUpdate(RMNode nm) { void continuousSchedulingAttempt() throws InterruptedException { long start = getClock().getTime(); - List nodeIdList = new ArrayList(nodes.keySet()); + List nodeIdList; // Sort the nodes by space available on them, so that we offer // containers on emptier nodes first, facilitating an even spread. This // requires holding the scheduler lock, so that the space available on a // node doesn't change during the sort. 
synchronized (this) { + // The original NodeAvailableResourceComparator looks each NodeId up in the nodes map while comparing, + // but nodeIdList could differ from nodes.keySet() during the sort, because nodes may be updated before the synchronized block is entered. + // When nodes are removed before the sort, e.g. n2 and n3 are both removed, compare(n2, n3)=1 and compare(n3, n2)=1; + // this violates the antisymmetry required of a comparator (sgn(compare(a, b)) == -sgn(compare(b, a))), so we may get the exception: + // java.lang.IllegalArgumentException: Comparison method violates its general contract! + // When this happens, the ContinuousSchedulingThread exits, and we only call attemptScheduling + // at the time of nodeUpdate, which greatly reduces scheduling efficiency and leaves many jobs pending. + // So the nodeIdList snapshot is now taken inside the synchronized block. + nodeIdList = new ArrayList<>(nodes.keySet()); Collections.sort(nodeIdList, nodeAvailableResourceComparator); } @@ -1022,12 +1031,6 @@ void continuousSchedulingAttempt() throws InterruptedException { @Override public int compare(NodeId n1, NodeId n2) { - if (!nodes.containsKey(n1)) { - return 1; - } - if (!nodes.containsKey(n2)) { - return -1; - } return RESOURCE_CALCULATOR.compare(clusterResource, nodes.get(n2).getAvailableResource(), nodes.get(n1).getAvailableResource()); -- 2.24.3 (Apple Git-128)