diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/util/DAGTraversal.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/util/DAGTraversal.java index cb5dc2e5d0..40f5f55ccb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/util/DAGTraversal.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/util/DAGTraversal.java @@ -23,7 +23,9 @@ import java.io.Serializable; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; /** * The dag traversal done here is written to be not recursion based as large DAG's will lead to @@ -33,7 +35,21 @@ public static void traverse(List<Task<? extends Serializable>> tasks, Function function) { List<Task<? extends Serializable>> listOfTasks = new ArrayList<>(tasks); while (!listOfTasks.isEmpty()) { - List<Task<? extends Serializable>> children = new ArrayList<>(); + // HashSet will make sure that no duplicate children are added. If a task is added multiple + // times to the children list then it may cause the list to grow exponentially. Let's take an example of + // incremental load with 2 events. The DAG will look something similar as below. + // + // --->ev1.task1-- --->ev2.task1-- + // / \ / \ + // evTaskRoot-->*---->ev1.task2---*--> ev1.barrierTask-->*---->ev2.task2---*->ev2.barrierTask------- + // \ / + // --->ev1.task3-- + // + // While traversing the DAG, if the filter is not added then ev1.barrierTask will be added 3 times in + // the children list and in the next iteration ev2.task1 will be added 3 times and ev2.task2 will be added + // 3 times. So in the next iteration ev2.barrierTask will be added 6 times. As it goes like this, the next barrier + // task will be added 12-15 times and may reach millions with a large number of events. + Set<Task<? extends Serializable>> children = new HashSet<>(); for (Task<? extends Serializable> task : listOfTasks) { // skip processing has to be done first before continuing if (function.skipProcessing(task)) { @@ -48,7 +64,7 @@ public static void traverse(List<Task<? extends Serializable>> tasks, Function f } function.process(task); } - listOfTasks = children; + listOfTasks = new ArrayList<>(children); } }