diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/DiagnosticsService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/DiagnosticsService.java new file mode 100644 index 00000000000..c1a597f9fbc --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/DiagnosticsService.java @@ -0,0 +1,202 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.resourcemanager; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.util.Shell; +import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues; +import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.IssueType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + +/** + * Utility methods to launch the diagnostic scripts. + */ +@InterfaceAudience.Private +public final class DiagnosticsService { + private static final Logger LOG = LoggerFactory + .getLogger(DiagnosticsService.class); + private static final String PYTHON_COMMAND = "python"; + private static final String COLON = ":"; + private static final String COMMA = ","; + private static final String OUT_DIR_PREFIX = "out_dir:"; + private static final String EXECUTION_ERROR_MESSAGE = "Error occurred " + + "during the execution of the diagnostic script with the command '{}'."; + private static final String INCORRECT_NUMBER_OF_PARAMETERS_MESSAGE = + "Error while parsing diagnostic option, incorrect number of " + + "parameters. Expected 1 or 2, but got {}. Skipping this option."; + + private static String scriptLocation = "/tmp/diagnostics_collector.py"; + + private DiagnosticsService() { + // hidden constructor + } + + public static CommonIssues listCommonIssues() throws Exception { + if (Shell.WINDOWS) { + throw new UnsupportedOperationException("Not implemented for Windows."); + } + CommonIssues issueTypes = new CommonIssues(); + ProcessBuilder pb = createProcessBuilder(CommandArgument.LIST_ISSUES); + + List<String> result = executeCommand(pb); + for (String line : result) { + issueTypes.add(parseIssueType(line)); + } + + return issueTypes; + } + + public static String collectIssueData(String issueId, List<String> args) + throws Exception { + if (Shell.WINDOWS) { + throw new UnsupportedOperationException("Not implemented for Windows."); + } + ProcessBuilder pb = createProcessBuilder(CommandArgument.COMMAND, issueId, + args); + + List<String> result = executeCommand(pb); + Optional<String> outputDirectory = result.stream() + .filter(e -> e.contains(OUT_DIR_PREFIX)) + .findFirst(); + + if (!outputDirectory.isPresent()) { + LOG.error(EXECUTION_ERROR_MESSAGE, pb.command()); + throw new IOException("Output directory in result not found."); + } + + String[] splittedOutputDirectory = outputDirectory.get().split(COLON); + + if (splittedOutputDirectory.length != 2) { + LOG.error(EXECUTION_ERROR_MESSAGE, pb.command()); + throw new IOException("Output directory is invalid."); + } + return splittedOutputDirectory[1]; + } + + @VisibleForTesting + protected static ProcessBuilder createProcessBuilder(CommandArgument argument) { + return createProcessBuilder(argument, null, null); + } + + @VisibleForTesting + protected static ProcessBuilder createProcessBuilder( + CommandArgument argument, String issueId, List<String> additionalArgs) { + List<String> commandList = + new ArrayList<>(Arrays.asList(PYTHON_COMMAND, scriptLocation, + argument.getShortOption())); + + if (argument.equals(CommandArgument.COMMAND)) { +
commandList.add(issueId); + if (additionalArgs != null) { + commandList.add(CommandArgument.ARGUMENTS.getShortOption()); + commandList.addAll(additionalArgs); + } + } + + return new ProcessBuilder(commandList); + } + + private static List<String> executeCommand(ProcessBuilder pb) + throws Exception { + Process process = pb.start(); + int exitCode; + List<String> result = new ArrayList<>(); + + try (BufferedReader reader = new BufferedReader( + new InputStreamReader(process.getInputStream(), + StandardCharsets.UTF_8))) { + String line; + while ((line = reader.readLine()) != null) { + result.add(line); + } + process.waitFor(); + } catch (Exception e) { + LOG.error(EXECUTION_ERROR_MESSAGE, pb.command()); + throw e; + } + exitCode = process.exitValue(); + if (exitCode != 0) { + throw new IOException("The collector script exited with non-zero " + + "exit code: " + exitCode); + } + + return result; + } + + @VisibleForTesting + protected static IssueType parseIssueType(String line) { + String[] issueParams = line.split(COLON); + IssueType parsedIssueType; + + if (issueParams.length < 1 || issueParams.length > 2) { + LOG.warn(INCORRECT_NUMBER_OF_PARAMETERS_MESSAGE, + issueParams.length); + return null; + } else { + String name = issueParams[0]; + parsedIssueType = new IssueType(name); + if (issueParams.length == 2) { + List<String> parameterList = + Arrays.asList(issueParams[1].split(COMMA)); + parsedIssueType.setParameters(parameterList); + } + } + + return parsedIssueType; + } + + @VisibleForTesting + protected static void setScriptLocation(String scriptLocationParam) { + scriptLocation = scriptLocationParam; + } + + enum CommandArgument { + LIST_ISSUES("-l"), + COMMAND("-c"), + ARGUMENTS("-a"); + + private final String shortOption; + + CommandArgument(String shortOption) { + this.shortOption = shortOption; + } + + public static CommandArgument fromString(String option) { + for (CommandArgument arg : CommandArgument.values()) { + if (arg.shortOption.equals(option)) { + return arg; + } + } + return null; + } + + public String getShortOption() { + return shortOption; + } + } +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWSConsts.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWSConsts.java index 82ceed37c2d..533d521f759 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWSConsts.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWSConsts.java @@ -42,6 +42,13 @@ /** Path for {@code RMWebServiceProtocol#getClusterMetricsInfo}. */ public static final String METRICS = "/metrics"; + /** Path for {@code RMWebServices#getCommonIssueList}. */ + public static final String COMMON_ISSUE_LIST = "/common-issues/list"; + + /** Path for {@code RMWebServices#getCommonIssueData}. */ + public static final String COMMON_ISSUE_COLLECT = + "/common-issues/collect"; + /** Path for {@code RMWebServiceProtocol#getSchedulerInfo}.
*/ public static final String SCHEDULER = "/scheduler"; @@ -213,6 +220,8 @@ // ----------------QueryParams for RMWebServiceProtocol---------------- + public static final String ISSUEID = "issueId"; + public static final String ISSUEARGS = "args"; public static final String TIME = "time"; public static final String STATES = "states"; public static final String NODEID = "nodeId"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServiceProtocol.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServiceProtocol.java index f2736e3773c..652baa429f7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServiceProtocol.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServiceProtocol.java @@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.webapp; import java.io.IOException; +import java.util.List; import java.util.Set; import javax.servlet.http.HttpServletRequest; @@ -47,6 +48,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo; +import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LabelsToNodesInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeInfo; @@ -110,6 +112,25 @@ */ ClusterMetricsInfo getClusterMetricsInfo(); + /** + * This method retrieves the list of common, diagnosable issues, and it is + * reachable by using {@link RMWSConsts#COMMON_ISSUE_LIST}. + * + * @return the list of available diagnostic cases + */ + CommonIssues getCommonIssueList(); + + /** + * This method retrieves the diagnostic information for the selected issue, + * and it is reachable by using {@link RMWSConsts#COMMON_ISSUE_COLLECT}. + * + * @param issueId the selected issue's ID. It is a QueryParam. + * @param args the necessary arguments for diagnosing the issue. + * It is a QueryParam. + * @return the diagnostic information associated with the selected issue + */ + Response getCommonIssueData(String issueId, List<String> args); + /** * This method retrieves the current scheduler status, and it is reachable by * using {@link RMWSConsts#SCHEDULER}.
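For reference, the sketch below shows how a client might exercise the two new endpoints once they are wired into the RM web services. The path segments come from RMWSConsts (RM_WEB_SERVICE_PATH, COMMON_ISSUE_LIST, COMMON_ISSUE_COLLECT); the host, port, issue name and application ID are illustrative assumptions, not part of this patch.

```java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

/** Client-side sketch only; host, port and query values are assumptions. */
public class CommonIssuesClientSketch {

  private static String get(String url) throws IOException {
    HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
    conn.setRequestProperty("Accept", "application/json");
    try (BufferedReader in = new BufferedReader(new InputStreamReader(
        conn.getInputStream(), StandardCharsets.UTF_8))) {
      StringBuilder body = new StringBuilder();
      String line;
      while ((line = in.readLine()) != null) {
        body.append(line);
      }
      return body.toString();
    } finally {
      conn.disconnect();
    }
  }

  public static void main(String[] args) throws IOException {
    String rm = "http://rm-host:8088";  // assumed RM web address
    // GET <RM_WEB_SERVICE_PATH>/common-issues/list -> CommonIssues as JSON/XML
    System.out.println(get(rm + "/ws/v1/cluster/common-issues/list"));
    // GET <RM_WEB_SERVICE_PATH>/common-issues/collect?issueId=...&args=...
    // Repeating the "args" query parameter populates the List<String> args
    // parameter of getCommonIssueData.
    System.out.println(get(rm + "/ws/v1/cluster/common-issues/collect"
        + "?issueId=application_failed&args=application_1234567890123_0001"));
  }
}
```

Repeating a query parameter is the usual way a multi-valued JAX-RS @QueryParam maps onto a List<String>, which is why the collect endpoint takes its arguments that way.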
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServices.java index 7c4e5df5bc7..a5d4230a021 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServices.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServices.java @@ -132,6 +132,7 @@ import org.apache.hadoop.yarn.security.client.RMDelegationTokenIdentifier; import org.apache.hadoop.yarn.server.api.protocolrecords.UpdateNodeResourceRequest; import org.apache.hadoop.yarn.server.resourcemanager.AdminService; +import org.apache.hadoop.yarn.server.resourcemanager.DiagnosticsService; import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger; import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; @@ -171,6 +172,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo; +import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.FairSchedulerInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.FifoSchedulerInfo; @@ -212,6 +214,7 @@ import org.apache.hadoop.yarn.webapp.BadRequestException; import org.apache.hadoop.yarn.webapp.ForbiddenException; import org.apache.hadoop.yarn.webapp.NotFoundException; +import org.apache.hadoop.yarn.webapp.WebAppException; import org.apache.hadoop.yarn.webapp.util.WebAppUtils; import org.apache.hadoop.yarn.webapp.dao.ConfInfo; import org.apache.hadoop.yarn.webapp.dao.SchedConfUpdateInfo; @@ -385,6 +388,42 @@ public ClusterMetricsInfo getClusterMetricsInfo() { return new ClusterMetricsInfo(this.rm); } + @GET + @Path(RMWSConsts.COMMON_ISSUE_LIST) + @Produces({ MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8, + MediaType.APPLICATION_XML + "; " + JettyUtils.UTF_8 }) + @Override + public CommonIssues getCommonIssueList() { + initForReadableEndpoints(); + try { + return DiagnosticsService.listCommonIssues(); + } catch (Exception e) { + throw new WebAppException("Error collecting the common " + + "issue types. Error message: " + e.getMessage() + ". " + + "For more information please check the ResourceManager logs."); + } + } + + @GET + @Path(RMWSConsts.COMMON_ISSUE_COLLECT) + @Produces({ MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8, + MediaType.APPLICATION_XML + "; " + JettyUtils.UTF_8 }) + @Override + public Response getCommonIssueData( + @QueryParam(RMWSConsts.ISSUEID) String issueId, + @QueryParam(RMWSConsts.ISSUEARGS) List args) { + initForReadableEndpoints(); + try { + return Response.status(Status.OK) + .entity(DiagnosticsService.collectIssueData(issueId, args)) + .build(); + } catch (Exception e) { + throw new WebAppException("Error collecting the selected " + + "issue data. Error message: " + e.getMessage() + ". 
" + + "For more information please check the ResourceManager logs."); + } + } + @GET @Path(RMWSConsts.SCHEDULER) @Produces({ MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/CommonIssues.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/CommonIssues.java new file mode 100644 index 00000000000..5ccf36eba7b --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/CommonIssues.java @@ -0,0 +1,48 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.resourcemanager.webapp.dao; + +import javax.xml.bind.annotation.XmlAccessType; +import javax.xml.bind.annotation.XmlAccessorType; +import javax.xml.bind.annotation.XmlRootElement; +import java.util.ArrayList; +import java.util.List; + +@XmlRootElement(name = "commonIssues") +@XmlAccessorType(XmlAccessType.FIELD) +public class CommonIssues { + private List issue = new ArrayList<>(); + + public CommonIssues() {} + + public void add(IssueType type) { + if (type != null && + issue.stream().noneMatch(e-> e.getName().equals(type.getName()))) { + issue.add(type); + } + } + + public List getIssueList() { + return issue; + } + + public void addAll(List issueTypes) { + issue.addAll(issueTypes); + } +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/IssueType.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/IssueType.java new file mode 100644 index 00000000000..27f96a0ae2b --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/IssueType.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.resourcemanager.webapp.dao; + +import javax.xml.bind.annotation.XmlAccessType; +import javax.xml.bind.annotation.XmlAccessorType; +import javax.xml.bind.annotation.XmlRootElement; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +@XmlRootElement(name = "issue") +@XmlAccessorType(XmlAccessType.FIELD) +public class IssueType { + private String name; + private List<String> parameters = new ArrayList<>(); + + public IssueType() {} + + public IssueType(String name) { + this.name = name; + this.parameters = Collections.emptyList(); + } + + public IssueType(String name, List<String> parameters) { + this.name = name; + this.parameters = new ArrayList<>(parameters); + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public List<String> getParameters() { + return parameters; + } + + public void setParameters(List<String> parameters) { + this.parameters = parameters; + } +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/DiagnosticsServiceTest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/DiagnosticsServiceTest.java new file mode 100644 index 00000000000..b65aac8db61 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/DiagnosticsServiceTest.java @@ -0,0 +1,199 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.resourcemanager; + + +import org.apache.commons.collections.CollectionUtils; +import org.apache.hadoop.util.Shell; +import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues; +import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.IssueType; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static org.junit.Assert.fail; + + +public class DiagnosticsServiceTest { + private static final String ISSUE_NAME_APP_FAILED = "application_failed"; + private static final String ISSUE_NAME_APP_FAILED_NECESSARY_ARGS = + "application_failed_necessary_args"; + private static final String ISSUE_NAME_APP_HANGING = "application_hanging"; + private static final String ISSUE_NAME_SCHED_ISSUE = + "scheduler_related_issue"; + private static final String ISSUE_NAME_RM_NM_ISSUE = "rm_nm_start_failure"; + private static final String ISSUE_ARG_APP_ID = "appId"; + private static final String ISSUE_ARG_NODE_ID = "nodeId"; + private static final String COLON = ":"; + private static final String COMMA = ","; + private static final String OUTPUT_DIR = "/tmp"; + + @Before + public void setUp() { + DiagnosticsService.setScriptLocation("src/test/resources/diagnostics" + + "/diagnostics_collector_test.py"); + handleWindowsRuntime(); + } + + @Test + public void testListCommonIssuesValidCaseWithOptionsToBeSkipped() + throws Exception { + // The test script contains two invalid options: one with an ambiguous name + // and one with too many parameters. These should be skipped silently. 
+ CommonIssues commonIssues = DiagnosticsService.listCommonIssues(); + + Assert.assertEquals(4, commonIssues.getIssueList().size()); + assertIssueEquality(ISSUE_NAME_APP_FAILED, + Collections.singletonList(ISSUE_ARG_APP_ID), + commonIssues.getIssueList().get(0)); + + assertIssueEquality(ISSUE_NAME_APP_HANGING, + Arrays.asList(ISSUE_ARG_APP_ID, ISSUE_ARG_NODE_ID), + commonIssues.getIssueList().get(1)); + + assertIssueEquality(ISSUE_NAME_SCHED_ISSUE, + Collections.emptyList(), + commonIssues.getIssueList().get(2)); + + assertIssueEquality(ISSUE_NAME_RM_NM_ISSUE, + Collections.singletonList(ISSUE_ARG_NODE_ID), + commonIssues.getIssueList().get(3)); + } + + @Test(expected = IOException.class) + public void testListCommonIssuesScriptMissing() throws Exception { + DiagnosticsService.setScriptLocation("/src/invalidLocation/script.py"); + DiagnosticsService.listCommonIssues(); + } + + @Test + public void testCollectIssueDataValidOutput() throws Exception { + // valid case: the script prints out one directory + Assert.assertEquals(OUTPUT_DIR, DiagnosticsService.collectIssueData( + ISSUE_NAME_APP_FAILED, null)); + } + + @Test + public void testCollectIssueDataValidOutputWhenArgsArePresent() + throws Exception { + // valid case: appId and nodeId are necessary params and they are present + Assert.assertEquals(OUTPUT_DIR, DiagnosticsService.collectIssueData( + ISSUE_NAME_APP_FAILED_NECESSARY_ARGS, + Arrays.asList(ISSUE_ARG_APP_ID, ISSUE_ARG_NODE_ID))); + } + + @Test(expected = IOException.class) + public void testCollectIssueDataInvalidOutputWhenWrongArgsArePresent() + throws Exception { + // invalid case: appId and nodeId are necessary params but two appIds are + // given + Assert.assertEquals(OUTPUT_DIR, DiagnosticsService.collectIssueData( + ISSUE_NAME_APP_FAILED_NECESSARY_ARGS, + Arrays.asList(ISSUE_ARG_APP_ID, ISSUE_ARG_APP_ID))); + } + + @Test(expected = IOException.class) + public void testCollectIssueDataInvalidOutputEmptyDir() throws Exception { + // invalid case: the script prints out an empty string as directory + // with the correct prefix + DiagnosticsService.collectIssueData(ISSUE_NAME_APP_HANGING, null); + } + + @Test(expected = IOException.class) + public void testCollectIssueDataInvalidOutputMissingOutputDir() + throws Exception { + // invalid case: the script doesn't print out the correct output directory + DiagnosticsService.collectIssueData(ISSUE_NAME_SCHED_ISSUE, null); + } + + @Test(expected = IOException.class) + public void testCollectIssueDataInvalidOutputMissingPrints() + throws Exception { + // invalid case: the script doesn't print out anything + DiagnosticsService.collectIssueData(ISSUE_NAME_RM_NM_ISSUE, null); + } + + @Test(expected = IOException.class) + public void testCollectIssueDataScriptMissing() throws Exception { + DiagnosticsService.setScriptLocation("/src/invalidLocation/script.py"); + DiagnosticsService.collectIssueData(ISSUE_NAME_APP_FAILED, null); + } + + @Test + public void testParseIssueTypeValidCases() { + // valid case: name, no parameters + String line = ISSUE_NAME_APP_FAILED; + + assertIssueEquality(ISSUE_NAME_APP_FAILED, Collections.emptyList(), + DiagnosticsService.parseIssueType(line)); + + // valid case: name, one parameter + line = ISSUE_NAME_APP_FAILED + COLON + ISSUE_ARG_APP_ID; + + assertIssueEquality(ISSUE_NAME_APP_FAILED, + Collections.singletonList(ISSUE_ARG_APP_ID), + DiagnosticsService.parseIssueType(line)); + + // valid case: name, two parameters + line = ISSUE_NAME_APP_FAILED + COLON + ISSUE_ARG_APP_ID + + COMMA + ISSUE_ARG_NODE_ID; + +
assertIssueEquality(ISSUE_NAME_APP_FAILED, + Arrays.asList(ISSUE_ARG_APP_ID, ISSUE_ARG_NODE_ID), + DiagnosticsService.parseIssueType(line)); + } + + @Test + public void testParseIssueTypeInvalidCases() { + // invalid case: too many values + String line = ISSUE_NAME_APP_FAILED + COLON + ISSUE_NAME_APP_FAILED + + COLON + ISSUE_NAME_APP_FAILED; + + IssueType issueType = DiagnosticsService.parseIssueType(line); + Assert.assertNull(issueType); + } + + private void assertIssueEquality(String expectedIssueName, + List expectedParams, + IssueType actualIssue) { + Assert.assertEquals(expectedIssueName, + actualIssue.getName()); + Assert.assertEquals(expectedParams.size(), + actualIssue.getParameters().size()); + Assert.assertTrue(CollectionUtils.isEqualCollection( + expectedParams, actualIssue.getParameters())); + } + + private void handleWindowsRuntime() { + if (Shell.WINDOWS) { + try { + DiagnosticsService.listCommonIssues(); + fail("On Windows listCommonIssues should throw " + + "UnsupportedOperationException"); + } catch (Exception e) { + // Exception is expected + } + } + } +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/resources/diagnostics/diagnostics_collector_test.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/resources/diagnostics/diagnostics_collector_test.py new file mode 100644 index 00000000000..0a81261cd8a --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/resources/diagnostics/diagnostics_collector_test.py @@ -0,0 +1,79 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import print_function + +import argparse +import os +import sys + + +def application_failed(): + # Prints the output directory correctly + print("application_failed") + print("out_dir:/tmp") + + +def application_failed_necessary_args(): + # Prints the output directory only if "appId" and "nodeId" are present + if args.arguments is None or len(args.arguments) != 2: + sys.exit(os.EX_USAGE) + elif args.arguments[0] == "appId" and args.arguments[1] == "nodeId": + print("out_dir:/tmp") + + +def application_hanging(): + # Prints an empty output directory + print("application_hanging") + print("out_dir:") + + +def scheduler_related_issue(): + # Doesn't print the output directory + print("scheduler_related_issue") + + +def rm_nm_start_failure(): + sys.exit(os.EX_OK) + + +def list_issues(): + print("application_failed:appId", "application_hanging:appId,nodeId", "scheduler_related_issue", + "rm_nm_start_failure:nodeId", "rm_nm_start_failure:nodeId", "rm_nm_start_failure_1:nodeId:nodeId", sep="\n") + + +ISSUE_MAP = { + "application_failed": application_failed, + "application_failed_necessary_args": application_failed_necessary_args, + "application_hanging": application_hanging, + "scheduler_related_issue": scheduler_related_issue, + "rm_nm_start_failure": rm_nm_start_failure +} + +parser = argparse.ArgumentParser() +parser.add_argument("-l", "--list", help="List the available issue types.", action="store_true") +parser.add_argument("-c", "--command", choices=list(ISSUE_MAP), help="Initiate the diagnostic information collection " + "for diagnosing the selected issue type.") +parser.add_argument("-a", "--arguments", nargs='*', help="The required arguments for the selected issue type.") +args = parser.parse_args() + +if not (args.list or args.command): + parser.error('No action requested, use --list or --command') + +if args.list: + list_issues() + sys.exit(os.EX_OK) + +func = ISSUE_MAP[args.command] +func() \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/DefaultRequestInterceptorREST.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/DefaultRequestInterceptorREST.java index 00a8beb6684..9fcb41ba0ee 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/DefaultRequestInterceptorREST.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/DefaultRequestInterceptorREST.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Set; @@ -47,6 +48,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo; +import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LabelsToNodesInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeInfo; @@ -128,6 +130,22 @@ public ClusterMetricsInfo getClusterMetricsInfo() { getConf()); } + @Override + public CommonIssues getCommonIssueList() { + return
RouterWebServiceUtil.genericForward(webAppAddress, null, + CommonIssues.class, HTTPMethods.GET, + RMWSConsts.RM_WEB_SERVICE_PATH + RMWSConsts.COMMON_ISSUE_LIST, + null, null, getConf()); + } + + @Override + public Response getCommonIssueData(String issueId, List args) { + return RouterWebServiceUtil.genericForward(webAppAddress, null, + Response.class, HTTPMethods.GET, + RMWSConsts.RM_WEB_SERVICE_PATH + RMWSConsts.COMMON_ISSUE_COLLECT, + null, null, getConf()); + } + @Override public SchedulerTypeInfo getSchedulerInfo() { return RouterWebServiceUtil.genericForward(webAppAddress, null, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/FederationInterceptorREST.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/FederationInterceptorREST.java index ab97b1a7fd9..c5e90c48f58 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/FederationInterceptorREST.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/FederationInterceptorREST.java @@ -75,6 +75,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo; +import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LabelsToNodesInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeInfo; @@ -1131,6 +1132,16 @@ public ClusterUserInfo getClusterUserInfo(HttpServletRequest hsr) { throw new NotImplementedException("Code is not implemented"); } + @Override + public CommonIssues getCommonIssueList() { + throw new NotImplementedException("Code is not implemented"); + } + + @Override + public Response getCommonIssueData(String issueId, List args) { + throw new NotImplementedException("Code is not implemented"); + } + @Override public SchedulerTypeInfo getSchedulerInfo() { throw new NotImplementedException("Code is not implemented"); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/RouterWebServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/RouterWebServices.java index bde46484d6b..fb9f7cf1832 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/RouterWebServices.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/RouterWebServices.java @@ -70,6 +70,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo; +import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues; import 
org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LabelsToNodesInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeInfo; @@ -356,6 +357,30 @@ public ClusterMetricsInfo getClusterMetricsInfo() { return pipeline.getRootInterceptor().getClusterMetricsInfo(); } + @GET + @Path(RMWSConsts.COMMON_ISSUE_LIST) + @Produces({ MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8, + MediaType.APPLICATION_XML + "; " + JettyUtils.UTF_8 }) + @Override + public CommonIssues getCommonIssueList() { + init(); + RequestInterceptorChainWrapper pipeline = getInterceptorChain(null); + return pipeline.getRootInterceptor().getCommonIssueList(); + } + + @GET + @Path(RMWSConsts.COMMON_ISSUE_COLLECT) + @Produces({ MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8, + MediaType.APPLICATION_XML + "; " + JettyUtils.UTF_8 }) + @Override + public Response getCommonIssueData( + @QueryParam(RMWSConsts.ISSUEID) String issueId, + @QueryParam(RMWSConsts.ISSUEARGS) List args) { + init(); + RequestInterceptorChainWrapper pipeline = getInterceptorChain(null); + return pipeline.getRootInterceptor().getCommonIssueData(issueId, args); + } + @GET @Path(RMWSConsts.SCHEDULER) @Produces({ MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/MockRESTRequestInterceptor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/MockRESTRequestInterceptor.java index 67c9d671fb1..56b790be941 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/MockRESTRequestInterceptor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/MockRESTRequestInterceptor.java @@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.router.webapp; import java.io.IOException; +import java.util.List; import java.util.Set; import javax.servlet.http.HttpServletRequest; @@ -44,6 +45,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo; +import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LabelsToNodesInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeInfo; @@ -96,6 +98,16 @@ public ClusterMetricsInfo getClusterMetricsInfo() { return new ClusterMetricsInfo(); } + @Override // TODO these may need to be edited for testing purposes + public CommonIssues getCommonIssueList() { + return new CommonIssues(); + } + + @Override // TODO these may need to be edited for testing purposes + public Response getCommonIssueData(String issueId, List args) { + return Response.ok().build(); + } + @Override public SchedulerTypeInfo getSchedulerInfo() { return new SchedulerTypeInfo(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/PassThroughRESTRequestInterceptor.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/PassThroughRESTRequestInterceptor.java index 142a6511b93..a77841c64d6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/PassThroughRESTRequestInterceptor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/PassThroughRESTRequestInterceptor.java @@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.router.webapp; import java.io.IOException; +import java.util.List; import java.util.Set; import javax.servlet.http.HttpServletRequest; @@ -42,6 +43,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo; +import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LabelsToNodesInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeInfo; @@ -121,6 +123,16 @@ public ClusterMetricsInfo getClusterMetricsInfo() { return getNextInterceptor().getClusterMetricsInfo(); } + @Override + public CommonIssues getCommonIssueList() { + return getNextInterceptor().getCommonIssueList(); + } + + @Override + public Response getCommonIssueData(String issueId, List args) { + return getNextInterceptor().getCommonIssueData(issueId, args); + } + @Override public SchedulerTypeInfo getSchedulerInfo() { return getNextInterceptor().getSchedulerInfo();
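Taken together, the patch relies on a simple stdout contract between DiagnosticsService and the collector script: the -l option prints one "<issueName>:<param1>,<param2>" line per issue (parsed by parseIssueType), and "-c <issueId> -a <args...>" must print an "out_dir:<path>" line whose path is handed back to the REST caller. Below is a minimal server-side sketch of that flow, mirroring the unit tests above; the default /tmp/diagnostics_collector.py location comes from the patch, while the issue name and application ID are illustrative assumptions.

```java
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.yarn.server.resourcemanager.DiagnosticsService;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.IssueType;

public class DiagnosticsFlowSketch {
  public static void main(String[] args) throws Exception {
    // Runs "python /tmp/diagnostics_collector.py -l" and parses each
    // "<issueName>:<param1>,<param2>" stdout line into an IssueType.
    CommonIssues issues = DiagnosticsService.listCommonIssues();
    for (IssueType issue : issues.getIssueList()) {
      System.out.println(issue.getName() + " -> " + issue.getParameters());
    }

    // Runs "python /tmp/diagnostics_collector.py -c <issueId> -a <args...>";
    // the script must print "out_dir:<path>", and that path is returned.
    List<String> collectorArgs =
        Collections.singletonList("application_1234567890123_0001");
    String outDir = DiagnosticsService.collectIssueData(
        "application_failed", collectorArgs);
    System.out.println("Diagnostics written to: " + outDir);
  }
}
```

Since DiagnosticsService is annotated @InterfaceAudience.Private, this sketch only illustrates the script contract used internally by the REST layer; it is not a suggestion to call the class from outside the ResourceManager.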