Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java	(revision 4620)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java	(working copy)
@@ -50,6 +50,8 @@
 import org.apache.hadoop.hive.ql.plan.api.StageType;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
+import org.apache.hadoop.hive.ql.exec.errors.Error;
+import org.apache.hadoop.hive.ql.exec.errors.TaskLogProcessor;
 import org.apache.hadoop.hive.ql.history.HiveHistory.Keys;
 import org.apache.hadoop.hive.ql.io.*;
 import org.apache.hadoop.hive.ql.QueryPlan;
@@ -655,16 +657,54 @@
     return (returnVal);
   }
 
+  private String getTaskAttemptLogUrl(String taskTrackerHttpAddress,
+      String taskAttemptId) {
+    return taskTrackerHttpAddress + "/tasklog?taskid=" + taskAttemptId
+        + "&all=true";
+  }
+
+  // Used for showJobFailDebugInfo
+  private static class TaskInfo {
+    String jobId;
+    HashSet<String> logUrls;
+    boolean isMapTask;
+
+    public TaskInfo(String jobId, boolean isMapTask) {
+      this.jobId = jobId;
+      this.isMapTask = isMapTask;
+      logUrls = new HashSet<String>();
+    }
+
+    public void addLogUrl(String logUrl) {
+      logUrls.add(logUrl);
+    }
+
+    public boolean getIsMapTask() {
+      return isMapTask;
+    }
+
+    public HashSet<String> getLogUrls() {
+      return logUrls;
+    }
+
+    public String getJobId() {
+      return jobId;
+    }
+  }
+
+  @SuppressWarnings("deprecation")
   private void showJobFailDebugInfo(JobConf conf, RunningJob rj) throws IOException {
+    // Mapping from task ID to the number of failures
     Map<String, Integer> failures = new HashMap<String, Integer>();
+    // Successful task IDs
     Set<String> successes = new HashSet<String>();
-    Map<String, String> taskToJob = new HashMap<String, String>();
-
+
+    Map<String, TaskInfo> taskIdToInfo = new HashMap<String, TaskInfo>();
+
     int startIndex = 0;
+
+    // Loop to get all task completion events because getTaskCompletionEvents
+    // only returns a subset per call
     while(true) {
-      TaskCompletionEvent[] taskCompletions = rj.getTaskCompletionEvents(startIndex);
+      TaskCompletionEvent[] taskCompletions =
+        rj.getTaskCompletionEvents(startIndex);
 
       if(taskCompletions == null || taskCompletions.length == 0) {
         break;
@@ -672,20 +712,34 @@
       boolean more = true;
       for(TaskCompletionEvent t : taskCompletions) {
-        // getTaskJobIDs return Strings for compatibility with Hadoop version without
-        // TaskID or TaskAttemptID
+        // getTaskJobIDs returns Strings for compatibility with Hadoop
+        // versions without TaskID or TaskAttemptID
         String [] taskJobIds = ShimLoader.getHadoopShims().getTaskJobIDs(t);
-
         if(taskJobIds == null) {
-          console.printError("Task attempt info is unavailable in this Hadoop version");
+          console.printError("Task attempt info is unavailable in " +
+              "this Hadoop version");
           more = false;
           break;
         }
+
+        // For each task completion event, get the associated task ID,
+        // job ID, and the logs
         String taskId = taskJobIds[0];
         String jobId = taskJobIds[1];
-        taskToJob.put(taskId, jobId);
+
+        TaskInfo ti = taskIdToInfo.get(taskId);
+        if(ti == null) {
+          ti = new TaskInfo(jobId, t.isMapTask());
+          taskIdToInfo.put(taskId, ti);
+        }
+        // These task attempts should all have come from the same job
+        assert(ti.getJobId().equals(jobId));
+        ti.getLogUrls().add(getTaskAttemptLogUrl(t.getTaskTrackerHttp(),
+            t.getTaskId()));
+
+        // If a task failed, then keep track of the total number of failures
+        // for that task (typically, a task gets re-run up to 4 times if it
+        // fails)
         if(t.getTaskStatus() != TaskCompletionEvent.Status.SUCCEEDED) {
           Integer failAttempts = failures.get(taskId);
           if(failAttempts == null) {
@@ -719,14 +773,41 @@
     }
 
     // Display Error Message for tasks with the highest failure count
-    console.printError("\nFailed tasks with most" + "(" + maxFailures + ")" + " failures " + ": ");
     String jtUrl = JobTrackerURLResolver.getURL(conf);
 
     for(String task : failures.keySet()) {
       if(failures.get(task).intValue() == maxFailures) {
-        String jobId = taskToJob.get(task);
-        String taskUrl = jtUrl + "/taskdetails.jsp?jobid=" + jobId + "&tipid=" + task.toString();
-        console.printError("Task URL: " + taskUrl +"\n");
+        TaskInfo ti = taskIdToInfo.get(task);
+        String jobId = ti.getJobId();
+        String taskUrl = jtUrl + "/taskdetails.jsp?jobid=" + jobId
+            + "&tipid=" + task.toString();
+
+        TaskLogProcessor tlp = new TaskLogProcessor(conf, ti.getIsMapTask());
+        for(String logUrl : ti.getLogUrls()) {
+          tlp.addTaskLogUrl(logUrl);
+        }
+
+        List<Error> errors = tlp.getErrors();
+
+        StringBuilder sb = new StringBuilder();
+        // Use a StringBuilder and call printError only once, as printError
+        // writes to both stderr and the error log file. In situations where
+        // both the stderr and the log file output are merged into a single
+        // stream, this looks cleaner.
+        sb.append("\n");
+        sb.append("Task with the most failures(" + maxFailures + "): \n");
+        sb.append("-----\n");
+        sb.append("Task ID:\n  " + task + "\n");
+        sb.append("URL:\n  " + taskUrl + "\n");
+
+        for(Error e : errors) {
+          sb.append("\n");
+          sb.append("Possible error:\n  " + e.getError() + "\n\n");
+          sb.append("Solution:\n  " + e.getSolution() + "\n");
+        }
+        sb.append("-----\n");
+
+        console.printError(sb.toString());
 
         // Only print out one task because that's good enough for debugging.
         break;
       }
+ */ + +package org.apache.hadoop.hive.ql.exec.errors; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.mapred.JobConf; + +/** + * TaskLogProcessor reads the logs from failed tasks and attempts to figure out + * what the cause of the error was using various heuristics + */ +public class TaskLogProcessor { + + List heuristics = new ArrayList(); + List taskLogUrls = new ArrayList(); + + JobConf conf = null; + // Query is the hive query string i.e. "SELECT * FROM src;" associated with + // this set of tasks logs + String query = null; + + public TaskLogProcessor(JobConf conf, boolean isMapTask) { + this.conf = conf; + query = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYSTRING); + + heuristics.add(new ScriptErrorHeuristic()); + heuristics.add(new MapAggrMemErrorHeuristic()); + + for(ErrorHeuristic e : heuristics) { + e.init(query, conf, isMapTask); + } + } + + /** + * Adds a task log URL for the heuristics to read through + * @param url + */ + public void addTaskLogUrl(String url) { + taskLogUrls.add(url); + } + + public void addTaskLogUrls(Collection c) { + for(String s : c) { + addTaskLogUrl(s); + } + } + + /** + * Processes the provided task logs using the known error heuristics to get + * the matching errors + * @return Errors that may have caused the query to fail + */ + public List getErrors() { + List errors = new ArrayList(); + + URL taskLogUrl; + + // Read read the lines from all the task logs and feed them to all the + // error heuristics + for(String urlString : taskLogUrls) { + for(ErrorHeuristic e : heuristics) { + e.reset(); + } + try { + taskLogUrl = new URL(urlString); + } catch(MalformedURLException e) { + throw new RuntimeException("Bad task log url", e); + } + BufferedReader in; + try { + in = new BufferedReader( + new InputStreamReader(taskLogUrl.openStream())); + String inputLine; + while ((inputLine = in.readLine()) != null) { + for(ErrorHeuristic e : heuristics) { + e.readLogLine(inputLine); + } + } + in.close(); + } catch (IOException e) { + throw new RuntimeException("Error while reading from task log url", e); + } + + // Then see if the heuristics have detected anything. 
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/errors/RegexErrorHeuristic.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/errors/RegexErrorHeuristic.java	(revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/errors/RegexErrorHeuristic.java	(revision 0)
@@ -0,0 +1,64 @@
+package org.apache.hadoop.hive.ql.exec.errors;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.mapred.JobConf;
+
+/**
+ * Simple heuristic where the query and the lines of the task log file are
+ * run through regular expressions to see if they resemble a known error
+ * condition.
+ */
+public abstract class RegexErrorHeuristic implements ErrorHeuristic {
+
+  String query = null;
+  JobConf conf = null;
+
+  // The pattern to look for in the Hive query, and whether it matched
+  Pattern queryPattern = null;
+  boolean queryMatches = false;
+
+  // The pattern to look for in the task log lines, and whether a match has
+  // been seen so far
+  Pattern logPattern = null;
+  boolean logMatches = false;
+
+  // Whether the log files came from a map task. If false, they came from a
+  // reduce task.
+  boolean isMapTask;
+
+  ArrayList<Error> errors = new ArrayList<Error>();
+
+  public RegexErrorHeuristic() {
+  }
+
+  public RegexErrorHeuristic(String queryRegex, String logRegex) {
+    queryPattern = Pattern.compile(queryRegex, Pattern.CASE_INSENSITIVE);
+    logPattern = Pattern.compile(logRegex, Pattern.CASE_INSENSITIVE);
+  }
+
+  @Override
+  public void init(String query, JobConf conf, boolean isMapTask) {
+    this.query = query;
+    this.conf = conf;
+    this.isMapTask = isMapTask;
+    queryMatches = queryPattern.matcher(query).find();
+  }
+
+  @Override
+  abstract public List<Error> getErrors();
+
+  @Override
+  public void readLogLine(String line) {
+    if(queryMatches && !logMatches) {
+      logMatches = logPattern.matcher(line).find();
+    }
+  }
+
+  @Override
+  public void reset() {
+    logMatches = false;
+  }
+
+}
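To make the extension point concrete, here is a sketch of a hypothetical heuristic built on RegexErrorHeuristic. It is not part of this patch; the class name and regexes are illustrative only.

package org.apache.hadoop.hive.ql.exec.errors;

import java.util.List;

/**
 * Hypothetical heuristic (illustration only, not in this patch): matches
 * queries that use ADD JAR whose tasks logged a ClassNotFoundException.
 */
public class ClassNotFoundErrorHeuristic extends RegexErrorHeuristic {

  public ClassNotFoundErrorHeuristic() {
    // (query regex, task log regex), both compiled case-insensitively
    super("add jar", "ClassNotFoundException");
  }

  @Override
  public List<Error> getErrors() {
    errors.clear();
    if (queryMatches && logMatches) {
      errors.add(new Error(
          "A class needed by the job was not found on the task nodes.",
          "Check that every file named in ADD JAR exists and is readable."));
    }
    return errors;
  }
}

A heuristic like this would be registered in the TaskLogProcessor constructor alongside ScriptErrorHeuristic and MapAggrMemErrorHeuristic.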
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/errors/MapAggrMemErrorHeuristic.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/errors/MapAggrMemErrorHeuristic.java	(revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/errors/MapAggrMemErrorHeuristic.java	(revision 0)
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.errors;
+
+import java.util.List;
+
+import org.apache.hadoop.hive.conf.HiveConf;
+
+/**
+ * Detects map tasks that ran out of memory because of the hash maps used in
+ * map-side aggregation.
+ */
+public class MapAggrMemErrorHeuristic extends RegexErrorHeuristic {
+
+  public MapAggrMemErrorHeuristic() {
+    super("group by", "OutOfMemoryError");
+  }
+
+  @Override
+  public List<Error> getErrors() {
+    if (queryMatches && logMatches && isMapTask) {
+
+      String confName = HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY.toString();
+      float confValue = HiveConf.getFloatVar(conf,
+          HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
+
+      Error e = new Error(
+          "Out of memory due to hash maps used in map-side aggregation",
+          "Currently " + confName + " is set to " + confValue + ". "
+          + "Try setting it to a lower value, e.g. "
+          + "'set " + confName + " = " + confValue / 2 + ";'");
+      errors.add(e);
+    }
+    return errors;
+  }
+
+}
+ */ + void reset(); + + /** + * Examine the hive query, job configuration, and the lines from the task log + * seen so far and generate a possible cause / solution + * @return all matching errors + * + */ + List getErrors(); +} Index: ql/src/java/org/apache/hadoop/hive/ql/exec/errors/Error.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/errors/Error.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/errors/Error.java (revision 0) @@ -0,0 +1,42 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.errors; + +/** + * Immutable class for storing a possible error and a way to resolve the error + */ +public class Error { + + String error = null; + String solution = null; + + Error(String error, String solution) { + this.error = error; + this.solution = solution; + } + + public String getError() { + return error; + } + + public String getSolution() { + return solution; + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/exec/errors/ScriptErrorHeuristic.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/errors/ScriptErrorHeuristic.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/errors/ScriptErrorHeuristic.java (revision 0) @@ -0,0 +1,50 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.errors; + +import java.util.List; + +/** + * Detects when a query has failed because a user's script that was specified in + * transform returns non-zero error code. 
+ */ + +public class ScriptErrorHeuristic extends RegexErrorHeuristic { + + public ScriptErrorHeuristic() { + super(".*", "Script failed with code"); + } + + @Override + public List getErrors() { + errors.clear(); + + if(queryMatches && logMatches) { + Error e = new Error( + "A user-supplied transfrom script has failed to exit with a 0 " + + "error code.", + "Verify that the script can properly handle all the input rows " + + "without throwing exceptions and exits properly"); + errors.add(e); + } + + return errors; + } + +}