Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ConditionalTask.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/ConditionalTask.java (revision 1073470) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/ConditionalTask.java (working copy) @@ -84,7 +84,7 @@ for (Task tsk : getListTasks()) { if (!resTasks.contains(tsk)) { driverContext.getRunnable().remove(tsk); - console.printInfo(ExecDriver.getJobEndMsg("" + Utilities.randGen.nextInt()) + console.printInfo(HadoopJobExecHelper.getJobEndMsg("" + Utilities.randGen.nextInt()) + ", job is filtered out (removed at runtime)."); if (tsk.isMapRedTask()) { driverContext.incCurJobNo(1); Index: ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java (revision 1073798) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java (working copy) @@ -75,6 +75,9 @@ import org.apache.hadoop.hive.ql.QueryPlan; import org.apache.hadoop.hive.ql.hooks.ReadEntity; import org.apache.hadoop.hive.ql.hooks.WriteEntity; +import org.apache.hadoop.hive.ql.io.RCFileInputFormat; +import org.apache.hadoop.hive.ql.io.rcfile.merge.BlockMergeTask; +import org.apache.hadoop.hive.ql.io.rcfile.merge.MergeWork; import org.apache.hadoop.hive.ql.lockmgr.HiveLock; import org.apache.hadoop.hive.ql.lockmgr.HiveLockManager; import org.apache.hadoop.hive.ql.lockmgr.HiveLockMode; @@ -89,6 +92,7 @@ import org.apache.hadoop.hive.ql.metadata.MetaDataFormatUtils; import org.apache.hadoop.hive.ql.metadata.Partition; import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.parse.AlterTablePartMergeFilesDesc; import org.apache.hadoop.hive.ql.plan.AddPartitionDesc; import org.apache.hadoop.hive.ql.plan.AlterDatabaseDesc; import org.apache.hadoop.hive.ql.plan.AlterIndexDesc; @@ -138,6 +142,7 @@ import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; import org.apache.hadoop.hive.shims.HadoopShims; import org.apache.hadoop.hive.shims.ShimLoader; +import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.util.ToolRunner; /** @@ -350,6 +355,11 @@ if (showIndexes != null) { return showIndexes(db, showIndexes); } + + AlterTablePartMergeFilesDesc mergeFilesDesc = work.getMergeFilesDesc(); + if(mergeFilesDesc != null) { + return mergeFiles(db, mergeFilesDesc); + } } catch (InvalidTableException e) { console.printError("Table " + e.getTableName() + " does not exist"); @@ -369,6 +379,33 @@ return 0; } + /** + * First, make sure the source table/partition is not + * archived/indexes/non-rcfile. If either of these is true, throw an + * exception. + * + * The way how it does the merge is to create a BlockMergeTask from the + * mergeFilesDesc. + * + * @param db + * @param mergeFilesDesc + * @return + * @throws HiveException + */ + private int mergeFiles(Hive db, AlterTablePartMergeFilesDesc mergeFilesDesc) + throws HiveException { + // merge work only needs input and output. 
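+    // Build the merge work from the source directory list and the target
+    // directory, then run it synchronously as a BlockMergeTask under a fresh
+    // DriverContext; the task's return code is propagated to the caller.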
+ MergeWork mergeWork = new MergeWork(mergeFilesDesc.getInputDir(), + mergeFilesDesc.getOutputDir()); + DriverContext driverCxt = new DriverContext(); + BlockMergeTask taskExec = new BlockMergeTask(); + taskExec.initialize(db.getConf(), null, driverCxt); + taskExec.setWork(mergeWork); + int ret = taskExec.execute(driverCxt); + + return ret; + } + private int grantOrRevokeRole(GrantRevokeRoleDDL grantOrRevokeRoleDDL) throws HiveException { try { @@ -894,23 +931,6 @@ return 0; } - /** - * Determines whether a partition has been archived - * - * @param p - * @return - */ - - private boolean isArchived(Partition p) { - Map params = p.getParameters(); - if ("true".equalsIgnoreCase(params.get( - org.apache.hadoop.hive.metastore.api.Constants.IS_ARCHIVED))) { - return true; - } else { - return false; - } - } - private void setIsArchived(Partition p, boolean state) { Map params = p.getParameters(); if (state) { @@ -958,7 +978,7 @@ */ private void setArchived(Partition p, Path parentDir, String dirInArchive, String archiveName) throws URISyntaxException { - assert(isArchived(p) == false); + assert(Utilities.isArchived(p) == false); Map params = p.getParameters(); URI parentUri = parentDir.toUri(); @@ -996,7 +1016,7 @@ * @param p - the partition to modify */ private void setUnArchived(Partition p) { - assert(isArchived(p) == true); + assert(Utilities.isArchived(p) == true); String parentDir = getOriginalLocation(p); setIsArchived(p, false); setOriginalLocation(p, null); @@ -1051,7 +1071,7 @@ throw new HiveException("Specified partition does not exist"); } - if (isArchived(p)) { + if (Utilities.isArchived(p)) { // If there were a failure right after the metadata was updated in an // archiving operation, it's possible that the original, unarchived files // weren't deleted. 
@@ -1236,7 +1256,7 @@ throw new HiveException("Specified partition does not exist"); } - if (!isArchived(p)) { + if (!Utilities.isArchived(p)) { Path location = new Path(p.getLocation()); Path leftOverArchiveDir = new Path(location.getParent(), location.getName() + INTERMEDIATE_ARCHIVED_DIR_SUFFIX); Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java (revision 1073470) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java (working copy) @@ -28,13 +28,9 @@ import java.net.URL; import java.net.URLDecoder; import java.net.URLEncoder; -import java.text.SimpleDateFormat; import java.util.ArrayList; -import java.util.Calendar; import java.util.Collections; import java.util.Enumeration; -import java.util.HashMap; -import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -56,10 +52,6 @@ import org.apache.hadoop.hive.ql.DriverContext; import org.apache.hadoop.hive.ql.QueryPlan; import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; -import org.apache.hadoop.hive.ql.exec.Operator.ProgressCounter; -import org.apache.hadoop.hive.ql.exec.errors.ErrorAndSolution; -import org.apache.hadoop.hive.ql.exec.errors.TaskLogProcessor; -import org.apache.hadoop.hive.ql.history.HiveHistory.Keys; import org.apache.hadoop.hive.ql.io.HiveKey; import org.apache.hadoop.hive.ql.io.HiveOutputFormat; import org.apache.hadoop.hive.ql.io.IOPrepareCache; @@ -85,7 +77,6 @@ import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Partitioner; import org.apache.hadoop.mapred.RunningJob; -import org.apache.hadoop.mapred.TaskCompletionEvent; import org.apache.hadoop.mapred.TaskReport; import org.apache.log4j.Appender; import org.apache.log4j.BasicConfigurator; @@ -98,30 +89,22 @@ * ExecDriver. * */ -public class ExecDriver extends Task implements Serializable { +public class ExecDriver extends Task implements Serializable, HadoopJobExecHook { private static final long serialVersionUID = 1L; protected transient JobConf job; - protected transient int mapProgress = 0; - protected transient int reduceProgress = 0; - public transient String jobId; - - public String getJobId() { - return jobId; - } - - public void setJobId(String jobId) { - this.jobId = jobId; - } - public static MemoryMXBean memoryMXBean; + protected HadoopJobExecHelper jobExecHelper; /** * Constructor when invoked from QL. */ public ExecDriver() { super(); + LOG = LogFactory.getLog(this.getClass().getName()); + console = new LogHelper(LOG); + this.jobExecHelper = new HadoopJobExecHelper(job, console, this, this); } protected static String getResourceFiles(Configuration conf, SessionState.ResourceType t) { @@ -179,6 +162,7 @@ if (StringUtils.isNotBlank(addedArchives)) { HiveConf.setVar(job, ConfVars.HIVEADDEDARCHIVES, addedArchives); } + this.jobExecHelper = new HadoopJobExecHelper(job, console, this, this); } /** @@ -189,101 +173,7 @@ this.job = job; LOG = LogFactory.getLog(this.getClass().getName()); console = new LogHelper(LOG, isSilent); - } - - /** - * A list of the currently running jobs spawned in this Hive instance that is used to kill all - * running jobs in the event of an unexpected shutdown - i.e., the JVM shuts down while there are - * still jobs running. 
- */ - private static Map runningJobKillURIs = Collections - .synchronizedMap(new HashMap()); - - /** - * In Hive, when the user control-c's the command line, any running jobs spawned from that command - * line are best-effort killed. - * - * This static constructor registers a shutdown thread to iterate over all the running job kill - * URLs and do a get on them. - * - */ - static { - if (new org.apache.hadoop.conf.Configuration() - .getBoolean("webinterface.private.actions", false)) { - Runtime.getRuntime().addShutdownHook(new Thread() { - @Override - public void run() { - synchronized (runningJobKillURIs) { - for (String uri : runningJobKillURIs.values()) { - try { - System.err.println("killing job with: " + uri); - java.net.HttpURLConnection conn = (java.net.HttpURLConnection) new java.net.URL(uri) - .openConnection(); - conn.setRequestMethod("POST"); - int retCode = conn.getResponseCode(); - if (retCode != 200) { - System.err.println("Got an error trying to kill job with URI: " + uri + " = " - + retCode); - } - } catch (Exception e) { - System.err.println("trying to kill job, caught: " + e); - // do nothing - } - } - } - } - }); - } - } - - /** - * from StreamJob.java. - */ - private void jobInfo(RunningJob rj) { - if (job.get("mapred.job.tracker", "local").equals("local")) { - console.printInfo("Job running in-process (local Hadoop)"); - } else { - String hp = job.get("mapred.job.tracker"); - if (SessionState.get() != null) { - SessionState.get().getHiveHistory().setTaskProperty(SessionState.get().getQueryId(), - getId(), Keys.TASK_HADOOP_ID, rj.getJobID()); - } - console.printInfo(ExecDriver.getJobStartMsg(rj.getJobID()) + ", Tracking URL = " - + rj.getTrackingURL()); - console.printInfo("Kill Command = " + HiveConf.getVar(job, HiveConf.ConfVars.HADOOPBIN) - + " job -Dmapred.job.tracker=" + hp + " -kill " + rj.getJobID()); - } - } - - /** - * This class contains the state of the running task Going forward, we will return this handle - * from execute and Driver can split execute into start, monitorProgess and postProcess. - */ - private static class ExecDriverTaskHandle extends TaskHandle { - JobClient jc; - RunningJob rj; - - JobClient getJobClient() { - return jc; - } - - RunningJob getRunningJob() { - return rj; - } - - public ExecDriverTaskHandle(JobClient jc, RunningJob rj) { - this.jc = jc; - this.rj = rj; - } - - public void setRunningJob(RunningJob job) { - rj = job; - } - - @Override - public Counters getCounters() throws IOException { - return rj.getCounters(); - } + this.jobExecHelper = new HadoopJobExecHelper(job, console, this, this); } /** @@ -296,20 +186,7 @@ * * @return true if fatal errors happened during job execution, false otherwise. */ - private boolean checkFatalErrors(Counters ctrs, StringBuilder errMsg) { - if (ctrs == null) { - // hadoop might return null if it cannot locate the job. 
- // we may still be able to retrieve the job status - so ignore - return false; - } - // check for number of created files - long numFiles = ctrs.getCounter(ProgressCounter.CREATED_FILES); - long upperLimit = HiveConf.getLongVar(job, HiveConf.ConfVars.MAXCREATEDFILES); - if (numFiles > upperLimit) { - errMsg.append("total number of created files exceeds ").append(upperLimit); - return true; - } - + public boolean checkFatalErrors(Counters ctrs, StringBuilder errMsg) { for (Operator op : work.getAliasToWork().values()) { if (op.checkFatalErrors(ctrs, errMsg)) { return true; @@ -323,186 +200,7 @@ return false; } - private boolean progress(ExecDriverTaskHandle th) throws IOException { - JobClient jc = th.getJobClient(); - RunningJob rj = th.getRunningJob(); - String lastReport = ""; - SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS"); - long reportTime = System.currentTimeMillis(); - long maxReportInterval = 60 * 1000; // One minute - boolean fatal = false; - StringBuilder errMsg = new StringBuilder(); - long pullInterval = HiveConf.getLongVar(job, HiveConf.ConfVars.HIVECOUNTERSPULLINTERVAL); - boolean initializing = true; - boolean initOutputPrinted = false; - while (!rj.isComplete()) { - try { - Thread.sleep(pullInterval); - } catch (InterruptedException e) { - } - - if (initializing && ShimLoader.getHadoopShims().isJobPreparing(rj)) { - // No reason to poll untill the job is initialized - continue; - } else { - // By now the job is initialized so no reason to do - // rj.getJobState() again and we do not want to do an extra RPC call - initializing = false; - } - - if (!initOutputPrinted) { - SessionState ss = SessionState.get(); - - String logMapper; - String logReducer; - - TaskReport[] mappers = jc.getMapTaskReports(rj.getJobID()); - if (mappers == null) { - logMapper = "no information for number of mappers; "; - } else { - int numMap = mappers.length; - if (ss != null) { - ss.getHiveHistory().setTaskProperty(SessionState.get().getQueryId(), getId(), - Keys.TASK_NUM_MAPPERS, Integer.toString(numMap)); - } - logMapper = "number of mappers: " + numMap + "; "; - } - - TaskReport[] reducers = jc.getReduceTaskReports(rj.getJobID()); - if (reducers == null) { - logReducer = "no information for number of reducers. "; - } else { - int numReduce = reducers.length; - if (ss != null) { - ss.getHiveHistory().setTaskProperty(SessionState.get().getQueryId(), getId(), - Keys.TASK_NUM_REDUCERS, Integer.toString(numReduce)); - } - logReducer = "number of reducers: " + numReduce; - } - - console - .printInfo("Hadoop job information for " + getId() + ": " + logMapper + logReducer); - initOutputPrinted = true; - } - - RunningJob newRj = jc.getJob(rj.getJobID()); - if (newRj == null) { - // under exceptional load, hadoop may not be able to look up status - // of finished jobs (because it has purged them from memory). From - // hive's perspective - it's equivalent to the job having failed. - // So raise a meaningful exception - throw new IOException("Could not find status of job: + rj.getJobID()"); - } else { - th.setRunningJob(newRj); - rj = newRj; - } - - // If fatal errors happen we should kill the job immediately rather than - // let the job retry several times, which eventually lead to failure. - if (fatal) { - continue; // wait until rj.isComplete - } - - Counters ctrs = th.getCounters(); - - if (fatal = checkFatalErrors(ctrs, errMsg)) { - console.printError("[Fatal Error] " + errMsg.toString() + ". 
Killing the job."); - rj.killJob(); - continue; - } - errMsg.setLength(0); - - updateCounters(ctrs, rj); - - String report = " " + getId() + " map = " + mapProgress + "%, reduce = " + reduceProgress - + "%"; - - if (!report.equals(lastReport) - || System.currentTimeMillis() >= reportTime + maxReportInterval) { - - // write out serialized plan with counters to log file - // LOG.info(queryPlan); - String output = dateFormat.format(Calendar.getInstance().getTime()) + report; - SessionState ss = SessionState.get(); - if (ss != null) { - ss.getHiveHistory().setTaskCounters(SessionState.get().getQueryId(), getId(), ctrs); - ss.getHiveHistory().setTaskProperty(SessionState.get().getQueryId(), getId(), - Keys.TASK_HADOOP_PROGRESS, output); - ss.getHiveHistory().progressTask(SessionState.get().getQueryId(), this); - ss.getHiveHistory().logPlanProgress(queryPlan); - } - console.printInfo(output); - lastReport = report; - reportTime = System.currentTimeMillis(); - } - } - - boolean success; - Counters ctrs = th.getCounters(); - - if (fatal) { - success = false; - } else { - // check for fatal error again in case it occurred after - // the last check before the job is completed - if (checkFatalErrors(ctrs, errMsg)) { - console.printError("[Fatal Error] " + errMsg.toString()); - success = false; - } else { - success = rj.isSuccessful(); - } - } - - setDone(); - // update based on the final value of the counters - updateCounters(ctrs, rj); - - SessionState ss = SessionState.get(); - if (ss != null) { - ss.getHiveHistory().logPlanProgress(queryPlan); - } - // LOG.info(queryPlan); - return (success); - } - - /** - * Update counters relevant to this task. - */ - private void updateCounters(Counters ctrs, RunningJob rj) throws IOException { - mapProgress = Math.round(rj.mapProgress() * 100); - reduceProgress = Math.round(rj.reduceProgress() * 100); - taskCounters.put("CNTR_NAME_" + getId() + "_MAP_PROGRESS", Long.valueOf(mapProgress)); - taskCounters.put("CNTR_NAME_" + getId() + "_REDUCE_PROGRESS", Long.valueOf(reduceProgress)); - if (ctrs == null) { - // hadoop might return null if it cannot locate the job. - // we may still be able to retrieve the job status - so ignore - return; - } - for (Operator op : work.getAliasToWork().values()) { - op.updateCounters(ctrs); - } - if (work.getReducer() != null) { - work.getReducer().updateCounters(ctrs); - } - } - - public boolean mapStarted() { - return mapProgress > 0; - } - - public boolean reduceStarted() { - return reduceProgress > 0; - } - - public boolean mapDone() { - return mapProgress == 100; - } - - public boolean reduceDone() { - return reduceProgress == 100; - } - - /** + /** * Execute a query plan using Hadoop. */ @Override @@ -676,7 +374,6 @@ HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE"); } JobClient jc = new JobClient(job); - // make this client wait if job trcker is not behaving well. Throttle.checkJobTracker(job, LOG); @@ -692,35 +389,13 @@ // Finally SUBMIT the JOB! 
rj = jc.submitJob(job); - - jobId = rj.getJobID(); - // replace it back if (pwd != null) { HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, pwd); } - - // add to list of running jobs to kill in case of abnormal shutdown - - runningJobKillURIs.put(rj.getJobID(), rj.getTrackingURL() + "&action=kill"); - - ExecDriverTaskHandle th = new ExecDriverTaskHandle(jc, rj); - jobInfo(rj); - success = progress(th); - - String statusMesg = getJobEndMsg(rj.getJobID()); - if (!success) { - statusMesg += " with errors"; - returnVal = 2; - console.printError(statusMesg); - if (HiveConf.getBoolVar(job, HiveConf.ConfVars.SHOW_JOB_FAIL_DEBUG_INFO)) { - showJobFailDebugInfo(job, rj); - } - } else { - console.printInfo(statusMesg); - } - - + + returnVal = jobExecHelper.progress(rj, jc); + success = (returnVal == 0); } catch (Exception e) { e.printStackTrace(); String mesg = " with exception '" + Utilities.getNameMessage(e) + "'"; @@ -747,7 +422,7 @@ if (returnVal != 0) { rj.killJob(); } - runningJobKillURIs.remove(rj.getJobID()); + HadoopJobExecHelper.runningJobKillURIs.remove(rj.getJobID()); } } catch (Exception e) { } @@ -778,179 +453,21 @@ return (returnVal); } - - /** - * This msg pattern is used to track when a job is started. - * - * @param jobId - * @return - */ - private static String getJobStartMsg(String jobId) { - return "Starting Job = " + jobId; - } - - /** - * this msg pattern is used to track when a job is successfully done. - * - * @param jobId - * @return - */ - public static String getJobEndMsg(String jobId) { - return "Ended Job = " + jobId; + + public boolean mapStarted() { + return this.jobExecHelper.mapStarted(); } - private String getTaskAttemptLogUrl(String taskTrackerHttpAddress, String taskAttemptId) { - return taskTrackerHttpAddress + "/tasklog?taskid=" + taskAttemptId + "&all=true"; + public boolean reduceStarted() { + return this.jobExecHelper.reduceStarted(); } - // Used for showJobFailDebugInfo - private static class TaskInfo { - String jobId; - HashSet logUrls; - - public TaskInfo(String jobId) { - this.jobId = jobId; - logUrls = new HashSet(); - } - - public void addLogUrl(String logUrl) { - logUrls.add(logUrl); - } - - public HashSet getLogUrls() { - return logUrls; - } - - public String getJobId() { - return jobId; - } + public boolean mapDone() { + return this.jobExecHelper.mapDone(); } - @SuppressWarnings("deprecation") - private void showJobFailDebugInfo(JobConf conf, RunningJob rj) throws IOException { - // Mapping from task ID to the number of failures - Map failures = new HashMap(); - // Successful task ID's - Set successes = new HashSet(); - - Map taskIdToInfo = new HashMap(); - - int startIndex = 0; - - // Loop to get all task completion events because getTaskCompletionEvents - // only returns a subset per call - while (true) { - TaskCompletionEvent[] taskCompletions = rj.getTaskCompletionEvents(startIndex); - - if (taskCompletions == null || taskCompletions.length == 0) { - break; - } - - boolean more = true; - for (TaskCompletionEvent t : taskCompletions) { - // getTaskJobIDs returns Strings for compatibility with Hadoop versions - // without TaskID or TaskAttemptID - String[] taskJobIds = ShimLoader.getHadoopShims().getTaskJobIDs(t); - - if (taskJobIds == null) { - console.printError("Task attempt info is unavailable in this Hadoop version"); - more = false; - break; - } - - // For each task completion event, get the associated task id, job id - // and the logs - String taskId = taskJobIds[0]; - String jobId = taskJobIds[1]; - - TaskInfo ti = 
taskIdToInfo.get(taskId); - if (ti == null) { - ti = new TaskInfo(jobId); - taskIdToInfo.put(taskId, ti); - } - // These tasks should have come from the same job. - assert (ti.getJobId() == jobId); - ti.getLogUrls().add(getTaskAttemptLogUrl(t.getTaskTrackerHttp(), t.getTaskId())); - - // If a task failed, then keep track of the total number of failures - // for that task (typically, a task gets re-run up to 4 times if it - // fails - - if (t.getTaskStatus() != TaskCompletionEvent.Status.SUCCEEDED) { - Integer failAttempts = failures.get(taskId); - if (failAttempts == null) { - failAttempts = Integer.valueOf(0); - } - failAttempts = Integer.valueOf(failAttempts.intValue() + 1); - failures.put(taskId, failAttempts); - } else { - successes.add(taskId); - } - } - if (!more) { - break; - } - startIndex += taskCompletions.length; - } - // Remove failures for tasks that succeeded - for (String task : successes) { - failures.remove(task); - } - - if (failures.keySet().size() == 0) { - return; - } - - // Find the highest failure count - int maxFailures = 0; - for (Integer failCount : failures.values()) { - if (maxFailures < failCount.intValue()) { - maxFailures = failCount.intValue(); - } - } - - // Display Error Message for tasks with the highest failure count - String jtUrl = JobTrackerURLResolver.getURL(conf); - - for (String task : failures.keySet()) { - if (failures.get(task).intValue() == maxFailures) { - TaskInfo ti = taskIdToInfo.get(task); - String jobId = ti.getJobId(); - String taskUrl = jtUrl + "/taskdetails.jsp?jobid=" + jobId + "&tipid=" + task.toString(); - - TaskLogProcessor tlp = new TaskLogProcessor(conf); - for (String logUrl : ti.getLogUrls()) { - tlp.addTaskAttemptLogUrl(logUrl); - } - - List errors = tlp.getErrors(); - - StringBuilder sb = new StringBuilder(); - // We use a StringBuilder and then call printError only once as - // printError will write to both stderr and the error log file. In - // situations where both the stderr and the log file output is - // simultaneously output to a single stream, this will look cleaner. - sb.append("\n"); - sb.append("Task with the most failures(" + maxFailures + "): \n"); - sb.append("-----\n"); - sb.append("Task ID:\n " + task + "\n\n"); - sb.append("URL:\n " + taskUrl + "\n"); - - for (ErrorAndSolution e : errors) { - sb.append("\n"); - sb.append("Possible error:\n " + e.getError() + "\n\n"); - sb.append("Solution:\n " + e.getSolution() + "\n"); - } - sb.append("-----\n"); - - console.printError(sb.toString()); - - // Only print out one task because that's good enough for debugging. 
- break; - } - } - return; - + public boolean reduceDone() { + return this.jobExecHelper.reduceDone(); } private static void printUsage() { @@ -1375,4 +892,19 @@ } } } + + @Override + public void updateCounters(Counters ctrs, RunningJob rj) throws IOException { + for (Operator op : work.getAliasToWork().values()) { + op.updateCounters(ctrs); + } + if (work.getReducer() != null) { + work.getReducer().updateCounters(ctrs); + } + } + + @Override + public void logPlanProgress(SessionState ss) throws IOException { + ss.getHiveHistory().logPlanProgress(queryPlan); + } } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java (revision 1073470) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java (working copy) @@ -673,7 +673,7 @@ if ((conf != null) && isNativeTable) { String specPath = conf.getDirName(); DynamicPartitionCtx dpCtx = conf.getDynPartCtx(); - mvFileToFinalPath(specPath, hconf, success, LOG, dpCtx); + Utilities.mvFileToFinalPath(specPath, hconf, success, LOG, dpCtx, conf); } } catch (IOException e) { throw new HiveException(e); @@ -681,83 +681,6 @@ super.jobClose(hconf, success, feedBack); } - public void mvFileToFinalPath(String specPath, Configuration hconf, - boolean success, Log log, DynamicPartitionCtx dpCtx) throws IOException, HiveException { - - FileSystem fs = (new Path(specPath)).getFileSystem(hconf); - Path tmpPath = Utilities.toTempPath(specPath); - Path intermediatePath = new Path(tmpPath.getParent(), tmpPath.getName() - + ".intermediate"); - Path finalPath = new Path(specPath); - if (success) { - if (fs.exists(tmpPath)) { - // Step1: rename tmp output folder to intermediate path. After this - // point, updates from speculative tasks still writing to tmpPath - // will not appear in finalPath. - log.info("Moving tmp dir: " + tmpPath + " to: " + intermediatePath); - Utilities.rename(fs, tmpPath, intermediatePath); - // Step2: remove any tmp file or double-committed output files - ArrayList emptyBuckets = - Utilities.removeTempOrDuplicateFiles(fs, intermediatePath, dpCtx); - // create empty buckets if necessary - if (emptyBuckets.size() > 0) { - createEmptyBuckets(hconf, emptyBuckets); - } - - // Step3: move to the file destination - log.info("Moving tmp dir: " + intermediatePath + " to: " + finalPath); - Utilities.renameOrMoveFiles(fs, intermediatePath, finalPath); - } - } else { - fs.delete(tmpPath, true); - } - } - - /** - * Check the existence of buckets according to bucket specification. Create empty buckets if - * needed. - * @param specPath The final path where the dynamic partitions should be in. - * @param conf FileSinkDesc. - * @param dpCtx dynamic partition context. 
- * @throws HiveException - * @throws IOException - */ - private void createEmptyBuckets(Configuration hconf, ArrayList paths) - throws HiveException, IOException { - - JobConf jc; - if (hconf instanceof JobConf) { - jc = new JobConf(hconf); - } else { - // test code path - jc = new JobConf(hconf, ExecDriver.class); - } - HiveOutputFormat hiveOutputFormat = null; - Class outputClass = null; - boolean isCompressed = conf.getCompressed(); - TableDesc tableInfo = conf.getTableInfo(); - try { - Serializer serializer = (Serializer) tableInfo.getDeserializerClass().newInstance(); - serializer.initialize(null, tableInfo.getProperties()); - outputClass = serializer.getSerializedClass(); - hiveOutputFormat = conf.getTableInfo().getOutputFileFormatClass().newInstance(); - } catch (SerDeException e) { - throw new HiveException(e); - } catch (InstantiationException e) { - throw new HiveException(e); - } catch (IllegalAccessException e) { - throw new HiveException(e); - } - - for (String p: paths) { - Path path = new Path(p); - RecordWriter writer = HiveFileFormatUtils.getRecordWriter( - jc, hiveOutputFormat, outputClass, isCompressed, tableInfo.getProperties(), path); - writer.close(false); - LOG.info("created empty bucket for enforcing bucketing at " + path); - } - } - @Override public OperatorType getType() { return OperatorType.FILESINK; Index: ql/src/java/org/apache/hadoop/hive/ql/exec/HadoopJobExecHelper.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/HadoopJobExecHelper.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/HadoopJobExecHelper.java (revision 0) @@ -0,0 +1,548 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec; + +import java.io.IOException; +import java.io.Serializable; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.Operator.ProgressCounter; +import org.apache.hadoop.hive.ql.exec.errors.ErrorAndSolution; +import org.apache.hadoop.hive.ql.exec.errors.TaskLogProcessor; +import org.apache.hadoop.hive.ql.history.HiveHistory.Keys; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.ql.session.SessionState.LogHelper; +import org.apache.hadoop.hive.shims.ShimLoader; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.TaskCompletionEvent; + +public class HadoopJobExecHelper { + + protected transient JobConf job; + protected Task task; + + protected transient int mapProgress = 0; + protected transient int reduceProgress = 0; + public transient String jobId; + private LogHelper console; + private HadoopJobExecHook callBackObj; + + /** + * Update counters relevant to this task. + */ + private void updateCounters(Counters ctrs, RunningJob rj) throws IOException { + mapProgress = Math.round(rj.mapProgress() * 100); + reduceProgress = Math.round(rj.reduceProgress() * 100); + task.taskCounters.put("CNTR_NAME_" + task.getId() + "_MAP_PROGRESS", Long.valueOf(mapProgress)); + task.taskCounters.put("CNTR_NAME_" + task.getId() + "_REDUCE_PROGRESS", Long.valueOf(reduceProgress)); + if (ctrs == null) { + // hadoop might return null if it cannot locate the job. + // we may still be able to retrieve the job status - so ignore + return; + } + if(callBackObj != null) { + callBackObj.updateCounters(ctrs, rj); + } + } + + /** + * This msg pattern is used to track when a job is started. + * + * @param jobId + * @return + */ + private static String getJobStartMsg(String jobId) { + return "Starting Job = " + jobId; + } + + /** + * this msg pattern is used to track when a job is successfully done. + * + * @param jobId + * @return + */ + public static String getJobEndMsg(String jobId) { + return "Ended Job = " + jobId; + } + + private String getTaskAttemptLogUrl(String taskTrackerHttpAddress, String taskAttemptId) { + return taskTrackerHttpAddress + "/tasklog?taskid=" + taskAttemptId + "&all=true"; + } + + public boolean mapStarted() { + return mapProgress > 0; + } + + public boolean reduceStarted() { + return reduceProgress > 0; + } + + public boolean mapDone() { + return mapProgress == 100; + } + + public boolean reduceDone() { + return reduceProgress == 100; + } + + + public String getJobId() { + return jobId; + } + + public void setJobId(String jobId) { + this.jobId = jobId; + } + + + public HadoopJobExecHelper() { + } + + public HadoopJobExecHelper(JobConf job, LogHelper console, + Task task, HadoopJobExecHook hookCallBack) { + this.job = job; + this.console = console; + this.task = task; + this.callBackObj = hookCallBack; + } + + + /** + * A list of the currently running jobs spawned in this Hive instance that is used to kill all + * running jobs in the event of an unexpected shutdown - i.e., the JVM shuts down while there are + * still jobs running. 
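+   * Entries are added when a job is submitted (see progress()) and removed by the
+   * caller once the job finishes, so only jobs still in flight are killed.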
+ */ + public static Map runningJobKillURIs = Collections + .synchronizedMap(new HashMap()); + + + /** + * In Hive, when the user control-c's the command line, any running jobs spawned from that command + * line are best-effort killed. + * + * This static constructor registers a shutdown thread to iterate over all the running job kill + * URLs and do a get on them. + * + */ + static { + if (new org.apache.hadoop.conf.Configuration() + .getBoolean("webinterface.private.actions", false)) { + Runtime.getRuntime().addShutdownHook(new Thread() { + @Override + public void run() { + synchronized (runningJobKillURIs) { + for (String uri : runningJobKillURIs.values()) { + try { + System.err.println("killing job with: " + uri); + java.net.HttpURLConnection conn = (java.net.HttpURLConnection) new java.net.URL(uri) + .openConnection(); + conn.setRequestMethod("POST"); + int retCode = conn.getResponseCode(); + if (retCode != 200) { + System.err.println("Got an error trying to kill job with URI: " + uri + " = " + + retCode); + } + } catch (Exception e) { + System.err.println("trying to kill job, caught: " + e); + // do nothing + } + } + } + } + }); + } + } + + public boolean checkFatalErrors(Counters ctrs, StringBuilder errMsg) { + if (ctrs == null) { + // hadoop might return null if it cannot locate the job. + // we may still be able to retrieve the job status - so ignore + return false; + } + // check for number of created files + long numFiles = ctrs.getCounter(ProgressCounter.CREATED_FILES); + long upperLimit = HiveConf.getLongVar(job, HiveConf.ConfVars.MAXCREATEDFILES); + if (numFiles > upperLimit) { + errMsg.append("total number of created files exceeds ").append(upperLimit); + return true; + } + return this.callBackObj.checkFatalErrors(ctrs, errMsg); + } + + private boolean progress(ExecDriverTaskHandle th) throws IOException { + JobClient jc = th.getJobClient(); + RunningJob rj = th.getRunningJob(); + String lastReport = ""; + SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS"); + long reportTime = System.currentTimeMillis(); + long maxReportInterval = 60 * 1000; // One minute + boolean fatal = false; + StringBuilder errMsg = new StringBuilder(); + long pullInterval = HiveConf.getLongVar(job, HiveConf.ConfVars.HIVECOUNTERSPULLINTERVAL); + boolean initializing = true; + while (!rj.isComplete()) { + try { + Thread.sleep(pullInterval); + } catch (InterruptedException e) { + } + + if (initializing && ShimLoader.getHadoopShims().isJobPreparing(rj)) { + // No reason to poll untill the job is initialized + continue; + } else { + // By now the job is initialized so no reason to do + // rj.getJobState() again and we do not want to do an extra RPC call + initializing = false; + } + + RunningJob newRj = jc.getJob(rj.getJobID()); + if (newRj == null) { + // under exceptional load, hadoop may not be able to look up status + // of finished jobs (because it has purged them from memory). From + // hive's perspective - it's equivalent to the job having failed. + // So raise a meaningful exception + throw new IOException("Could not find status of job: + rj.getJobID()"); + } else { + th.setRunningJob(newRj); + rj = newRj; + } + + // If fatal errors happen we should kill the job immediately rather than + // let the job retry several times, which eventually lead to failure. 
+ if (fatal) { + continue; // wait until rj.isComplete + } + + Counters ctrs = th.getCounters(); + + if (fatal = this.callBackObj.checkFatalErrors(ctrs, errMsg)) { + console.printError("[Fatal Error] " + errMsg.toString() + ". Killing the job."); + rj.killJob(); + continue; + } + errMsg.setLength(0); + + updateCounters(ctrs, rj); + + String report = " " + getId() + " map = " + mapProgress + "%, reduce = " + reduceProgress + + "%"; + + if (!report.equals(lastReport) + || System.currentTimeMillis() >= reportTime + maxReportInterval) { + + // write out serialized plan with counters to log file + // LOG.info(queryPlan); + String output = dateFormat.format(Calendar.getInstance().getTime()) + report; + SessionState ss = SessionState.get(); + if (ss != null) { + ss.getHiveHistory().setTaskCounters(SessionState.get().getQueryId(), getId(), ctrs); + ss.getHiveHistory().setTaskProperty(SessionState.get().getQueryId(), getId(), + Keys.TASK_HADOOP_PROGRESS, output); + ss.getHiveHistory().progressTask(SessionState.get().getQueryId(), this.task); + this.callBackObj.logPlanProgress(ss); + } + console.printInfo(output); + lastReport = report; + reportTime = System.currentTimeMillis(); + } + } + + boolean success; + Counters ctrs = th.getCounters(); + + if (fatal) { + success = false; + } else { + // check for fatal error again in case it occurred after + // the last check before the job is completed + if (checkFatalErrors(ctrs, errMsg)) { + console.printError("[Fatal Error] " + errMsg.toString()); + success = false; + } else { + success = rj.isSuccessful(); + } + } + + this.task.setDone(); + // update based on the final value of the counters + updateCounters(ctrs, rj); + + SessionState ss = SessionState.get(); + if (ss != null) { + this.callBackObj.logPlanProgress(ss); + } + // LOG.info(queryPlan); + return (success); + } + + private String getId() { + return this.task.getId(); + } + + /** + * from StreamJob.java. + */ + public void jobInfo(RunningJob rj) { + if (job.get("mapred.job.tracker", "local").equals("local")) { + console.printInfo("Job running in-process (local Hadoop)"); + } else { + String hp = job.get("mapred.job.tracker"); + if (SessionState.get() != null) { + SessionState.get().getHiveHistory().setTaskProperty(SessionState.get().getQueryId(), + getId(), Keys.TASK_HADOOP_ID, rj.getJobID()); + } + console.printInfo(getJobStartMsg(rj.getJobID()) + ", Tracking URL = " + + rj.getTrackingURL()); + console.printInfo("Kill Command = " + HiveConf.getVar(job, HiveConf.ConfVars.HADOOPBIN) + + " job -Dmapred.job.tracker=" + hp + " -kill " + rj.getJobID()); + } + } + + /** + * This class contains the state of the running task Going forward, we will return this handle + * from execute and Driver can split execute into start, monitorProgess and postProcess. 
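+   * The handle pairs the JobClient with the RunningJob so that progress polling can
+   * refresh the job status and pull counters from a single object.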
+ */ + private static class ExecDriverTaskHandle extends TaskHandle { + JobClient jc; + RunningJob rj; + + JobClient getJobClient() { + return jc; + } + + RunningJob getRunningJob() { + return rj; + } + + public ExecDriverTaskHandle(JobClient jc, RunningJob rj) { + this.jc = jc; + this.rj = rj; + } + + public void setRunningJob(RunningJob job) { + rj = job; + } + + @Override + public Counters getCounters() throws IOException { + return rj.getCounters(); + } + } + + // Used for showJobFailDebugInfo + private static class TaskInfo { + String jobId; + HashSet logUrls; + + public TaskInfo(String jobId) { + this.jobId = jobId; + logUrls = new HashSet(); + } + + public void addLogUrl(String logUrl) { + logUrls.add(logUrl); + } + + public HashSet getLogUrls() { + return logUrls; + } + + public String getJobId() { + return jobId; + } + } + + @SuppressWarnings("deprecation") + private void showJobFailDebugInfo(JobConf conf, RunningJob rj) throws IOException { + // Mapping from task ID to the number of failures + Map failures = new HashMap(); + // Successful task ID's + Set successes = new HashSet(); + + Map taskIdToInfo = new HashMap(); + + int startIndex = 0; + + // Loop to get all task completion events because getTaskCompletionEvents + // only returns a subset per call + while (true) { + TaskCompletionEvent[] taskCompletions = rj.getTaskCompletionEvents(startIndex); + + if (taskCompletions == null || taskCompletions.length == 0) { + break; + } + + boolean more = true; + for (TaskCompletionEvent t : taskCompletions) { + // getTaskJobIDs returns Strings for compatibility with Hadoop versions + // without TaskID or TaskAttemptID + String[] taskJobIds = ShimLoader.getHadoopShims().getTaskJobIDs(t); + + if (taskJobIds == null) { + console.printError("Task attempt info is unavailable in this Hadoop version"); + more = false; + break; + } + + // For each task completion event, get the associated task id, job id + // and the logs + String taskId = taskJobIds[0]; + String jobId = taskJobIds[1]; + + TaskInfo ti = taskIdToInfo.get(taskId); + if (ti == null) { + ti = new TaskInfo(jobId); + taskIdToInfo.put(taskId, ti); + } + // These tasks should have come from the same job. 
+ assert (ti.getJobId() == jobId); + ti.getLogUrls().add(getTaskAttemptLogUrl(t.getTaskTrackerHttp(), t.getTaskId())); + + // If a task failed, then keep track of the total number of failures + // for that task (typically, a task gets re-run up to 4 times if it + // fails + + if (t.getTaskStatus() != TaskCompletionEvent.Status.SUCCEEDED) { + Integer failAttempts = failures.get(taskId); + if (failAttempts == null) { + failAttempts = Integer.valueOf(0); + } + failAttempts = Integer.valueOf(failAttempts.intValue() + 1); + failures.put(taskId, failAttempts); + } else { + successes.add(taskId); + } + } + if (!more) { + break; + } + startIndex += taskCompletions.length; + } + // Remove failures for tasks that succeeded + for (String task : successes) { + failures.remove(task); + } + + if (failures.keySet().size() == 0) { + return; + } + + // Find the highest failure count + int maxFailures = 0; + for (Integer failCount : failures.values()) { + if (maxFailures < failCount.intValue()) { + maxFailures = failCount.intValue(); + } + } + + // Display Error Message for tasks with the highest failure count + String jtUrl = JobTrackerURLResolver.getURL(conf); + + for (String task : failures.keySet()) { + if (failures.get(task).intValue() == maxFailures) { + TaskInfo ti = taskIdToInfo.get(task); + String jobId = ti.getJobId(); + String taskUrl = jtUrl + "/taskdetails.jsp?jobid=" + jobId + "&tipid=" + task.toString(); + + TaskLogProcessor tlp = new TaskLogProcessor(conf); + for (String logUrl : ti.getLogUrls()) { + tlp.addTaskAttemptLogUrl(logUrl); + } + + List errors = tlp.getErrors(); + + StringBuilder sb = new StringBuilder(); + // We use a StringBuilder and then call printError only once as + // printError will write to both stderr and the error log file. In + // situations where both the stderr and the log file output is + // simultaneously output to a single stream, this will look cleaner. + sb.append("\n"); + sb.append("Task with the most failures(" + maxFailures + "): \n"); + sb.append("-----\n"); + sb.append("Task ID:\n " + task + "\n\n"); + sb.append("URL:\n " + taskUrl + "\n"); + + for (ErrorAndSolution e : errors) { + sb.append("\n"); + sb.append("Possible error:\n " + e.getError() + "\n\n"); + sb.append("Solution:\n " + e.getSolution() + "\n"); + } + sb.append("-----\n"); + + console.printError(sb.toString()); + + // Only print out one task because that's good enough for debugging. 
+ break; + } + } + return; + + } + + public int progress(RunningJob rj, JobClient jc) throws IOException { + jobId = rj.getJobID(); + + int returnVal = 0; + + // remove the pwd from conf file so that job tracker doesn't show this + // logs + String pwd = HiveConf.getVar(job, HiveConf.ConfVars.METASTOREPWD); + if (pwd != null) { + HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE"); + } + + // replace it back + if (pwd != null) { + HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, pwd); + } + + // add to list of running jobs to kill in case of abnormal shutdown + + runningJobKillURIs.put(rj.getJobID(), rj.getTrackingURL() + "&action=kill"); + + ExecDriverTaskHandle th = new ExecDriverTaskHandle(jc, rj); + jobInfo(rj); + boolean success = progress(th); + + String statusMesg = getJobEndMsg(rj.getJobID()); + if (!success) { + statusMesg += " with errors"; + returnVal = 2; + console.printError(statusMesg); + if (HiveConf.getBoolVar(job, HiveConf.ConfVars.SHOW_JOB_FAIL_DEBUG_INFO)) { + showJobFailDebugInfo(job, rj); + } + } else { + console.printInfo(statusMesg); + } + + return returnVal; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/exec/HadoopJobExecHook.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/HadoopJobExecHook.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/HadoopJobExecHook.java (revision 0) @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec; + +import java.io.IOException; + +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.RunningJob; + +@SuppressWarnings("deprecation") +public interface HadoopJobExecHook { + + public void updateCounters(Counters ctrs, RunningJob rj) throws IOException; + public boolean checkFatalErrors(Counters ctrs, StringBuilder errMsg); + public void logPlanProgress(SessionState ss) throws IOException; + +} Index: ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java (revision 1073798) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java (working copy) @@ -207,6 +207,7 @@ LOG.error("Cannot get table " + tableName, e); console.printError("Cannot get table " + tableName, e.toString()); } + return aggregateStats(); } @@ -227,9 +228,7 @@ private int aggregateStats() { - String statsImplementationClass = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS); - StatsFactory.setImplementation(statsImplementationClass, conf); - StatsAggregator statsAggregator = StatsFactory.getStatsAggregator(); + StatsAggregator statsAggregator = null; try { // Stats setup: @@ -237,38 +236,44 @@ FileSystem fileSys; FileStatus[] fileStatus; - // manufacture a StatsAggregator - if (!statsAggregator.connect(conf)) { - throw new HiveException("StatsAggregator connect failed " + statsImplementationClass); + if(!this.getWork().getNoStatsAggregator()) { + String statsImplementationClass = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS); + StatsFactory.setImplementation(statsImplementationClass, conf); + statsAggregator = StatsFactory.getStatsAggregator(); + // manufacture a StatsAggregator + if (!statsAggregator.connect(conf)) { + throw new HiveException("StatsAggregator connect failed " + statsImplementationClass); + } } TableStatistics tblStats = new TableStatistics(); - // - // For partitioned table get the old table statistics for incremental update - // - if (table.isPartitioned()) { - org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable(); - Map parameters = tTable.getParameters(); - if (parameters.containsKey(StatsSetupConst.ROW_COUNT)) { - tblStats.setNumRows(Long.parseLong(parameters.get(StatsSetupConst.ROW_COUNT))); - } - if (parameters.containsKey(StatsSetupConst.NUM_PARTITIONS)) { - tblStats.setNumPartitions(Integer.parseInt(parameters.get(StatsSetupConst.NUM_PARTITIONS))); - } - if (parameters.containsKey(StatsSetupConst.NUM_FILES)) { - tblStats.setNumFiles(Integer.parseInt(parameters.get(StatsSetupConst.NUM_FILES))); - } - if (parameters.containsKey(StatsSetupConst.TOTAL_SIZE)) { - tblStats.setSize(Long.parseLong(parameters.get(StatsSetupConst.TOTAL_SIZE))); - } + org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable(); + Map parameters = tTable.getParameters(); + + boolean tableStatsExist = this.existStats(parameters); + + if (parameters.containsKey(StatsSetupConst.ROW_COUNT)) { + tblStats.setNumRows(Long.parseLong(parameters.get(StatsSetupConst.ROW_COUNT))); + } + if (parameters.containsKey(StatsSetupConst.NUM_PARTITIONS)) { + tblStats.setNumPartitions(Integer.parseInt(parameters.get(StatsSetupConst.NUM_PARTITIONS))); + } + if (parameters.containsKey(StatsSetupConst.NUM_FILES)) { + tblStats.setNumFiles(Integer.parseInt(parameters.get(StatsSetupConst.NUM_FILES))); + } + if 
(parameters.containsKey(StatsSetupConst.TOTAL_SIZE)) { + tblStats.setSize(Long.parseLong(parameters.get(StatsSetupConst.TOTAL_SIZE))); } List partitions = getPartitionsList(); - + boolean atomic = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC); + if (partitions == null) { // non-partitioned tables: - + if (!tableStatsExist && atomic) { + return 0; + } Path tablePath = wh.getDefaultTablePath(table.getDbName(), table.getTableName()); fileSys = tablePath.getFileSystem(conf); fileStatus = Utilities.getFileStatusRecurse(tablePath, 1, fileSys); @@ -280,12 +285,14 @@ tblStats.setSize(tableSize); // In case of a non-partitioned table, the key for stats temporary store is "rootDir" - String rows = statsAggregator.aggregateStats(work.getAggKey(), StatsSetupConst.ROW_COUNT); - if (rows != null) { - tblStats.setNumRows(Long.parseLong(rows)); - } else { - if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC)) { - throw new HiveException("StatsAggregator failed to get numRows."); + if(statsAggregator != null) { + String rows = statsAggregator.aggregateStats(work.getAggKey(), StatsSetupConst.ROW_COUNT); + if (rows != null) { + tblStats.setNumRows(Long.parseLong(rows)); + } else { + if (atomic) { + throw new HiveException("StatsAggregator failed to get numRows."); + } } } } else { @@ -294,20 +301,45 @@ // and update the table stats based on the old and new stats. for (Partition partn : partitions) { // + // get the old partition stats + // + org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition(); + parameters = tPart.getParameters(); + + boolean hasStats = this.existStats(parameters); + if(!hasStats && atomic) { + continue; + } + + int nf = parameters.containsKey(StatsSetupConst.NUM_FILES) ? + Integer.parseInt(parameters.get(StatsSetupConst.NUM_FILES)) : + 0; + long nr = parameters.containsKey(StatsSetupConst.ROW_COUNT) ? + Long.parseLong(parameters.get(StatsSetupConst.ROW_COUNT)) : + 0L; + long sz = parameters.containsKey(StatsSetupConst.TOTAL_SIZE) ? 
+ Long.parseLong(parameters.get(StatsSetupConst.TOTAL_SIZE)) : + 0L; + + // // get the new partition stats // PartitionStatistics newPartStats = new PartitionStatistics(); // In that case of a partition, the key for stats temporary store is "rootDir/[dynamic_partition_specs/]%" String partitionID = work.getAggKey() + Warehouse.makePartPath(partn.getSpec()); - - String rows = statsAggregator.aggregateStats(partitionID, StatsSetupConst.ROW_COUNT); - if (rows != null) { - newPartStats.setNumRows(Long.parseLong(rows)); - } else { - if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC)) { - throw new HiveException("StatsAggregator failed to get numRows."); + + if (statsAggregator != null) { + String rows = statsAggregator.aggregateStats(partitionID, StatsSetupConst.ROW_COUNT); + if (rows != null) { + newPartStats.setNumRows(Long.parseLong(rows)); + } else { + if (atomic) { + throw new HiveException("StatsAggregator failed to get numRows."); + } } + } else { + newPartStats.setNumRows(nr); } fileSys = partn.getPartitionPath().getFileSystem(conf); @@ -320,26 +352,6 @@ } newPartStats.setSize(partitionSize); - // - // get the old partition stats - // - org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition(); - Map parameters = tPart.getParameters(); - - boolean hasStats = - parameters.containsKey(StatsSetupConst.NUM_FILES) || - parameters.containsKey(StatsSetupConst.ROW_COUNT) || - parameters.containsKey(StatsSetupConst.TOTAL_SIZE); - - int nf = parameters.containsKey(StatsSetupConst.NUM_FILES) ? - Integer.parseInt(parameters.get(StatsSetupConst.NUM_FILES)) : - 0; - long nr = parameters.containsKey(StatsSetupConst.ROW_COUNT) ? - Long.parseLong(parameters.get(StatsSetupConst.ROW_COUNT)) : - 0L; - long sz = parameters.containsKey(StatsSetupConst.TOTAL_SIZE) ? - Long.parseLong(parameters.get(StatsSetupConst.TOTAL_SIZE)) : - 0L; if (hasStats) { PartitionStatistics oldPartStats = new PartitionStatistics(nf, nr, sz); tblStats.updateStats(oldPartStats, newPartStats); @@ -363,12 +375,10 @@ } } - // // write table stats to metastore // - org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable(); - Map parameters = tTable.getParameters(); + parameters = tTable.getParameters(); parameters.put(StatsSetupConst.ROW_COUNT, Long.toString(tblStats.getNumRows())); parameters.put(StatsSetupConst.NUM_PARTITIONS, Integer.toString(tblStats.getNumPartitions())); parameters.put(StatsSetupConst.NUM_FILES, Integer.toString(tblStats.getNumFiles())); @@ -387,11 +397,20 @@ "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e)); } finally { - statsAggregator.closeConnection(); + if(statsAggregator != null) { + statsAggregator.closeConnection(); + } } // StatsTask always return 0 so that the whole job won't fail return 0; } + + private boolean existStats(Map parameters) { + return parameters.containsKey(StatsSetupConst.ROW_COUNT) + || parameters.containsKey(StatsSetupConst.NUM_FILES) + || parameters.containsKey(StatsSetupConst.TOTAL_SIZE) + || parameters.containsKey(StatsSetupConst.NUM_PARTITIONS); + } /** * Get the list of partitions that need to update statistics. Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Throttle.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Throttle.java (revision 1073470) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Throttle.java (working copy) @@ -43,7 +43,7 @@ /** * Fetch http://tracker.om:/gc.jsp?threshold=period. 
*/ - static void checkJobTracker(JobConf conf, Log LOG) { + public static void checkJobTracker(JobConf conf, Log LOG) { try { byte[] buffer = new byte[1024]; Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java (revision 1073798) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java (working copy) @@ -80,6 +80,9 @@ import org.apache.hadoop.hive.metastore.api.Order; import org.apache.hadoop.hive.ql.Context; import org.apache.hadoop.hive.ql.QueryPlan; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; +import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils; +import org.apache.hadoop.hive.ql.io.HiveOutputFormat; import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat; import org.apache.hadoop.hive.ql.io.HiveInputFormat; import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat; @@ -91,6 +94,7 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.GroupByDesc; import org.apache.hadoop.hive.ql.plan.MapredLocalWork; import org.apache.hadoop.hive.ql.plan.MapredWork; @@ -101,9 +105,12 @@ import org.apache.hadoop.hive.ql.stats.StatsFactory; import org.apache.hadoop.hive.ql.stats.StatsPublisher; import org.apache.hadoop.hive.serde.Constants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.Serializer; import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.DefaultCodec; @@ -1133,6 +1140,83 @@ Path pathPattern = new Path(path, sb.toString()); return fs.globStatus(pathPattern); } + + public static void mvFileToFinalPath(String specPath, Configuration hconf, + boolean success, Log log, DynamicPartitionCtx dpCtx, FileSinkDesc conf) throws IOException, HiveException { + + FileSystem fs = (new Path(specPath)).getFileSystem(hconf); + Path tmpPath = Utilities.toTempPath(specPath); + Path intermediatePath = new Path(tmpPath.getParent(), tmpPath.getName() + + ".intermediate"); + Path finalPath = new Path(specPath); + if (success) { + if (fs.exists(tmpPath)) { + // Step1: rename tmp output folder to intermediate path. After this + // point, updates from speculative tasks still writing to tmpPath + // will not appear in finalPath. + log.info("Moving tmp dir: " + tmpPath + " to: " + intermediatePath); + Utilities.rename(fs, tmpPath, intermediatePath); + // Step2: remove any tmp file or double-committed output files + ArrayList emptyBuckets = + Utilities.removeTempOrDuplicateFiles(fs, intermediatePath, dpCtx); + // create empty buckets if necessary + if (emptyBuckets.size() > 0) { + createEmptyBuckets(hconf, emptyBuckets, conf); + } + + // Step3: move to the file destination + log.info("Moving tmp dir: " + intermediatePath + " to: " + finalPath); + Utilities.renameOrMoveFiles(fs, intermediatePath, finalPath); + } + } else { + fs.delete(tmpPath, true); + } + } + + /** + * Check the existence of buckets according to bucket specification. Create empty buckets if + * needed. 
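+   * Each missing bucket is materialized as a zero-length file by opening and
+   * immediately closing a RecordWriter for the table's output format.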
+ * @param specPath The final path where the dynamic partitions should be in. + * @param conf FileSinkDesc. + * @param dpCtx dynamic partition context. + * @throws HiveException + * @throws IOException + */ + private static void createEmptyBuckets(Configuration hconf, ArrayList paths, FileSinkDesc conf) + throws HiveException, IOException { + + JobConf jc; + if (hconf instanceof JobConf) { + jc = new JobConf(hconf); + } else { + // test code path + jc = new JobConf(hconf, ExecDriver.class); + } + HiveOutputFormat hiveOutputFormat = null; + Class outputClass = null; + boolean isCompressed = conf.getCompressed(); + TableDesc tableInfo = conf.getTableInfo(); + try { + Serializer serializer = (Serializer) tableInfo.getDeserializerClass().newInstance(); + serializer.initialize(null, tableInfo.getProperties()); + outputClass = serializer.getSerializedClass(); + hiveOutputFormat = conf.getTableInfo().getOutputFileFormatClass().newInstance(); + } catch (SerDeException e) { + throw new HiveException(e); + } catch (InstantiationException e) { + throw new HiveException(e); + } catch (IllegalAccessException e) { + throw new HiveException(e); + } + + for (String p: paths) { + Path path = new Path(p); + RecordWriter writer = HiveFileFormatUtils.getRecordWriter( + jc, hiveOutputFormat, outputClass, isCompressed, tableInfo.getProperties(), path); + writer.close(false); + LOG.info("created empty bucket for enforcing bucketing at " + path); + } + } /** * Remove all temporary files and duplicate (double-committed) files from a given directory. @@ -1644,4 +1728,20 @@ double result = (double) time / (double)1000; return result; } + + /** + * Determines whether a partition has been archived + * + * @param p + * @return + */ + public static boolean isArchived(Partition p) { + Map params = p.getParameters(); + if ("true".equalsIgnoreCase(params.get( + org.apache.hadoop.hive.metastore.api.Constants.IS_ARCHIVED))) { + return true; + } else { + return false; + } + } } Index: ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java (revision 1073470) +++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java (working copy) @@ -387,10 +387,12 @@ public static List> doGetAliasesFromPath( Map> pathToAliases, Map> aliasToWork, Path dir) { - - String path = getMatchingPath(pathToAliases, dir); List> opList = new ArrayList>(); + if (pathToAliases == null) { + return opList; + } + String path = getMatchingPath(pathToAliases, dir); List aliases = pathToAliases.get(path); for (String alias : aliases) { opList.add(aliasToWork.get(alias)); Index: ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java (revision 1073470) +++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java (working copy) @@ -374,7 +374,11 @@ if (this.mrwork == null) { init(job); } - + + if(this.mrwork.getPathToAliases() == null) { + return; + } + ArrayList aliases = new ArrayList(); Iterator>> iterator = this.mrwork .getPathToAliases().entrySet().iterator(); Index: ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java (revision 1073470) +++ ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java (working copy) @@ -46,6 +46,7 @@ import 
org.apache.hadoop.io.Text; import org.apache.hadoop.io.VersionMismatchException; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.io.SequenceFile.Metadata; import org.apache.hadoop.io.compress.CompressionCodec; @@ -190,7 +191,7 @@ *
 * {the end of the key part}
  • * */ - static class KeyBuffer implements Writable { + public static class KeyBuffer implements WritableComparable { // each column's value length in a split private int[] eachColumnValueLen = null; private int[] eachColumnUncompressedValueLen = null; @@ -200,6 +201,14 @@ private int numberRows = 0; // how many columns private int columnNumber = 0; + + // return the number of columns recorded in this file's header + public int getColumnNumber() { + return columnNumber; + } + + public KeyBuffer(){ + } KeyBuffer(int columnNumber) { this(0, columnNumber); @@ -281,6 +290,12 @@ return ret; } + + @Override + public int compareTo(Object arg0) { + throw new RuntimeException("compareTo not supported in class " + + this.getClass().getName()); + } } /** @@ -293,7 +308,7 @@ * column_2_row_2_value,....] * */ - static class ValueBuffer implements Writable { + public static class ValueBuffer implements WritableComparable { class LazyDecompressionCallbackImpl implements LazyDecompressionCallback { @@ -347,6 +362,9 @@ Decompressor valDecompressor = null; NonSyncDataInputBuffer decompressBuffer = new NonSyncDataInputBuffer(); CompressionInputStream deflatFilter = null; + + public ValueBuffer() throws IOException { + } public ValueBuffer(KeyBuffer keyBuffer) throws IOException { this(keyBuffer, null); @@ -464,6 +482,12 @@ CodecPool.returnDecompressor(valDecompressor); } } + + @Override + public int compareTo(Object arg0) { + throw new RuntimeException("compareTo not supported in class " + + this.getClass().getName()); + } } /** @@ -872,6 +896,33 @@ bufferedRecords = 0; columnBufferSize = 0; } + + /** + * flush a block out without doing anything except compressing the key part. + */ + public void flushBlock(KeyBuffer keyBuffer, ValueBuffer valueBuffer, + int recordLen, int keyLength, int compressedKeyLen) throws IOException { + checkAndWriteSync(); // sync + out.writeInt(recordLen); // total record length + out.writeInt(keyLength); // key portion length + + if(this.isCompressed()) { + //compress key and write key out + keyCompressionBuffer.reset(); + keyDeflateFilter.resetState(); + keyBuffer.write(keyDeflateOut); + keyDeflateOut.flush(); + keyDeflateFilter.finish(); + compressedKeyLen = keyCompressionBuffer.getLength(); + out.writeInt(compressedKeyLen); + out.write(keyCompressionBuffer.getData(), 0, compressedKeyLen); + } else { + out.writeInt(compressedKeyLen); + keyBuffer.write(out); + } + + valueBuffer.write(out); // value + } private void clearColumnBuffers() throws IOException { for (int i = 0; i < columnNumber; i++) { @@ -1304,6 +1355,15 @@ currentValue.readFields(in); currentValue.inited = true; } + + public boolean nextBlock() throws IOException { + int keyLength = nextKeyBuffer(); + if(keyLength > 0) { + currentValueBuffer(); + return true; + } + return false; + } private boolean rowFetched = false; @@ -1500,5 +1560,44 @@ CodecPool.returnDecompressor(keyDecompressor); } } + + /** + * return the KeyBuffer object used in the reader. Internally in each + * reader, there is only one KeyBuffer object, which gets reused for every + * block. + */ + public KeyBuffer getCurrentKeyBufferObj() { + return this.currentKey; + } + + /** + * return the ValueBuffer object used in the reader. Internally in each + * reader, there is only one ValueBuffer object, which gets reused for every + * block. 
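+   *
+   * Illustrative sketch (fs, path and conf stand for an open FileSystem, the
+   * file Path and a Configuration; they are not defined here): a caller can
+   * walk a file block by block with the accessors below.
+   *
+   * <pre>
+   * RCFile.Reader reader = new RCFile.Reader(fs, path, conf);
+   * while (reader.nextBlock()) {
+   *   KeyBuffer key = reader.getCurrentKeyBufferObj();
+   *   ValueBuffer value = reader.getCurrentValueBufferObj();
+   *   // both objects are reused by the reader, so copy them before the
+   *   // next call if they need to be kept
+   * }
+   * reader.close();
+   * </pre>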
+ */ + public ValueBuffer getCurrentValueBufferObj() { + return this.currentValue; + } + + //return the current block's length + public int getCurrentBlockLength() { + return this.currentRecordLength; + } + + //return the current block's key length + public int getCurrentKeyLength() { + return this.currentKeyLength; + } + + //return the current block's compressed key length + public int getCurrentCompressedKeyLen() { + return this.compressedKeyLen; + } + + //return the CompressionCodec used for this file + public CompressionCodec getCompressionCodec() { + return this.codec; + } + } } Index: ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/BlockMergeTask.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/BlockMergeTask.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/BlockMergeTask.java (revision 0) @@ -0,0 +1,359 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.rcfile.merge; + +import java.io.IOException; +import java.io.Serializable; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Enumeration; +import java.util.List; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.DriverContext; +import org.apache.hadoop.hive.ql.QueryPlan; +import org.apache.hadoop.hive.ql.exec.HadoopJobExecHelper; +import org.apache.hadoop.hive.ql.exec.HadoopJobExecHook; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.Throttle; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; +import org.apache.hadoop.hive.ql.plan.api.StageType; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.ql.session.SessionState.LogHelper; +import org.apache.hadoop.hive.shims.ShimLoader; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.log4j.Appender; +import org.apache.log4j.FileAppender; +import org.apache.log4j.LogManager; + +@SuppressWarnings( { "deprecation", "unchecked" }) +public class BlockMergeTask 
extends Task implements Serializable, + HadoopJobExecHook { + + private static final long serialVersionUID = 1L; + + protected transient JobConf job; + protected HadoopJobExecHelper jobExecHelper; + + @Override + public void initialize(HiveConf conf, QueryPlan queryPlan, + DriverContext driverContext) { + super.initialize(conf, queryPlan, driverContext); + job = new JobConf(conf, BlockMergeTask.class); + jobExecHelper = new HadoopJobExecHelper(job, this.console, this, this); + } + + boolean success = true; + + @Override + /** + * start a new map-reduce job to do the merge, almost the same as ExecDriver. + */ + public int execute(DriverContext driverContext) { + HiveConf.setVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT, + CombineHiveInputFormat.class.getName()); + success = true; + ShimLoader.getHadoopShims().setNullOutputFormat(job); + job.setMapperClass(work.getMapperClass()); + + Context ctx = driverContext.getCtx(); + boolean ctxCreated = false; + try { + if (ctx == null) { + ctx = new Context(job); + ctxCreated = true; + } + }catch (IOException e) { + e.printStackTrace(); + console.printError("Error launching map-reduce job", "\n" + + org.apache.hadoop.util.StringUtils.stringifyException(e)); + return 5; + } + + job.setMapOutputKeyClass(NullWritable.class); + job.setMapOutputValueClass(NullWritable.class); + if(work.getNumMapTasks() != null) { + job.setNumMapTasks(work.getNumMapTasks()); + } + + if (work.getMinSplitSize() != null) { + HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMINSPLITSIZE, work + .getMinSplitSize().longValue()); + } + + if (work.getInputformat() != null) { + HiveConf.setVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT, work + .getInputformat()); + } + + String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT); + if ((inpFormat == null) || (!StringUtils.isNotBlank(inpFormat))) { + inpFormat = ShimLoader.getHadoopShims().getInputFormatClassName(); + } + + LOG.info("Using " + inpFormat); + + try { + job.setInputFormat((Class) (Class + .forName(inpFormat))); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e.getMessage()); + } + + String outputPath = this.work.getOutputDir(); + RCFileBlockMergeOutputFormat.setMergeOutputPath(job, new Path(outputPath)); + + job.setOutputKeyClass(NullWritable.class); + job.setOutputValueClass(NullWritable.class); + + int returnVal = 0; + RunningJob rj = null; + boolean noName = StringUtils.isEmpty(HiveConf.getVar(job, + HiveConf.ConfVars.HADOOPJOBNAME)); + + if (noName) { + // This is for a special case to ensure unit tests pass + HiveConf.setVar(job, HiveConf.ConfVars.HADOOPJOBNAME, "JOB" + + Utilities.randGen.nextInt()); + } + + try { + addInputPaths(job, work); + + Utilities.setMapRedWork(job, work, ctx.getMRTmpFileURI()); + + // remove the pwd from conf file so that job tracker doesn't show this + // logs + String pwd = HiveConf.getVar(job, HiveConf.ConfVars.METASTOREPWD); + if (pwd != null) { + HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE"); + } + JobClient jc = new JobClient(job); + + // make this client wait if job trcker is not behaving well. + Throttle.checkJobTracker(job, LOG); + + // Finally SUBMIT the JOB! 
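+      // submitJob() returns immediately with a RunningJob handle; the
+      // jobExecHelper.progress() call below monitors that handle until the
+      // job completes, and its return value becomes this task's exit code.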
+ rj = jc.submitJob(job); + + returnVal = jobExecHelper.progress(rj, jc); + success = (returnVal == 0); + + } catch (Exception e) { + e.printStackTrace(); + String mesg = " with exception '" + Utilities.getNameMessage(e) + "'"; + if (rj != null) { + mesg = "Ended Job = " + rj.getJobID() + mesg; + } else { + mesg = "Job Submission failed" + mesg; + } + + // Has to use full name to make sure it does not conflict with + // org.apache.commons.lang.StringUtils + console.printError(mesg, "\n" + + org.apache.hadoop.util.StringUtils.stringifyException(e)); + + success = false; + returnVal = 1; + } finally { + try { + if (ctxCreated) { + ctx.clear(); + } + if (rj != null) { + if (returnVal != 0) { + rj.killJob(); + } + HadoopJobExecHelper.runningJobKillURIs.remove(rj.getJobID()); + } + RCFileMergeMapper.jobClose(outputPath, noName, job, console); + } catch (Exception e) { + } + } + + return (returnVal); + } + + private void addInputPaths(JobConf job, MergeWork work) { + for (String path : work.getInputPaths()) { + FileInputFormat.addInputPath(job, new Path(path)); + } + } + + @Override + public String getName() { + return "RCFile Merge"; + } + + public static String INPUT_SEPERATOR = ":"; + + public static void main(String[] args) { + + ArrayList jobConfArgs = new ArrayList(); + + String inputPathStr = null; + String outputDir = null; + + try { + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-input")) { + inputPathStr = args[++i]; + } else if (args[i].equals("-jobconf")) { + jobConfArgs.add(args[++i]); + } else if (args[i].equals("-outputDir")) { + outputDir = args[++i]; + } + } + } catch (IndexOutOfBoundsException e) { + System.err.println("Missing argument to option"); + printUsage(); + } + + if (inputPathStr == null || outputDir == null + || outputDir.trim().equals("")) { + printUsage(); + } + + List inputPaths = new ArrayList(); + String[] paths = inputPathStr.split(INPUT_SEPERATOR); + if (paths == null || paths.length == 0) { + printUsage(); + } + + FileSystem fs = null; + JobConf conf = new JobConf(BlockMergeTask.class); + HiveConf hiveConf = new HiveConf(conf, BlockMergeTask.class); + for (String path : paths) { + try { + Path pathObj = new Path(path); + if (fs == null) { + fs = FileSystem.get(pathObj.toUri(), conf); + } + FileStatus fstatus = fs.getFileStatus(pathObj); + if (fstatus.isDir()) { + FileStatus[] fileStatus = fs.listStatus(pathObj); + for (FileStatus st : fileStatus) { + inputPaths.add(st.getPath().toString()); + } + } else { + inputPaths.add(fstatus.getPath().toString()); + } + } catch (IOException e) { + e.printStackTrace(System.err); + } + } + + StringBuilder sb = new StringBuilder("JobConf:\n"); + + for (String one : jobConfArgs) { + int eqIndex = one.indexOf('='); + if (eqIndex != -1) { + try { + String key = one.substring(0, eqIndex); + String value = URLDecoder.decode(one.substring(eqIndex + 1), "UTF-8"); + conf.set(key, value); + sb.append(key).append("=").append(value).append("\n"); + } catch (UnsupportedEncodingException e) { + System.err.println("Unexpected error " + e.getMessage() + + " while encoding " + one.substring(eqIndex + 1)); + System.exit(3); + } + } + } + + Log LOG = LogFactory.getLog(BlockMergeTask.class.getName()); + boolean isSilent = HiveConf.getBoolVar(conf, + HiveConf.ConfVars.HIVESESSIONSILENT); + LogHelper console = new LogHelper(LOG, isSilent); + + // print out the location of the log file for the user so + // that it's easy to find reason for local mode execution failures + for (Appender appender : Collections + .list((Enumeration) 
LogManager.getRootLogger() + .getAllAppenders())) { + if (appender instanceof FileAppender) { + console.printInfo("Execution log at: " + + ((FileAppender) appender).getFile()); + } + } + + // log the list of job conf parameters for reference + LOG.info(sb.toString()); + + MergeWork mergeWork = new MergeWork(inputPaths, outputDir); + DriverContext driverCxt = new DriverContext(); + BlockMergeTask taskExec = new BlockMergeTask(); + taskExec.initialize(hiveConf, null, driverCxt); + taskExec.setWork(mergeWork); + int ret = taskExec.execute(driverCxt); + + if (ret != 0) { + System.exit(2); + } + + } + + private static void printUsage() { + System.err.println("BlockMergeTask -input " + + "-outputDir outputDir [-jobconf k1=v1 [-jobconf k2=v2] ...] "); + System.exit(1); + } + + @Override + public StageType getType() { + return StageType.MAPRED; + } + + @Override + public boolean checkFatalErrors(Counters ctrs, StringBuilder errMsg) { + return false; + } + + @Override + public void logPlanProgress(SessionState ss) throws IOException { + // no op + } + + @Override + public void updateCounters(Counters ctrs, RunningJob rj) throws IOException { + // no op + } + + @Override + protected void localizeMRTmpFilesImpl(Context ctx) { + // no op + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/MergeWork.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/MergeWork.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/MergeWork.java (revision 0) @@ -0,0 +1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.io.rcfile.merge; + +import java.io.Serializable; +import java.util.LinkedHashMap; +import java.util.List; + +import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; +import org.apache.hadoop.hive.ql.plan.MapredWork; +import org.apache.hadoop.hive.ql.plan.PartitionDesc; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.Mapper; + +public class MergeWork extends MapredWork implements Serializable { + + private static final long serialVersionUID = 1L; + + private List inputPaths; + private String outputDir; + + public MergeWork() { + } + + public MergeWork(List inputPaths, String outputDir) { + super(); + this.inputPaths = inputPaths; + this.outputDir = outputDir; + PartitionDesc partDesc = new PartitionDesc(); + partDesc.setInputFileFormatClass(RCFileBlockMergeInputFormat.class); + if(this.getPathToPartitionInfo() == null) { + this.setPathToPartitionInfo(new LinkedHashMap()); + } + for(String path: this.inputPaths) { + this.getPathToPartitionInfo().put(path, partDesc); + } + } + + public List getInputPaths() { + return inputPaths; + } + + public void setInputPaths(List inputPaths) { + this.inputPaths = inputPaths; + } + + public String getOutputDir() { + return outputDir; + } + + public void setOutputDir(String outputDir) { + this.outputDir = outputDir; + } + + public Class getMapperClass() { + return RCFileMergeMapper.class; + } + + public Long getMinSplitSize() { + return null; + } + + public String getInputformat() { + return CombineHiveInputFormat.class.getName(); + } + + public boolean isGatheringStats() { + return false; + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileBlockMergeInputFormat.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileBlockMergeInputFormat.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileBlockMergeInputFormat.java (revision 0) @@ -0,0 +1,41 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.io.rcfile.merge; + +import java.io.IOException; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; + +@SuppressWarnings({ "deprecation", "unchecked" }) +public class RCFileBlockMergeInputFormat extends FileInputFormat { + + @Override + public RecordReader getRecordReader(InputSplit split, JobConf job, + Reporter reporter) throws IOException { + + reporter.setStatus(split.toString()); + + return new RCFileBlockMergeRecordReader(job, (FileSplit) split); + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileBlockMergeOutputFormat.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileBlockMergeOutputFormat.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileBlockMergeOutputFormat.java (revision 0) @@ -0,0 +1,48 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.rcfile.merge; + +import java.io.IOException; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordWriter; +import org.apache.hadoop.util.Progressable; + +public class RCFileBlockMergeOutputFormat extends + FileOutputFormat { + + public static void setMergeOutputPath(JobConf job, Path path) { + job.set("hive.rcfile.merge.output.dir", path.toString()); + } + + public static Path getMergeOutputPath(JobConf conf) { + String name = conf.get("hive.rcfile.merge.output.dir"); + return name == null ? null: new Path(name); + } + + @Override + public RecordWriter getRecordWriter( + FileSystem ignored, JobConf job, String name, Progressable progress) + throws IOException { + throw new RuntimeException("Not implemented."); + } +} \ No newline at end of file Index: ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileBlockMergeRecordReader.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileBlockMergeRecordReader.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileBlockMergeRecordReader.java (revision 0) @@ -0,0 +1,136 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.rcfile.merge; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.RCFile; +import org.apache.hadoop.hive.ql.io.RCFile.Reader; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.RecordReader; + +@SuppressWarnings("deprecation") +public class RCFileBlockMergeRecordReader implements + RecordReader { + + private final Reader in; + private final long start; + private final long end; + private boolean more = true; + protected Configuration conf; + + public RCFileBlockMergeRecordReader(Configuration conf, FileSplit split) + throws IOException { + Path path = split.getPath(); + FileSystem fs = path.getFileSystem(conf); + this.in = new RCFile.Reader(fs, path, conf); + this.end = split.getStart() + split.getLength(); + this.conf = conf; + + if (split.getStart() > in.getPosition()) { + in.sync(split.getStart()); // sync to start + } + + this.start = in.getPosition(); + more = start < end; + } + + public Class getKeyClass() { + return RCFileKeyBufferWrapper.class; + } + + public Class getValueClass() { + return RCFileValueBufferWrapper.class; + } + + public RCFileKeyBufferWrapper createKey() { + return new RCFileKeyBufferWrapper(); + } + + public RCFileValueBufferWrapper createValue() { + return new RCFileValueBufferWrapper(); + } + + @Override + public boolean next(RCFileKeyBufferWrapper key, RCFileValueBufferWrapper value) + throws IOException { + more = nextBlock(key, value); + return more; + } + + protected boolean nextBlock(RCFileKeyBufferWrapper keyWrapper, RCFileValueBufferWrapper valueWrapper) + throws IOException { + if (!more) { + return false; + } + + more = in.nextBlock(); + if (!more) { + return false; + } + + keyWrapper.keyBuffer = this.in.getCurrentKeyBufferObj(); + keyWrapper.recordLength = this.in.getCurrentBlockLength(); + keyWrapper.keyLength = this.in.getCurrentKeyLength(); + keyWrapper.compressedKeyLength = this.in.getCurrentCompressedKeyLen(); + keyWrapper.codec = this.in.getCompressionCodec(); + + valueWrapper.valueBuffer = this.in.getCurrentValueBufferObj(); + + long lastSeenSyncPos = in.lastSeenSyncPos(); + if (lastSeenSyncPos >= end) { + more = false; + return more; + } + return more; + } + + /** + * Return the progress within the input split. 
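+   * Progress is (currentPosition - start) / (end - start), clamped to 1.0;
+   * a zero-length split reports 0.0.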
+ * + * @return 0.0 to 1.0 of the input byte range + */ + public float getProgress() throws IOException { + if (end == start) { + return 0.0f; + } else { + return Math.min(1.0f, (in.getPosition() - start) / (float) (end - start)); + } + } + + public long getPos() throws IOException { + return in.getPosition(); + } + + protected void seek(long pos) throws IOException { + in.seek(pos); + } + + public long getStart() { + return start; + } + + public void close() throws IOException { + in.close(); + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileKeyBufferWrapper.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileKeyBufferWrapper.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileKeyBufferWrapper.java (revision 0) @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.rcfile.merge; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.hive.ql.io.RCFile.KeyBuffer; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.io.compress.CompressionCodec; + +public class RCFileKeyBufferWrapper implements + WritableComparable { + + protected KeyBuffer keyBuffer; + protected int recordLength; + protected int keyLength; + protected int compressedKeyLength; + + protected CompressionCodec codec; + + protected RCFileKeyBufferWrapper() { + } + + public static RCFileKeyBufferWrapper create(KeyBuffer currentKeyBufferObj) { + RCFileKeyBufferWrapper obj = new RCFileKeyBufferWrapper(); + obj.keyBuffer = currentKeyBufferObj; + return obj; + } + + @Override + public void readFields(DataInput in) throws IOException { + throw new RuntimeException("Not supported."); + } + + @Override + public void write(DataOutput out) throws IOException { + throw new RuntimeException("Not supported."); + } + + @Override + public int compareTo(RCFileKeyBufferWrapper o) { + return this.keyBuffer.compareTo(o.keyBuffer); + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileMergeMapper.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileMergeMapper.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileMergeMapper.java (revision 0) @@ -0,0 +1,166 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.rcfile.merge; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.io.RCFile; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.session.SessionState.LogHelper; +import org.apache.hadoop.hive.shims.CombineHiveKey; +import org.apache.hadoop.hive.shims.ShimLoader; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +@SuppressWarnings("deprecation") +public class RCFileMergeMapper extends MapReduceBase implements + Mapper { + + private JobConf jc; + Class outputClass; + RCFile.Writer outWriter; + + Path finalPath; + FileSystem fs; + + boolean exception = false; + boolean autoDelete = false; + Path outPath; + + CompressionCodec codec = null; + int columnNumber = 0; + + public final static Log LOG = LogFactory.getLog("RCFileMergeMapper"); + + public RCFileMergeMapper() { + } + + public void configure(JobConf job) { + jc = job; + String specPath = RCFileBlockMergeOutputFormat.getMergeOutputPath(job) + .toString(); + Path tmpPath = Utilities.toTempPath(specPath); + String taskId = Utilities.getTaskId(job); + finalPath = new Path(tmpPath, taskId); + outPath = new Path(tmpPath, Utilities.toTempPath(taskId)); + try { + fs = (new Path(specPath)).getFileSystem(job); + autoDelete = ShimLoader.getHadoopShims().fileSystemDeleteOnExit(fs, + outPath); + } catch (IOException e) { + this.exception = true; + throw new RuntimeException(e); + } + } + + @Override + public void map(Object k, RCFileValueBufferWrapper value, + OutputCollector output, Reporter reporter) + throws IOException { + try { + + RCFileKeyBufferWrapper key = null; + if (k instanceof CombineHiveKey) { + key = (RCFileKeyBufferWrapper) ((CombineHiveKey) k).getKey(); + } else { + key = (RCFileKeyBufferWrapper) k; + } + + if (outWriter == null) { + codec = key.codec; + columnNumber = key.keyBuffer.getColumnNumber(); + jc.setInt(RCFile.COLUMN_NUMBER_CONF_STR, columnNumber); + outWriter = new RCFile.Writer(fs, jc, outPath, null, codec); + } + + boolean sameCodec = ((codec == key.codec) || codec.getClass().equals( + key.codec.getClass())); + + if ((key.keyBuffer.getColumnNumber() != columnNumber) || (!sameCodec)) { + throw new IOException( + "RCFileMerge failed because the input files use different CompressionCodec or have different column number setting."); + } + + outWriter.flushBlock(key.keyBuffer, value.valueBuffer, key.recordLength, + key.keyLength, 
key.compressedKeyLength); + } catch (Throwable e) { + this.exception = true; + close(); + throw new IOException(e); + } + } + + public void close() throws IOException { + // close writer + if (outWriter == null) { + return; + } + + outWriter.close(); + outWriter = null; + + if (!exception) { + FileStatus fss = fs.getFileStatus(outPath); + System.out.println("renamed path " + outPath + " to " + finalPath + + " . File size is " + fss.getLen()); + if (!fs.rename(outPath, finalPath)) { + throw new IOException("Unable to rename output to " + finalPath); + } + } else { + if (!autoDelete) { + fs.delete(outPath, true); + } + } + } + + public static String BACKUP_PREFIX = "_backup."; + + public static Path backupOutputPath(FileSystem fs, Path outpath, JobConf job) + throws IOException, HiveException { + if (fs.exists(outpath)) { + Path backupPath = new Path(outpath.getParent(), BACKUP_PREFIX + + outpath.getName()); + Utilities.rename(fs, outpath, backupPath); + return backupPath; + } else { + return null; + } + } + + public static void jobClose(String outputPath, boolean success, JobConf job, + LogHelper console) throws HiveException, IOException { + Path outpath = new Path(outputPath); + FileSystem fs = outpath.getFileSystem(job); + Path backupPath = backupOutputPath(fs, outpath, job); + Utilities.mvFileToFinalPath(outputPath, job, success, LOG, null, null); + fs.delete(backupPath, true); + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileValueBufferWrapper.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileValueBufferWrapper.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileValueBufferWrapper.java (revision 0) @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.io.rcfile.merge; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.hive.ql.io.RCFile.ValueBuffer; +import org.apache.hadoop.io.WritableComparable; + +public class RCFileValueBufferWrapper implements + WritableComparable { + + protected ValueBuffer valueBuffer; + + public RCFileValueBufferWrapper() { + } + + @Override + public void readFields(DataInput in) throws IOException { + throw new RuntimeException("Not supported."); + } + + @Override + public void write(DataOutput out) throws IOException { + throw new RuntimeException("Not supported."); + } + + @Override + public int compareTo(RCFileValueBufferWrapper o) { + return this.valueBuffer.compareTo(o.valueBuffer); + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java (revision 1073470) +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java (working copy) @@ -707,6 +707,13 @@ public boolean isView() { return TableType.VIRTUAL_VIEW.equals(getTableType()); } + + /** + * @return whether this table is actually an index table + */ + public boolean isIndexTable() { + return TableType.INDEX_TABLE.equals(getTableType()); + } /** * Creates a partition name -> value spec map object Index: ql/src/java/org/apache/hadoop/hive/ql/parse/AlterTablePartMergeFilesDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/AlterTablePartMergeFilesDesc.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/AlterTablePartMergeFilesDesc.java (revision 0) @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.parse; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import org.apache.hadoop.hive.ql.plan.Explain; + +@Explain(displayName = "Alter Table Partition Merge Files") +public class AlterTablePartMergeFilesDesc { + + private String tableName; + private HashMap partSpec; + + private List inputDir = new ArrayList(); + private String outputDir = null; + + public AlterTablePartMergeFilesDesc(String tableName, + HashMap partSpec) { + this.tableName = tableName; + this.partSpec = partSpec; + } + + @Explain(displayName = "table name") + public String getTableName() { + return tableName; + } + + public void setTableName(String tableName) { + this.tableName = tableName; + } + + @Explain(displayName = "partition desc") + public HashMap getPartSpec() { + return partSpec; + } + + public void setPartSpec(HashMap partSpec) { + this.partSpec = partSpec; + } + + public String getOutputDir() { + return outputDir; + } + + public void setOutputDir(String outputDir) { + this.outputDir = outputDir; + } + + public List getInputDir() { + return inputDir; + } + + public void setInputDir(List inputDir) { + this.inputDir = inputDir; + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java (revision 1073798) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java (working copy) @@ -603,7 +603,9 @@ public tableSpec(Hive db, HiveConf conf, ASTNode ast) throws SemanticException { - assert (ast.getToken().getType() == HiveParser.TOK_TAB || ast.getToken().getType() == HiveParser.TOK_TABTYPE); + assert (ast.getToken().getType() == HiveParser.TOK_TAB + || ast.getToken().getType() == HiveParser.TOK_TABLE_PARTITION + || ast.getToken().getType() == HiveParser.TOK_TABTYPE); int childIndex = 0; numDynParts = 0; Index: ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java (revision 1073951) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java (working copy) @@ -23,6 +23,7 @@ import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_IFNOTEXISTS; import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_SHOWDATABASES; +import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -50,12 +51,14 @@ import org.apache.hadoop.hive.ql.exec.FetchTask; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.TaskFactory; +import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.hooks.ReadEntity; import org.apache.hadoop.hive.ql.hooks.WriteEntity; import org.apache.hadoop.hive.ql.index.HiveIndex; import org.apache.hadoop.hive.ql.index.HiveIndexHandler; import org.apache.hadoop.hive.ql.index.HiveIndex.IndexType; import org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat; +import org.apache.hadoop.hive.ql.io.RCFileInputFormat; import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.HiveUtils; @@ -68,6 +71,13 @@ import org.apache.hadoop.hive.ql.plan.AlterTableSimpleDesc; import org.apache.hadoop.hive.ql.plan.CreateDatabaseDesc; import org.apache.hadoop.hive.ql.plan.CreateIndexDesc; +import 
org.apache.hadoop.hive.ql.plan.GrantDesc; +import org.apache.hadoop.hive.ql.plan.GrantRevokeRoleDDL; +import org.apache.hadoop.hive.ql.plan.PrincipalDesc; +import org.apache.hadoop.hive.ql.plan.PrivilegeDesc; +import org.apache.hadoop.hive.ql.plan.PrivilegeObjectDesc; +import org.apache.hadoop.hive.ql.plan.RevokeDesc; +import org.apache.hadoop.hive.ql.plan.RoleDDLDesc; import org.apache.hadoop.hive.ql.plan.DDLWork; import org.apache.hadoop.hive.ql.plan.DescDatabaseDesc; import org.apache.hadoop.hive.ql.plan.DescFunctionDesc; @@ -76,15 +86,8 @@ import org.apache.hadoop.hive.ql.plan.DropIndexDesc; import org.apache.hadoop.hive.ql.plan.DropTableDesc; import org.apache.hadoop.hive.ql.plan.FetchWork; -import org.apache.hadoop.hive.ql.plan.GrantDesc; -import org.apache.hadoop.hive.ql.plan.GrantRevokeRoleDDL; import org.apache.hadoop.hive.ql.plan.LockTableDesc; import org.apache.hadoop.hive.ql.plan.MsckDesc; -import org.apache.hadoop.hive.ql.plan.PrincipalDesc; -import org.apache.hadoop.hive.ql.plan.PrivilegeDesc; -import org.apache.hadoop.hive.ql.plan.PrivilegeObjectDesc; -import org.apache.hadoop.hive.ql.plan.RevokeDesc; -import org.apache.hadoop.hive.ql.plan.RoleDDLDesc; import org.apache.hadoop.hive.ql.plan.ShowDatabasesDesc; import org.apache.hadoop.hive.ql.plan.ShowFunctionsDesc; import org.apache.hadoop.hive.ql.plan.ShowGrantDesc; @@ -93,6 +96,7 @@ import org.apache.hadoop.hive.ql.plan.ShowPartitionsDesc; import org.apache.hadoop.hive.ql.plan.ShowTableStatusDesc; import org.apache.hadoop.hive.ql.plan.ShowTablesDesc; +import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.SwitchDatabaseDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.UnlockTableDesc; @@ -103,6 +107,7 @@ import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.serde.Constants; import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; +import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.TextInputFormat; /** @@ -171,7 +176,8 @@ switch(ast.getToken().getType()) { case HiveParser.TOK_ALTERTABLE_PARTITION: { - TablePartition tblPart = new TablePartition((ASTNode)ast.getChild(0)); + ASTNode tablePart = (ASTNode)ast.getChild(0); + TablePartition tblPart = new TablePartition(tablePart); String tableName = tblPart.tableName; HashMap partSpec = tblPart.partSpec; ast = (ASTNode)ast.getChild(1); @@ -181,6 +187,8 @@ analyzeAlterTableProtectMode(ast, tableName, partSpec); } else if (ast.getToken().getType() == HiveParser.TOK_ALTERTABLE_LOCATION) { analyzeAlterTableLocation(ast, tableName, partSpec); + } else if (ast.getToken().getType() == HiveParser.TOK_ALTERTABLE_ALTERPARTS_MERGEFILES) { + analyzeAlterTablePartMergeFiles(tablePart, ast, tableName, partSpec); } break; } @@ -1067,6 +1075,89 @@ rootTasks.add(TaskFactory.get(new DDLWork(getInputs(), getOutputs(), alterTblDesc), conf)); } + + private void analyzeAlterTablePartMergeFiles(ASTNode tablePartAST, ASTNode ast, + String tableName, HashMap partSpec) + throws SemanticException { + AlterTablePartMergeFilesDesc mergeDesc = new AlterTablePartMergeFilesDesc( + tableName, partSpec); + + List inputDir = new ArrayList(); + String outputDir = null; + + try { + Table tblObj = db.getTable(tableName); + + List bucketCols = null; + Class inputFormatClass = null; + boolean isArchived = false; + List indexes = db.getIndexes(tblObj.getDbName(), tableName, + Short.MAX_VALUE); + if (indexes != null && indexes.size() > 0) { + throw new SemanticException("can not do merge 
because source table " + + tableName + " is indexed."); + } + + if (tblObj.isPartitioned()) { + if (partSpec == null) { + throw new SemanticException("source table " + tableName + + " is partitioned but no partition desc found."); + } else { + Partition part = db.getPartition(tblObj, partSpec, false); + if (part == null) { + throw new SemanticException("source table " + tableName + + " is partitioned but partition not found."); + } + bucketCols = part.getBucketCols(); + inputFormatClass = part.getInputFormatClass(); + isArchived = Utilities.isArchived(part); + outputDir = part.getDataLocation().toString(); + } + } else { + inputFormatClass = tblObj.getInputFormatClass(); + bucketCols = tblObj.getBucketCols(); + outputDir = tblObj.getDataLocation().toString(); + } + + // throw a HiveException for non-rcfile. + if (!inputFormatClass.equals(RCFileInputFormat.class)) { + throw new SemanticException( + "Only RCFileFormat is supportted right now."); + } + + // throw a HiveException if the table/partition is bucketized + if (bucketCols != null && bucketCols.size() > 0) { + throw new SemanticException( + "Merge can not perform on bucketized partition/table."); + } + + // throw a HiveException if the table/partition is archived + if (isArchived) { + throw new SemanticException( + "Merge can not perform on archived partitions."); + } + } catch (HiveException e) { + throw new SemanticException(e); + } + + // input and output are the same + inputDir.add(outputDir); + + mergeDesc.setInputDir(inputDir); + mergeDesc.setOutputDir(outputDir); + + addInputsOutputsAlterTable(tableName, partSpec); + Task mergeTask = TaskFactory.get(new DDLWork( + getInputs(), getOutputs(), mergeDesc), conf); + + tableSpec tablepart = new tableSpec(this.db, conf, tablePartAST); + StatsWork statDesc = new StatsWork(tablepart); + statDesc.setNoStatsAggregator(true); + Task statTask = TaskFactory.get(statDesc, conf); + mergeTask.addDependentTask(statTask); + + rootTasks.add(mergeTask); + } private void analyzeAlterTableClusterSort(ASTNode ast) throws SemanticException { Index: ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g (revision 1073798) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g (working copy) @@ -238,6 +238,7 @@ TOK_DATABASEPROPERTIES; TOK_DBPROPLIST; TOK_ALTERDATABASE_PROPERTIES; +TOK_ALTERTABLE_ALTERPARTS_MERGEFILES; TOK_TABNAME; TOK_TABSRC; } @@ -671,6 +672,7 @@ : alterStatementSuffixFileFormat | alterStatementSuffixLocation | alterStatementSuffixProtectMode + | alterStatementSuffixMergeFiles ; alterStatementSuffixFileFormat @@ -694,6 +696,13 @@ -> ^(TOK_ALTERTABLE_ALTERPARTS_PROTECTMODE alterProtectMode) ; +alterStatementSuffixMergeFiles +@init { msgs.push(""); } +@after { msgs.pop(); } + : KW_CONCATENATE + -> ^(TOK_ALTERTABLE_ALTERPARTS_MERGEFILES) + ; + alterProtectMode @init { msgs.push("protect mode specification enable"); } @after { msgs.pop(); } @@ -2199,6 +2208,7 @@ KW_STATISTICS: 'STATISTICS'; KW_USE: 'USE'; KW_OPTION: 'OPTION'; +KW_CONCATENATE: 'CONCATENATE'; KW_SHOW_DATABASE: 'SHOW_DATABASE'; KW_UPDATE: 'UPDATE'; Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzerFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzerFactory.java (revision 1073470) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzerFactory.java (working copy) @@ -99,6 +99,9 @@ 
tablePartitionCommandType.put(HiveParser.TOK_ALTERTABLE_LOCATION, new HiveOperation[] { HiveOperation.ALTERTABLE_LOCATION, HiveOperation.ALTERPARTITION_LOCATION }); + tablePartitionCommandType.put(HiveParser.TOK_ALTERTABLE_ALTERPARTS_MERGEFILES, + new HiveOperation[] {HiveOperation.ALTERTABLE_MERGEFILES, + HiveOperation.ALTERPARTITION_MERGEFILES }); } public static BaseSemanticAnalyzer get(HiveConf conf, ASTNode tree) Index: ql/src/java/org/apache/hadoop/hive/ql/plan/DDLWork.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/DDLWork.java (revision 1073470) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/DDLWork.java (working copy) @@ -23,6 +23,7 @@ import org.apache.hadoop.hive.ql.hooks.ReadEntity; import org.apache.hadoop.hive.ql.hooks.WriteEntity; +import org.apache.hadoop.hive.ql.parse.AlterTablePartMergeFilesDesc; /** * DDLWork. @@ -73,6 +74,7 @@ * List of WriteEntities that are passed to the hooks. */ protected HashSet outputs; + private AlterTablePartMergeFilesDesc mergeFilesDesc; public DDLWork() { } @@ -376,6 +378,12 @@ this.showIndexesDesc = showIndexesDesc; } + public DDLWork(HashSet inputs, HashSet outputs, + AlterTablePartMergeFilesDesc mergeDesc) { + this(inputs, outputs); + this.mergeFilesDesc = mergeDesc; + } + /** * @return Create Database descriptor */ @@ -854,4 +862,19 @@ public AlterDatabaseDesc getAlterDatabaseDesc() { return this.alterDbDesc; } + + /** + * @return descriptor for merging files + */ + public AlterTablePartMergeFilesDesc getMergeFilesDesc() { + return mergeFilesDesc; + } + + /** + * @param mergeDesc descriptor of merging files + */ + public void setMergeFilesDesc(AlterTablePartMergeFilesDesc mergeDesc) { + this.mergeFilesDesc = mergeDesc; + } + } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/HiveOperation.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/HiveOperation.java (revision 1073470) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/HiveOperation.java (working copy) @@ -80,7 +80,9 @@ QUERY("QUERY", new Privilege[]{Privilege.SELECT}, new Privilege[]{Privilege.ALTER_DATA, Privilege.CREATE}), ALTERINDEX_PROPS("ALTERINDEX_PROPS",null, null), ALTERDATABASE("ALTERDATABASE", null, null), - DESCDATABASE("DESCDATABASE", null, null), + DESCDATABASE("DESCDATABASE", null, null), + ALTERTABLE_MERGEFILES("ALTER_TABLE_MERGE", new Privilege[] { Privilege.SELECT }, new Privilege[] { Privilege.ALTER_DATA }), + ALTERPARTITION_MERGEFILES("ALTER_PARTITION_MERGE", new Privilege[] { Privilege.SELECT }, new Privilege[] { Privilege.ALTER_DATA }), ; private String operationName; Index: ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java (revision 1073470) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java (working copy) @@ -279,6 +279,9 @@ * operator - but could be useful for debugging as well. 
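+   * Returns early when aliasToWork is null; slimmed-down MapredWork
+   * subclasses such as the RCFile MergeWork introduced in this patch populate
+   * pathToPartitionInfo but never set aliasToWork.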
*/ private void setAliases() { + if(aliasToWork == null) { + return; + } for (String oneAlias : aliasToWork.keySet()) { aliasToWork.get(oneAlias).setAlias(oneAlias); } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/StatsWork.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/StatsWork.java (revision 1073470) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/StatsWork.java (working copy) @@ -33,6 +33,8 @@ private tableSpec tableSpecs; // source table spec -- for TableScanOperator private LoadTableDesc loadTableDesc; // same as MoveWork.loadTableDesc -- for FileSinkOperator private String aggKey; // aggregation key prefix + + private boolean noStatsAggregator = false; public StatsWork() { } @@ -62,4 +64,12 @@ return aggKey; } + public boolean getNoStatsAggregator() { + return noStatsAggregator; + } + + public void setNoStatsAggregator(boolean noStatsAggregator) { + this.noStatsAggregator = noStatsAggregator; + } + }
Index: ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java =================================================================== --- ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java (revision 1073470) +++ ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java (working copy) @@ -52,6 +52,7 @@ import org.apache.hadoop.hive.cli.CliSessionState; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.MetaStoreUtils; +import org.apache.hadoop.hive.metastore.api.Index; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.Utilities; @@ -330,7 +331,21 @@ db.setCurrentDatabase(dbName); for (String tblName : db.getAllTables()) { if (!DEFAULT_DATABASE_NAME.equals(dbName) || !srcTables.contains(tblName)) { + Table tblObj = db.getTable(tblName); + // an index table can not be dropped directly. Dropping the base + // table will automatically drop all its index tables + if(tblObj.isIndexTable()) { + continue; + } db.dropTable(dbName, tblName); + } else { + // this table is defined in srcTables; drop all indexes on it + List indexes = db.getIndexes(dbName, tblName, (short)-1); + if (indexes != null && indexes.size() > 0) { + for (Index index : indexes) { + db.dropIndex(dbName, tblName, index.getIndexName(), true); + } + } } } if (!DEFAULT_DATABASE_NAME.equals(dbName)) {
Index: ql/src/test/queries/clientnegative/merge_negative_1.q =================================================================== --- ql/src/test/queries/clientnegative/merge_negative_1.q (revision 0) +++ ql/src/test/queries/clientnegative/merge_negative_1.q (revision 0) @@ -0,0 +1,3 @@ +create table src2 like src; +CREATE INDEX src_index_merge_test ON TABLE src2(key) as 'COMPACT' WITH DEFERRED REBUILD; +alter table src2 concatenate;
Index: ql/src/test/queries/clientnegative/merge_negative_2.q =================================================================== --- ql/src/test/queries/clientnegative/merge_negative_2.q (revision 0) +++ ql/src/test/queries/clientnegative/merge_negative_2.q (revision 0) @@ -0,0 +1,3 @@ +create table srcpart2 (key int, value string) partitioned by (ds string); +insert overwrite table srcpart2 partition (ds='2011') select * from src; +alter table srcpart2 concatenate;
Index: ql/src/test/queries/clientpositive/alter_merge.q =================================================================== --- ql/src/test/queries/clientpositive/alter_merge.q (revision 0) +++ ql/src/test/queries/clientpositive/alter_merge.q (revision 0) @@ -0,0 +1,41 @@ +create table src_rc_merge_test(key int, value string) stored as rcfile; + +load data local inpath '../data/files/smbbucket_1.rc' into table src_rc_merge_test; +load data local inpath '../data/files/smbbucket_2.rc' into table src_rc_merge_test; +load data local inpath '../data/files/smbbucket_3.rc' into table src_rc_merge_test; + +show table extended like `src_rc_merge_test`; + +select count(1) from src_rc_merge_test; +select sum(hash(key)), sum(hash(value)) from src_rc_merge_test; + +alter table src_rc_merge_test concatenate; + +show table extended like `src_rc_merge_test`; + +select count(1) from src_rc_merge_test; +select sum(hash(key)), sum(hash(value)) from src_rc_merge_test; + + +create table src_rc_merge_test_part(key int, value string) partitioned by (ds string) stored as rcfile; + +alter table src_rc_merge_test_part add partition (ds='2011'); + +load data local inpath '../data/files/smbbucket_1.rc' into table src_rc_merge_test_part partition (ds='2011'); +load data local inpath '../data/files/smbbucket_2.rc' into table src_rc_merge_test_part partition (ds='2011'); +load data local inpath '../data/files/smbbucket_3.rc' into table src_rc_merge_test_part partition (ds='2011'); + +show table extended like `src_rc_merge_test_part` partition (ds='2011'); + +select count(1) from src_rc_merge_test_part; +select sum(hash(key)), sum(hash(value)) from src_rc_merge_test_part; + +alter table src_rc_merge_test_part partition (ds='2011') concatenate; + +show table extended like `src_rc_merge_test_part` partition (ds='2011'); + +select count(1) from src_rc_merge_test_part; +select sum(hash(key)), sum(hash(value)) from src_rc_merge_test_part; + +drop table src_rc_merge_test; +drop table src_rc_merge_test_part; \ No newline at end of file Index: ql/src/test/queries/clientpositive/alter_merge_stats.q ===================================================================
--- ql/src/test/queries/clientpositive/alter_merge_stats.q (revision 0) +++ ql/src/test/queries/clientpositive/alter_merge_stats.q (revision 0) @@ -0,0 +1,41 @@ +create table src_rc_merge_test_stat(key int, value string) stored as rcfile; + +load data local inpath '../data/files/smbbucket_1.rc' into table src_rc_merge_test_stat; +load data local inpath '../data/files/smbbucket_2.rc' into table src_rc_merge_test_stat; +load data local inpath '../data/files/smbbucket_3.rc' into table src_rc_merge_test_stat; + +show table extended like `src_rc_merge_test_stat`; +desc extended src_rc_merge_test_stat; + +analyze table src_rc_merge_test_stat compute statistics; + +desc extended src_rc_merge_test_stat; + +alter table src_rc_merge_test_stat concatenate; + +show table extended like `src_rc_merge_test_stat`; +desc extended src_rc_merge_test_stat; + + +create table src_rc_merge_test_part_stat(key int, value string) partitioned by (ds string) stored as rcfile; + +alter table src_rc_merge_test_part_stat add partition (ds='2011'); + +load data local inpath '../data/files/smbbucket_1.rc' into table src_rc_merge_test_part_stat partition (ds='2011'); +load data local inpath '../data/files/smbbucket_2.rc' into table src_rc_merge_test_part_stat partition (ds='2011'); +load data local inpath '../data/files/smbbucket_3.rc' into table src_rc_merge_test_part_stat partition (ds='2011'); + +show table extended like `src_rc_merge_test_part_stat` partition (ds='2011'); +desc extended src_rc_merge_test_part_stat; + +analyze table src_rc_merge_test_part_stat partition(ds='2011') compute statistics; + +desc extended src_rc_merge_test_part_stat; + +alter table src_rc_merge_test_part_stat partition (ds='2011') concatenate; + +show table extended like `src_rc_merge_test_part_stat` partition (ds='2011'); +desc extended src_rc_merge_test_part_stat; + +drop table src_rc_merge_test_stat; +drop table src_rc_merge_test_part_stat; \ No newline at end of file Index: ql/src/test/results/clientnegative/merge_negative_1.q.out =================================================================== --- ql/src/test/results/clientnegative/merge_negative_1.q.out (revision 0) +++ ql/src/test/results/clientnegative/merge_negative_1.q.out (revision 0) @@ -0,0 +1,10 @@ +PREHOOK: query: create table src2 like src +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table src2 like src +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@src2 +PREHOOK: query: CREATE INDEX src_index_merge_test ON TABLE src2(key) as 'COMPACT' WITH DEFERRED REBUILD +PREHOOK: type: CREATEINDEX +POSTHOOK: query: CREATE INDEX src_index_merge_test ON TABLE src2(key) as 'COMPACT' WITH DEFERRED REBUILD +POSTHOOK: type: CREATEINDEX +FAILED: Error in semantic analysis: org.apache.hadoop.hive.ql.parse.SemanticException: can not do merge because source table src2 is indexed. 
Index: ql/src/test/results/clientnegative/merge_negative_2.q.out =================================================================== --- ql/src/test/results/clientnegative/merge_negative_2.q.out (revision 0) +++ ql/src/test/results/clientnegative/merge_negative_2.q.out (revision 0) @@ -0,0 +1,16 @@ +PREHOOK: query: create table srcpart2 (key int, value string) partitioned by (ds string) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table srcpart2 (key int, value string) partitioned by (ds string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@srcpart2 +PREHOOK: query: insert overwrite table srcpart2 partition (ds='2011') select * from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@srcpart2@ds=2011 +POSTHOOK: query: insert overwrite table srcpart2 partition (ds='2011') select * from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@srcpart2@ds=2011 +POSTHOOK: Lineage: srcpart2 PARTITION(ds=2011).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart2 PARTITION(ds=2011).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +FAILED: Error in semantic analysis: org.apache.hadoop.hive.ql.parse.SemanticException: source table srcpart2 is partitioned but no partition desc found. Index: ql/src/test/results/clientpositive/alter_merge.q.out =================================================================== --- ql/src/test/results/clientpositive/alter_merge.q.out (revision 0) +++ ql/src/test/results/clientpositive/alter_merge.q.out (revision 0) @@ -0,0 +1,227 @@ +PREHOOK: query: create table src_rc_merge_test(key int, value string) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table src_rc_merge_test(key int, value string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@src_rc_merge_test +PREHOOK: query: load data local inpath '../data/files/smbbucket_1.rc' into table src_rc_merge_test +PREHOOK: type: LOAD +POSTHOOK: query: load data local inpath '../data/files/smbbucket_1.rc' into table src_rc_merge_test +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_rc_merge_test +PREHOOK: query: load data local inpath '../data/files/smbbucket_2.rc' into table src_rc_merge_test +PREHOOK: type: LOAD +POSTHOOK: query: load data local inpath '../data/files/smbbucket_2.rc' into table src_rc_merge_test +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_rc_merge_test +PREHOOK: query: load data local inpath '../data/files/smbbucket_3.rc' into table src_rc_merge_test +PREHOOK: type: LOAD +POSTHOOK: query: load data local inpath '../data/files/smbbucket_3.rc' into table src_rc_merge_test +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_rc_merge_test +PREHOOK: query: show table extended like `src_rc_merge_test` +PREHOOK: type: SHOW_TABLESTATUS +POSTHOOK: query: show table extended like `src_rc_merge_test` +POSTHOOK: type: SHOW_TABLESTATUS +tableName:src_rc_merge_test +owner:heyongqiang +location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test +inputformat:org.apache.hadoop.hive.ql.io.RCFileInputFormat +outputformat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat +columns:struct columns { i32 key, string value} +partitioned:false +partitionColumns: +totalNumberFiles:3 +totalFileSize:636 +maxFileSize:222 +minFileSize:206 +lastAccessTime:0 +lastUpdateTime:1297289844000 + +PREHOOK: query: select count(1) from src_rc_merge_test +PREHOOK: type: QUERY +PREHOOK: 
Input: default@src_rc_merge_test +PREHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-17-25_132_6609659589044932947/-mr-10000 +POSTHOOK: query: select count(1) from src_rc_merge_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_rc_merge_test +POSTHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-17-25_132_6609659589044932947/-mr-10000 +15 +PREHOOK: query: select sum(hash(key)), sum(hash(value)) from src_rc_merge_test +PREHOOK: type: QUERY +PREHOOK: Input: default@src_rc_merge_test +PREHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-17-31_973_310810915733974504/-mr-10000 +POSTHOOK: query: select sum(hash(key)), sum(hash(value)) from src_rc_merge_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_rc_merge_test +POSTHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-17-31_973_310810915733974504/-mr-10000 +214 -7678496319 +PREHOOK: query: alter table src_rc_merge_test concatenate +PREHOOK: type: ALTER_TABLE_MERGE +PREHOOK: Input: default@src_rc_merge_test +PREHOOK: Output: default@src_rc_merge_test +POSTHOOK: query: alter table src_rc_merge_test concatenate +POSTHOOK: type: ALTER_TABLE_MERGE +POSTHOOK: Input: default@src_rc_merge_test +POSTHOOK: Output: default@src_rc_merge_test +PREHOOK: query: show table extended like `src_rc_merge_test` +PREHOOK: type: SHOW_TABLESTATUS +POSTHOOK: query: show table extended like `src_rc_merge_test` +POSTHOOK: type: SHOW_TABLESTATUS +tableName:src_rc_merge_test +owner:heyongqiang +location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test +inputformat:org.apache.hadoop.hive.ql.io.RCFileInputFormat +outputformat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat +columns:struct columns { i32 key, string value} +partitioned:false +partitionColumns: +totalNumberFiles:1 +totalFileSize:334 +maxFileSize:334 +minFileSize:334 +lastAccessTime:0 +lastUpdateTime:1297289860000 + +PREHOOK: query: select count(1) from src_rc_merge_test +PREHOOK: type: QUERY +PREHOOK: Input: default@src_rc_merge_test +PREHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-17-41_208_5207931311702930559/-mr-10000 +POSTHOOK: query: select count(1) from src_rc_merge_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_rc_merge_test +POSTHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-17-41_208_5207931311702930559/-mr-10000 +15 +PREHOOK: query: select sum(hash(key)), sum(hash(value)) from src_rc_merge_test +PREHOOK: type: QUERY +PREHOOK: Input: default@src_rc_merge_test +PREHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-17-47_722_8434907403740180333/-mr-10000 +POSTHOOK: query: select sum(hash(key)), sum(hash(value)) from src_rc_merge_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_rc_merge_test +POSTHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-17-47_722_8434907403740180333/-mr-10000 +214 -7678496319 +PREHOOK: query: create table src_rc_merge_test_part(key int, value string) partitioned by (ds string) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table src_rc_merge_test_part(key int, value string) partitioned by (ds string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: 
default@src_rc_merge_test_part +PREHOOK: query: alter table src_rc_merge_test_part add partition (ds='2011') +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Input: default@src_rc_merge_test_part +POSTHOOK: query: alter table src_rc_merge_test_part add partition (ds='2011') +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Input: default@src_rc_merge_test_part +POSTHOOK: Output: default@src_rc_merge_test_part@ds=2011 +PREHOOK: query: load data local inpath '../data/files/smbbucket_1.rc' into table src_rc_merge_test_part partition (ds='2011') +PREHOOK: type: LOAD +POSTHOOK: query: load data local inpath '../data/files/smbbucket_1.rc' into table src_rc_merge_test_part partition (ds='2011') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_rc_merge_test_part@ds=2011 +PREHOOK: query: load data local inpath '../data/files/smbbucket_2.rc' into table src_rc_merge_test_part partition (ds='2011') +PREHOOK: type: LOAD +POSTHOOK: query: load data local inpath '../data/files/smbbucket_2.rc' into table src_rc_merge_test_part partition (ds='2011') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_rc_merge_test_part@ds=2011 +PREHOOK: query: load data local inpath '../data/files/smbbucket_3.rc' into table src_rc_merge_test_part partition (ds='2011') +PREHOOK: type: LOAD +POSTHOOK: query: load data local inpath '../data/files/smbbucket_3.rc' into table src_rc_merge_test_part partition (ds='2011') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_rc_merge_test_part@ds=2011 +PREHOOK: query: show table extended like `src_rc_merge_test_part` partition (ds='2011') +PREHOOK: type: SHOW_TABLESTATUS +POSTHOOK: query: show table extended like `src_rc_merge_test_part` partition (ds='2011') +POSTHOOK: type: SHOW_TABLESTATUS +tableName:src_rc_merge_test_part +owner:heyongqiang +location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test_part/ds=2011 +inputformat:org.apache.hadoop.hive.ql.io.RCFileInputFormat +outputformat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat +columns:struct columns { i32 key, string value} +partitioned:true +partitionColumns:struct partition_columns { string ds} +totalNumberFiles:3 +totalFileSize:636 +maxFileSize:222 +minFileSize:206 +lastAccessTime:0 +lastUpdateTime:1297289876000 + +PREHOOK: query: select count(1) from src_rc_merge_test_part +PREHOOK: type: QUERY +PREHOOK: Input: default@src_rc_merge_test_part@ds=2011 +PREHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-17-56_896_5278586649045165840/-mr-10000 +POSTHOOK: query: select count(1) from src_rc_merge_test_part +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_rc_merge_test_part@ds=2011 +POSTHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-17-56_896_5278586649045165840/-mr-10000 +15 +PREHOOK: query: select sum(hash(key)), sum(hash(value)) from src_rc_merge_test_part +PREHOOK: type: QUERY +PREHOOK: Input: default@src_rc_merge_test_part@ds=2011 +PREHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-18-03_376_7736781093036816805/-mr-10000 +POSTHOOK: query: select sum(hash(key)), sum(hash(value)) from src_rc_merge_test_part +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_rc_merge_test_part@ds=2011 +POSTHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-18-03_376_7736781093036816805/-mr-10000 +214 -7678496319 +PREHOOK: query: alter table src_rc_merge_test_part partition 
(ds='2011') concatenate +PREHOOK: type: ALTER_PARTITION_MERGE +PREHOOK: Input: default@src_rc_merge_test_part +PREHOOK: Output: default@src_rc_merge_test_part@ds=2011 +POSTHOOK: query: alter table src_rc_merge_test_part partition (ds='2011') concatenate +POSTHOOK: type: ALTER_PARTITION_MERGE +POSTHOOK: Input: default@src_rc_merge_test_part +POSTHOOK: Output: default@src_rc_merge_test_part@ds=2011 +PREHOOK: query: show table extended like `src_rc_merge_test_part` partition (ds='2011') +PREHOOK: type: SHOW_TABLESTATUS +POSTHOOK: query: show table extended like `src_rc_merge_test_part` partition (ds='2011') +POSTHOOK: type: SHOW_TABLESTATUS +tableName:src_rc_merge_test_part +owner:heyongqiang +location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test_part/ds=2011 +inputformat:org.apache.hadoop.hive.ql.io.RCFileInputFormat +outputformat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat +columns:struct columns { i32 key, string value} +partitioned:true +partitionColumns:struct partition_columns { string ds} +totalNumberFiles:1 +totalFileSize:334 +maxFileSize:334 +minFileSize:334 +lastAccessTime:0 +lastUpdateTime:1297289892000 + +PREHOOK: query: select count(1) from src_rc_merge_test_part +PREHOOK: type: QUERY +PREHOOK: Input: default@src_rc_merge_test_part@ds=2011 +PREHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-18-12_761_3729513878577662537/-mr-10000 +POSTHOOK: query: select count(1) from src_rc_merge_test_part +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_rc_merge_test_part@ds=2011 +POSTHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-18-12_761_3729513878577662537/-mr-10000 +15 +PREHOOK: query: select sum(hash(key)), sum(hash(value)) from src_rc_merge_test_part +PREHOOK: type: QUERY +PREHOOK: Input: default@src_rc_merge_test_part@ds=2011 +PREHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-18-19_311_2661494784056382484/-mr-10000 +POSTHOOK: query: select sum(hash(key)), sum(hash(value)) from src_rc_merge_test_part +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_rc_merge_test_part@ds=2011 +POSTHOOK: Output: file:/var/folders/6g/6grtCwPMEf4sqHUPpy6xQG9ByHg/-Tmp-/heyongqiang/hive_2011-02-09_14-18-19_311_2661494784056382484/-mr-10000 +214 -7678496319 +PREHOOK: query: drop table src_rc_merge_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@src_rc_merge_test +PREHOOK: Output: default@src_rc_merge_test +POSTHOOK: query: drop table src_rc_merge_test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@src_rc_merge_test +POSTHOOK: Output: default@src_rc_merge_test +PREHOOK: query: drop table src_rc_merge_test_part +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@src_rc_merge_test_part +PREHOOK: Output: default@src_rc_merge_test_part +POSTHOOK: query: drop table src_rc_merge_test_part +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@src_rc_merge_test_part +POSTHOOK: Output: default@src_rc_merge_test_part Index: ql/src/test/results/clientpositive/alter_merge_stats.q.out =================================================================== --- ql/src/test/results/clientpositive/alter_merge_stats.q.out (revision 0) +++ ql/src/test/results/clientpositive/alter_merge_stats.q.out (revision 0) @@ -0,0 +1,224 @@ +PREHOOK: query: create table src_rc_merge_test_stat(key int, value string) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table src_rc_merge_test_stat(key 
int, value string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@src_rc_merge_test_stat +PREHOOK: query: load data local inpath '../data/files/smbbucket_1.rc' into table src_rc_merge_test_stat +PREHOOK: type: LOAD +POSTHOOK: query: load data local inpath '../data/files/smbbucket_1.rc' into table src_rc_merge_test_stat +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_rc_merge_test_stat +PREHOOK: query: load data local inpath '../data/files/smbbucket_2.rc' into table src_rc_merge_test_stat +PREHOOK: type: LOAD +POSTHOOK: query: load data local inpath '../data/files/smbbucket_2.rc' into table src_rc_merge_test_stat +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_rc_merge_test_stat +PREHOOK: query: load data local inpath '../data/files/smbbucket_3.rc' into table src_rc_merge_test_stat +PREHOOK: type: LOAD +POSTHOOK: query: load data local inpath '../data/files/smbbucket_3.rc' into table src_rc_merge_test_stat +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_rc_merge_test_stat +PREHOOK: query: show table extended like `src_rc_merge_test_stat` +PREHOOK: type: SHOW_TABLESTATUS +POSTHOOK: query: show table extended like `src_rc_merge_test_stat` +POSTHOOK: type: SHOW_TABLESTATUS +tableName:src_rc_merge_test_stat +owner:heyongqiang +location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test_stat +inputformat:org.apache.hadoop.hive.ql.io.RCFileInputFormat +outputformat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat +columns:struct columns { i32 key, string value} +partitioned:false +partitionColumns: +totalNumberFiles:3 +totalFileSize:636 +maxFileSize:222 +minFileSize:206 +lastAccessTime:0 +lastUpdateTime:1297289991000 + +PREHOOK: query: desc extended src_rc_merge_test_stat +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc extended src_rc_merge_test_stat +POSTHOOK: type: DESCTABLE +key int from deserializer +value string from deserializer + +Detailed Table Information Table(tableName:src_rc_merge_test_stat, dbName:default, owner:heyongqiang, createTime:1297289989, lastAccessTime:0, retention:0, sd:StorageDescriptor(cols:[FieldSchema(name:key, type:int, comment:null), FieldSchema(name:value, type:string, comment:null)], location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test_stat, inputFormat:org.apache.hadoop.hive.ql.io.RCFileInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat, compressed:false, numBuckets:-1, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe, parameters:{serialization.format=1}), bucketCols:[], sortCols:[], parameters:{}), partitionKeys:[], parameters:{transient_lastDdlTime=1297289991}, viewOriginalText:null, viewExpandedText:null, tableType:MANAGED_TABLE) +PREHOOK: query: analyze table src_rc_merge_test_stat compute statistics +PREHOOK: type: QUERY +PREHOOK: Input: default@src_rc_merge_test_stat +PREHOOK: Output: default@src_rc_merge_test_stat +POSTHOOK: query: analyze table src_rc_merge_test_stat compute statistics +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_rc_merge_test_stat +POSTHOOK: Output: default@src_rc_merge_test_stat +PREHOOK: query: desc extended src_rc_merge_test_stat +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc extended src_rc_merge_test_stat +POSTHOOK: type: DESCTABLE +key int from deserializer +value string from deserializer + +Detailed Table Information Table(tableName:src_rc_merge_test_stat, dbName:default, owner:heyongqiang, 
createTime:1297289989, lastAccessTime:0, retention:0, sd:StorageDescriptor(cols:[FieldSchema(name:key, type:int, comment:null), FieldSchema(name:value, type:string, comment:null)], location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test_stat, inputFormat:org.apache.hadoop.hive.ql.io.RCFileInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat, compressed:false, numBuckets:-1, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe, parameters:{serialization.format=1}), bucketCols:[], sortCols:[], parameters:{}), partitionKeys:[], parameters:{numPartitions=0, numFiles=3, transient_lastDdlTime=1297289998, numRows=6, totalSize=636}, viewOriginalText:null, viewExpandedText:null, tableType:MANAGED_TABLE) +PREHOOK: query: alter table src_rc_merge_test_stat concatenate +PREHOOK: type: ALTER_TABLE_MERGE +PREHOOK: Input: default@src_rc_merge_test_stat +PREHOOK: Output: default@src_rc_merge_test_stat +POSTHOOK: query: alter table src_rc_merge_test_stat concatenate +POSTHOOK: type: ALTER_TABLE_MERGE +POSTHOOK: Input: default@src_rc_merge_test_stat +POSTHOOK: Output: default@src_rc_merge_test_stat +PREHOOK: query: show table extended like `src_rc_merge_test_stat` +PREHOOK: type: SHOW_TABLESTATUS +POSTHOOK: query: show table extended like `src_rc_merge_test_stat` +POSTHOOK: type: SHOW_TABLESTATUS +tableName:src_rc_merge_test_stat +owner:heyongqiang +location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test_stat +inputformat:org.apache.hadoop.hive.ql.io.RCFileInputFormat +outputformat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat +columns:struct columns { i32 key, string value} +partitioned:false +partitionColumns: +totalNumberFiles:1 +totalFileSize:334 +maxFileSize:334 +minFileSize:334 +lastAccessTime:0 +lastUpdateTime:1297290000000 + +PREHOOK: query: desc extended src_rc_merge_test_stat +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc extended src_rc_merge_test_stat +POSTHOOK: type: DESCTABLE +key int from deserializer +value string from deserializer + +Detailed Table Information Table(tableName:src_rc_merge_test_stat, dbName:default, owner:heyongqiang, createTime:1297289989, lastAccessTime:0, retention:0, sd:StorageDescriptor(cols:[FieldSchema(name:key, type:int, comment:null), FieldSchema(name:value, type:string, comment:null)], location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test_stat, inputFormat:org.apache.hadoop.hive.ql.io.RCFileInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat, compressed:false, numBuckets:-1, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe, parameters:{serialization.format=1}), bucketCols:[], sortCols:[], parameters:{}), partitionKeys:[], parameters:{numPartitions=0, numFiles=1, transient_lastDdlTime=1297290000, numRows=6, totalSize=334}, viewOriginalText:null, viewExpandedText:null, tableType:MANAGED_TABLE) +PREHOOK: query: create table src_rc_merge_test_part_stat(key int, value string) partitioned by (ds string) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table src_rc_merge_test_part_stat(key int, value string) partitioned by (ds string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@src_rc_merge_test_part_stat +PREHOOK: query: alter table src_rc_merge_test_part_stat add partition (ds='2011') +PREHOOK: type: ALTERTABLE_ADDPARTS 
+PREHOOK: Input: default@src_rc_merge_test_part_stat +POSTHOOK: query: alter table src_rc_merge_test_part_stat add partition (ds='2011') +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Input: default@src_rc_merge_test_part_stat +POSTHOOK: Output: default@src_rc_merge_test_part_stat@ds=2011 +PREHOOK: query: load data local inpath '../data/files/smbbucket_1.rc' into table src_rc_merge_test_part_stat partition (ds='2011') +PREHOOK: type: LOAD +POSTHOOK: query: load data local inpath '../data/files/smbbucket_1.rc' into table src_rc_merge_test_part_stat partition (ds='2011') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_rc_merge_test_part_stat@ds=2011 +PREHOOK: query: load data local inpath '../data/files/smbbucket_2.rc' into table src_rc_merge_test_part_stat partition (ds='2011') +PREHOOK: type: LOAD +POSTHOOK: query: load data local inpath '../data/files/smbbucket_2.rc' into table src_rc_merge_test_part_stat partition (ds='2011') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_rc_merge_test_part_stat@ds=2011 +PREHOOK: query: load data local inpath '../data/files/smbbucket_3.rc' into table src_rc_merge_test_part_stat partition (ds='2011') +PREHOOK: type: LOAD +POSTHOOK: query: load data local inpath '../data/files/smbbucket_3.rc' into table src_rc_merge_test_part_stat partition (ds='2011') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_rc_merge_test_part_stat@ds=2011 +PREHOOK: query: show table extended like `src_rc_merge_test_part_stat` partition (ds='2011') +PREHOOK: type: SHOW_TABLESTATUS +POSTHOOK: query: show table extended like `src_rc_merge_test_part_stat` partition (ds='2011') +POSTHOOK: type: SHOW_TABLESTATUS +tableName:src_rc_merge_test_part_stat +owner:heyongqiang +location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test_part_stat/ds=2011 +inputformat:org.apache.hadoop.hive.ql.io.RCFileInputFormat +outputformat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat +columns:struct columns { i32 key, string value} +partitioned:true +partitionColumns:struct partition_columns { string ds} +totalNumberFiles:3 +totalFileSize:636 +maxFileSize:222 +minFileSize:206 +lastAccessTime:0 +lastUpdateTime:1297290003000 + +PREHOOK: query: desc extended src_rc_merge_test_part_stat +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc extended src_rc_merge_test_part_stat +POSTHOOK: type: DESCTABLE +key int from deserializer +value string from deserializer +ds string + +Detailed Table Information Table(tableName:src_rc_merge_test_part_stat, dbName:default, owner:heyongqiang, createTime:1297290001, lastAccessTime:0, retention:0, sd:StorageDescriptor(cols:[FieldSchema(name:key, type:int, comment:null), FieldSchema(name:value, type:string, comment:null)], location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test_part_stat, inputFormat:org.apache.hadoop.hive.ql.io.RCFileInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat, compressed:false, numBuckets:-1, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe, parameters:{serialization.format=1}), bucketCols:[], sortCols:[], parameters:{}), partitionKeys:[FieldSchema(name:ds, type:string, comment:null)], parameters:{transient_lastDdlTime=1297290001}, viewOriginalText:null, viewExpandedText:null, tableType:MANAGED_TABLE) +PREHOOK: query: analyze table src_rc_merge_test_part_stat partition(ds='2011') compute statistics +PREHOOK: type: QUERY +PREHOOK: Input: 
default@src_rc_merge_test_part_stat@ds=2011 +PREHOOK: Output: default@src_rc_merge_test_part_stat +PREHOOK: Output: default@src_rc_merge_test_part_stat@ds=2011 +POSTHOOK: query: analyze table src_rc_merge_test_part_stat partition(ds='2011') compute statistics +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_rc_merge_test_part_stat@ds=2011 +POSTHOOK: Output: default@src_rc_merge_test_part_stat +POSTHOOK: Output: default@src_rc_merge_test_part_stat@ds=2011 +PREHOOK: query: desc extended src_rc_merge_test_part_stat +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc extended src_rc_merge_test_part_stat +POSTHOOK: type: DESCTABLE +key int from deserializer +value string from deserializer +ds string + +Detailed Table Information Table(tableName:src_rc_merge_test_part_stat, dbName:default, owner:heyongqiang, createTime:1297290001, lastAccessTime:0, retention:0, sd:StorageDescriptor(cols:[FieldSchema(name:key, type:int, comment:null), FieldSchema(name:value, type:string, comment:null)], location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test_part_stat, inputFormat:org.apache.hadoop.hive.ql.io.RCFileInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat, compressed:false, numBuckets:-1, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe, parameters:{serialization.format=1}), bucketCols:[], sortCols:[], parameters:{}), partitionKeys:[FieldSchema(name:ds, type:string, comment:null)], parameters:{numPartitions=1, numFiles=3, transient_lastDdlTime=1297290011, numRows=6, totalSize=636}, viewOriginalText:null, viewExpandedText:null, tableType:MANAGED_TABLE) +PREHOOK: query: alter table src_rc_merge_test_part_stat partition (ds='2011') concatenate +PREHOOK: type: ALTER_PARTITION_MERGE +PREHOOK: Input: default@src_rc_merge_test_part_stat +PREHOOK: Output: default@src_rc_merge_test_part_stat@ds=2011 +POSTHOOK: query: alter table src_rc_merge_test_part_stat partition (ds='2011') concatenate +POSTHOOK: type: ALTER_PARTITION_MERGE +POSTHOOK: Input: default@src_rc_merge_test_part_stat +POSTHOOK: Output: default@src_rc_merge_test_part_stat@ds=2011 +PREHOOK: query: show table extended like `src_rc_merge_test_part_stat` partition (ds='2011') +PREHOOK: type: SHOW_TABLESTATUS +POSTHOOK: query: show table extended like `src_rc_merge_test_part_stat` partition (ds='2011') +POSTHOOK: type: SHOW_TABLESTATUS +tableName:src_rc_merge_test_part_stat +owner:heyongqiang +location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test_part_stat/ds=2011 +inputformat:org.apache.hadoop.hive.ql.io.RCFileInputFormat +outputformat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat +columns:struct columns { i32 key, string value} +partitioned:true +partitionColumns:struct partition_columns { string ds} +totalNumberFiles:1 +totalFileSize:334 +maxFileSize:334 +minFileSize:334 +lastAccessTime:0 +lastUpdateTime:1297290013000 + +PREHOOK: query: desc extended src_rc_merge_test_part_stat +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc extended src_rc_merge_test_part_stat +POSTHOOK: type: DESCTABLE +key int from deserializer +value string from deserializer +ds string + +Detailed Table Information Table(tableName:src_rc_merge_test_part_stat, dbName:default, owner:heyongqiang, createTime:1297290001, lastAccessTime:0, retention:0, sd:StorageDescriptor(cols:[FieldSchema(name:key, type:int, comment:null), FieldSchema(name:value, type:string, comment:null)], 
location:pfile:/Users/heyongqiang/Documents/workspace/Hive-3/build/ql/test/data/warehouse/src_rc_merge_test_part_stat, inputFormat:org.apache.hadoop.hive.ql.io.RCFileInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.RCFileOutputFormat, compressed:false, numBuckets:-1, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe, parameters:{serialization.format=1}), bucketCols:[], sortCols:[], parameters:{}), partitionKeys:[FieldSchema(name:ds, type:string, comment:null)], parameters:{numPartitions=1, numFiles=1, transient_lastDdlTime=1297290013, numRows=6, totalSize=334}, viewOriginalText:null, viewExpandedText:null, tableType:MANAGED_TABLE) +PREHOOK: query: drop table src_rc_merge_test_stat +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@src_rc_merge_test_stat +PREHOOK: Output: default@src_rc_merge_test_stat +POSTHOOK: query: drop table src_rc_merge_test_stat +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@src_rc_merge_test_stat +POSTHOOK: Output: default@src_rc_merge_test_stat +PREHOOK: query: drop table src_rc_merge_test_part_stat +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@src_rc_merge_test_part_stat +PREHOOK: Output: default@src_rc_merge_test_part_stat +POSTHOOK: query: drop table src_rc_merge_test_part_stat +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@src_rc_merge_test_part_stat +POSTHOOK: Output: default@src_rc_merge_test_part_stat Index: shims/src/0.20/java/org/apache/hadoop/hive/shims/Hadoop20Shims.java =================================================================== --- shims/src/0.20/java/org/apache/hadoop/hive/shims/Hadoop20Shims.java (revision 1073470) +++ shims/src/0.20/java/org/apache/hadoop/hive/shims/Hadoop20Shims.java (working copy) @@ -148,35 +148,6 @@ } } - public static class CombineHiveKey implements WritableComparable { - Object key; - - public CombineHiveKey(Object key) { - this.key = key; - } - - public Object getKey() { - return key; - } - - public void setKey(Object key) { - this.key = key; - } - - public void write(DataOutput out) throws IOException { - throw new IOException("Method not supported"); - } - - public void readFields(DataInput in) throws IOException { - throw new IOException("Method not supported"); - } - - public int compareTo(Object w) { - assert false; - return 0; - } - } - /* This class should be replaced with org.apache.hadoop.mapred.lib.CombineFileRecordReader class, once * https://issues.apache.org/jira/browse/MAPREDUCE-955 is fixed. 
This code should be removed - it is a copy * of org.apache.hadoop.mapred.lib.CombineFileRecordReader Index: shims/src/0.20S/java/org/apache/hadoop/hive/shims/Hadoop20SShims.java =================================================================== --- shims/src/0.20S/java/org/apache/hadoop/hive/shims/Hadoop20SShims.java (revision 1073470) +++ shims/src/0.20S/java/org/apache/hadoop/hive/shims/Hadoop20SShims.java (working copy) @@ -150,35 +150,6 @@ } } - public static class CombineHiveKey implements WritableComparable { - Object key; - - public CombineHiveKey(Object key) { - this.key = key; - } - - public Object getKey() { - return key; - } - - public void setKey(Object key) { - this.key = key; - } - - public void write(DataOutput out) throws IOException { - throw new IOException("Method not supported"); - } - - public void readFields(DataInput in) throws IOException { - throw new IOException("Method not supported"); - } - - public int compareTo(Object w) { - assert false; - return 0; - } - } - /* This class should be replaced with org.apache.hadoop.mapred.lib.CombineFileRecordReader class, once * https://issues.apache.org/jira/browse/MAPREDUCE-955 is fixed. This code should be removed - it is a copy * of org.apache.hadoop.mapred.lib.CombineFileRecordReader Index: shims/src/common/java/org/apache/hadoop/hive/shims/CombineHiveKey.java =================================================================== --- shims/src/common/java/org/apache/hadoop/hive/shims/CombineHiveKey.java (revision 0) +++ shims/src/common/java/org/apache/hadoop/hive/shims/CombineHiveKey.java (revision 0) @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.shims; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.io.WritableComparable; + +public class CombineHiveKey implements WritableComparable { + Object key; + + public CombineHiveKey(Object key) { + this.key = key; + } + + public Object getKey() { + return key; + } + + public void setKey(Object key) { + this.key = key; + } + + public void write(DataOutput out) throws IOException { + throw new IOException("Method not supported"); + } + + public void readFields(DataInput in) throws IOException { + throw new IOException("Method not supported"); + } + + public int compareTo(Object w) { + assert false; + return 0; + } +} \ No newline at end of file
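
For reference, a minimal usage sketch of the relocated CombineHiveKey wrapper. The demo class below is hypothetical and not part of the patch; it only assumes the Hive shims and Hadoop jars are on the classpath. The wrapper simply carries whatever key the underlying record reader produced, and, as the unsupported write()/readFields() methods indicate, it is meant to live in memory rather than be serialized.

// Hypothetical demo class, not part of the patch.
import org.apache.hadoop.hive.shims.CombineHiveKey;
import org.apache.hadoop.io.LongWritable;

public class CombineHiveKeyDemo {
  public static void main(String[] args) {
    // Key as produced by an underlying record reader over one file of a combined split.
    LongWritable offset = new LongWritable(42L);

    // Wrap it; downstream code can retrieve the original key via getKey().
    CombineHiveKey wrapped = new CombineHiveKey(offset);
    System.out.println("wrapped key: " + wrapped.getKey());   // prints 42

    // The wrapped key is mutable, so a reader can reuse one wrapper across records.
    wrapped.setKey(new LongWritable(100L));
    System.out.println("updated key: " + wrapped.getKey());   // prints 100
  }
}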