Index: ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java (revision 1555283) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java (working copy) @@ -70,6 +70,7 @@ /* * We need the base (operator.java) implementation of start/endGroup. * The parent class has functionality in those that map join can't use. + * Note: The mapjoin can be run in the reducer only on Tez. */ @Override public void endGroup() throws HiveException { @@ -87,6 +88,8 @@ int tagLen = conf.getTagLength(); + // On Tez only: The hash map might already be cached in the container we run + // the task in. On MR: The cache is a no-op. tableKey = "__HASH_MAP_"+this.getOperatorId()+"_container"; serdeKey = "__HASH_MAP_"+this.getOperatorId()+"_serde"; Index: ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java (revision 1555283) +++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java (working copy) @@ -256,6 +256,12 @@ pathToPartitionInfo = mrwork.getPathToPartitionInfo(); } + /* + * addSplitsForGroup collects separate calls to setInputPaths into one where possible. + * The reason for this is that this is faster on some InputFormats. E.g.: Orc will start + * a thread pool to do the work, and calling it multiple times will create a lot + * of unnecessary thread pools. + */ private void addSplitsForGroup(List dirs, TableScanOperator tableScan, JobConf conf, InputFormat inputFormat, Class inputFormatClass, int splits, TableDesc table, List result) throws IOException {