diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/CustomPartitionVertex.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/CustomPartitionVertex.java index 26afe90faa..bd1f54ddf7 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/CustomPartitionVertex.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/CustomPartitionVertex.java @@ -108,14 +108,15 @@ public int compare(InputSplit inp1, InputSplit inp2) { private final Multimap<Integer, Integer> bucketToTaskMap = HashMultimap.<Integer, Integer> create(); private final Map<String, Multimap<Integer, InputSplit>> inputToGroupedSplitMap = - new HashMap<String, Multimap<Integer, InputSplit>>(); + new HashMap<>(); private int numInputsAffectingRootInputSpecUpdate = 1; private int numInputsSeenSoFar = 0; private final Map<String, EdgeManagerPluginDescriptor> emMap = Maps.newHashMap(); private final List<InputSplit> finalSplits = Lists.newLinkedList(); private final Map<String, InputSpecUpdate> inputNameInputSpecMap = - new HashMap<String, InputSpecUpdate>(); + new HashMap<>(); + private Map<String, Integer> inputToBucketMap; public CustomPartitionVertex(VertexManagerPluginContext context) { super(context); @@ -137,6 +138,7 @@ public void initialize() { this.mainWorkName = vertexConf.getInputName(); this.vertexType = vertexConf.getVertexType(); this.numInputsAffectingRootInputSpecUpdate = vertexConf.getNumInputs(); + this.inputToBucketMap = vertexConf.getInputToBucketMap(); } @Override @@ -242,7 +244,7 @@ public void onRootVertexInitialized(String inputName, InputDescriptor inputDescr } Multimap<Integer, InputSplit> bucketToInitialSplitMap = - getBucketSplitMapForPath(pathFileSplitsMap); + getBucketSplitMapForPath(inputName, pathFileSplitsMap); try { int totalResource = context.getTotalAvailableResource().getMemory(); @@ -532,20 +534,50 @@ private FileSplit getFileSplitFromEvent(InputDataInformationEvent event) throws /* * This method generates the map of bucket to file splits. */ - private Multimap<Integer, InputSplit> getBucketSplitMapForPath( + private Multimap<Integer, InputSplit> getBucketSplitMapForPath(String inputName, Map<String, List<FileSplit>> pathFileSplitsMap) { - int bucketNum = 0; Multimap<Integer, InputSplit> bucketToInitialSplitMap = - ArrayListMultimap.<Integer, InputSplit> create(); + ArrayListMultimap.create(); + boolean fallback = false; + Map<Integer, Integer> bucketIds = new HashMap<>(); for (Map.Entry<String, List<FileSplit>> entry : pathFileSplitsMap.entrySet()) { - int bucketId = bucketNum % numBuckets; + // Extract the bucketId from pathFileSplitsMap. This is the more accurate method; + // however, it may not work in certain cases where buckets are named + // after the files used while loading data. In such cases, fall back to the old, + // potentially inaccurate method. + // The accepted file names look like 000000_0 or 000001_0_copy_1. + String bucketIdStr = + Utilities.getBucketFileNameFromPathSubString(entry.getKey()); + int bucketId = Utilities.getBucketIdFromFile(bucketIdStr); + if (bucketId == -1) { + fallback = true; + LOG.info("Fallback to using older sort based logic to assign " + + "buckets to splits."); + bucketIds.clear(); + break; + } + // Make sure the bucketId is always less than numBuckets + bucketId = bucketId % numBuckets; + bucketIds.put(bucketId, bucketId); for (FileSplit fsplit : entry.getValue()) { bucketToInitialSplitMap.put(bucketId, fsplit); } - bucketNum++; + } + + int bucketNum = 0; + if (fallback) { + // This is the old logic, which assumes that the file names are sorted in + // alphanumeric order and maps them to the appropriate bucket number. + for (Map.Entry<String, List<FileSplit>> entry : pathFileSplitsMap.entrySet()) { + int bucketId = bucketNum % numBuckets; + for (FileSplit fsplit : entry.getValue()) { + bucketToInitialSplitMap.put(bucketId, fsplit); + } + bucketNum++; + } } // this is just for SMB join use-case.
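Aside: the new extraction path above relies on the bucket file naming convention the comment describes (000000_0, 000001_0_copy_1, ...), with -1 signalling a name that does not follow it. Below is a minimal, self-contained sketch of that convention and of the modulo wrap used afterwards; the parseBucketId helper and its regex are illustrative stand-ins, not Hive's actual Utilities.getBucketIdFromFile implementation.

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class BucketIdSketch {
      // Names such as 000000_0 or 000001_0_copy_1; group(1) carries the bucket number.
      private static final Pattern BUCKET_FILE = Pattern.compile("^(\\d+)_\\d+(_copy_\\d+)?$");

      // Hypothetical stand-in for Utilities.getBucketIdFromFile: -1 means the name does
      // not follow the convention, which is the signal getBucketSplitMapForPath uses to
      // fall back to the older sort-based assignment.
      static int parseBucketId(String fileName) {
        Matcher m = BUCKET_FILE.matcher(fileName);
        return m.matches() ? Integer.parseInt(m.group(1)) : -1;
      }

      public static void main(String[] args) {
        int numBuckets = 4;
        for (String name : new String[] {"000001_0", "000005_0_copy_1", "part-00000"}) {
          int bucketId = parseBucketId(name);
          // As in the patch, wrap the parsed id into the table's bucket range.
          int assigned = (bucketId == -1) ? -1 : bucketId % numBuckets;
          System.out.println(name + " -> parsed=" + bucketId + ", assigned=" + assigned);
        }
      }
    }

With numBuckets = 4, 000005_0_copy_1 parses to bucket 5 and is wrapped to bucket 1 by the modulo step, mirroring what getBucketSplitMapForPath does before adding the splits to bucketToInitialSplitMap.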
The numBuckets would be equal to that of the big table @@ -553,16 +585,35 @@ private FileSplit getFileSplitFromEvent(InputDataInformationEvent event) throws // data from the right buckets to the big table side. For e.g. Big table has 8 buckets and small // table has 4 buckets, bucket 0 of small table needs to be sent to bucket 4 of the big table as // well. - if (bucketNum < numBuckets) { - int loopedBucketId = 0; - for (; bucketNum < numBuckets; bucketNum++) { - for (InputSplit fsplit : bucketToInitialSplitMap.get(loopedBucketId)) { - bucketToInitialSplitMap.put(bucketNum, fsplit); + if (numInputsAffectingRootInputSpecUpdate != 1) { + // small table + if (fallback && bucketNum < numBuckets) { + // Old logic. + int loopedBucketId = 0; + for (; bucketNum < numBuckets; bucketNum++) { + for (InputSplit fsplit : bucketToInitialSplitMap.get(loopedBucketId)) { + bucketToInitialSplitMap.put(bucketNum, fsplit); + } + loopedBucketId++; + } + } else { + // new logic. + if (inputToBucketMap.containsKey(inputName)) { + int inputNumBuckets = inputToBucketMap.get(inputName); + if (inputNumBuckets < numBuckets) { + // Need to send the splits to multiple buckets + for (int i = 1; i < numBuckets / inputNumBuckets; i++) { + int bucketIdBase = i * inputNumBuckets; + for (Integer bucketId : bucketIds.keySet()) { + for (InputSplit fsplit : bucketToInitialSplitMap.get(bucketId)) { + bucketToInitialSplitMap.put(bucketIdBase + bucketId, fsplit); + } + } + } + } } - loopedBucketId++; } } - return bucketToInitialSplitMap; } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/CustomVertexConfiguration.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/CustomVertexConfiguration.java index ef5e7edcd6..4301829517 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/CustomVertexConfiguration.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/CustomVertexConfiguration.java @@ -21,7 +21,10 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import com.google.common.base.Preconditions; import org.apache.hadoop.hive.ql.plan.TezWork.VertexType; import org.apache.hadoop.io.Writable; @@ -39,22 +42,24 @@ private VertexType vertexType = VertexType.AUTO_INITIALIZED_EDGES; private int numInputs; private String inputName; + private Map inputToBucketMap; public CustomVertexConfiguration() { } // this is the constructor to use for the Bucket map join case. public CustomVertexConfiguration(int numBuckets, VertexType vertexType) { - this(numBuckets, vertexType, "", 1); + this(numBuckets, vertexType, "", 1, null); } // this is the constructor to use for SMB. public CustomVertexConfiguration(int numBuckets, VertexType vertexType, String inputName, - int numInputs) { + int numInputs, Map inputToBucketMap) { this.numBuckets = numBuckets; this.vertexType = vertexType; this.numInputs = numInputs; this.inputName = inputName; + this.inputToBucketMap = inputToBucketMap; } @Override @@ -63,6 +68,14 @@ public void write(DataOutput out) throws IOException { out.writeInt(this.numBuckets); out.writeInt(numInputs); out.writeUTF(inputName); + int sz = inputToBucketMap != null ? 
inputToBucketMap.size() : 0; + out.writeInt(sz); + if (sz > 0) { + for (Map.Entry<String, Integer> entry : inputToBucketMap.entrySet()) { + out.writeUTF(entry.getKey()); + out.writeInt(entry.getValue()); + } + } } @Override @@ -71,6 +84,16 @@ public void readFields(DataInput in) throws IOException { this.numBuckets = in.readInt(); this.numInputs = in.readInt(); this.inputName = in.readUTF(); + int sz = in.readInt(); + Preconditions.checkState(sz >= 0); + if (sz == 0) { + this.inputToBucketMap = null; + } else { + this.inputToBucketMap = new HashMap<>(); + for (int i = 0; i < sz; i++) { + this.inputToBucketMap.put(in.readUTF(), in.readInt()); + } + } } public int getNumBuckets() { @@ -88,4 +111,8 @@ public String getInputName() { public int getNumInputs() { return numInputs; } + + public Map<String, Integer> getInputToBucketMap() { + return inputToBucketMap; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java index 9885038588..0e75f6e5e8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java @@ -21,6 +21,7 @@ import java.util.concurrent.ConcurrentHashMap; import com.google.common.base.Function; +import com.google.common.base.Preconditions; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; import javax.security.auth.login.LoginException; @@ -568,13 +569,26 @@ private Vertex createVertex(JobConf conf, MergeJoinWork mergeJoinWork, FileSyste MultiMRInput.createConfigBuilder(conf, HiveInputFormat.class).build()); } + // Populated only for SMB joins, with one entry per small-table input + Map<String, Integer> inputToBucketMap = new HashMap<>(); + if (mergeJoinWork.getMergeJoinOperator().getParentOperators().size() == 1 + && mergeJoinWork.getMergeJoinOperator().getOpTraits() != null) { + // This is an SMB join. + for (BaseWork work : mapWorkList) { + MapWork mw = (MapWork) work; + Map<String, Operator<? extends OperatorDesc>> aliasToWork = mw.getAliasToWork(); + Preconditions.checkState(aliasToWork.size() == 1, + "More than 1 alias in SMB mapwork"); + inputToBucketMap.put(mw.getName(), mw.getWorks().get(0).getOpTraits().getNumBuckets()); + } + } VertexManagerPluginDescriptor desc = VertexManagerPluginDescriptor.create(CustomPartitionVertex.class.getName()); // the +1 to the size is because of the main work.
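Aside: the write()/readFields() additions above serialize inputToBucketMap as a size followed by one (UTF input name, int bucket count) pair per entry, with size 0 standing in for a null map; the vertexConf built just below is what carries it into the Tez user payload. A small round-trip sketch of just that encoding, using the Hadoop DataOutputBuffer/DataInputBuffer classes already used for the payload in DagUtils (the input names and bucket counts here are made up):

    import java.io.IOException;
    import java.util.HashMap;
    import java.util.Map;
    import org.apache.hadoop.io.DataInputBuffer;
    import org.apache.hadoop.io.DataOutputBuffer;

    public class InputToBucketMapRoundTrip {
      public static void main(String[] args) throws IOException {
        Map<String, Integer> inputToBucketMap = new HashMap<>();
        inputToBucketMap.put("small_table_1", 4);   // made-up input name -> bucket count
        inputToBucketMap.put("small_table_2", 8);

        // Encode: size first, then one (UTF key, int value) pair per entry, as in write().
        DataOutputBuffer out = new DataOutputBuffer();
        out.writeInt(inputToBucketMap.size());
        for (Map.Entry<String, Integer> e : inputToBucketMap.entrySet()) {
          out.writeUTF(e.getKey());
          out.writeInt(e.getValue());
        }

        // Decode, as in readFields(): a size of 0 would be read back as a null map.
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        int sz = in.readInt();
        Map<String, Integer> decoded = (sz == 0) ? null : new HashMap<>();
        for (int i = 0; i < sz; i++) {
          decoded.put(in.readUTF(), in.readInt());
        }
        System.out.println(decoded);  // {small_table_1=4, small_table_2=8}, in some order
      }
    }

This is the same map CustomPartitionVertex reads back in initialize() and later uses to replicate a small table's splits across the big table's bucket range.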
CustomVertexConfiguration vertexConf = new CustomVertexConfiguration(mergeJoinWork.getMergeJoinOperator().getConf() .getNumBuckets(), vertexType, mergeJoinWork.getBigTableAlias(), - mapWorkList.size() + 1); + mapWorkList.size() + 1, inputToBucketMap); DataOutputBuffer dob = new DataOutputBuffer(); vertexConf.write(dob); byte[] userPayload = dob.getData(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java index dc698c8de8..a235f3fbf4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java @@ -576,6 +576,7 @@ private boolean checkConvertJoinSMBJoin(JoinOperator joinOp, OptimizeTezProcCont return false; } ReduceSinkOperator rsOp = (ReduceSinkOperator) parentOp; + if (!checkColEquality(rsOp.getParentOperators().get(0).getOpTraits().getSortCols(), rsOp .getOpTraits().getSortCols(), rsOp.getColumnExprMap(), false)) { LOG.info("We cannot convert to SMB because the sort column names do not match."); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java index 54f5bab6de..361976311d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java @@ -23,6 +23,7 @@ import java.io.Serializable; import java.net.URI; import java.net.URISyntaxException; +import java.util.Arrays; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -160,6 +161,48 @@ private URI initializeFromURI(String fromPath, boolean isLocal) throws IOExcepti "source contains directory: " + oneSrc.getPath().toString())); } } + // Do another loop if table is bucketed + List bucketCols = table.getBucketCols(); + if (bucketCols != null && !bucketCols.isEmpty()) { + // Hive assumes that user names the files as per the corresponding + // bucket. For e.g, file names should follow the format 000000_0, 000000_1 etc. + // Here the 1st file will belong to bucket 0 and 2nd to bucket 1 and so on. + boolean[] bucketArray = new boolean[table.getNumBuckets()]; + // initialize the array + Arrays.fill(bucketArray, false); + int numBuckets = table.getNumBuckets(); + + for (FileStatus oneSrc : srcs) { + String bucketName = oneSrc.getPath().getName(); + + //get the bucket id + String bucketIdStr = + Utilities.getBucketFileNameFromPathSubString(bucketName); + int bucketId = Utilities.getBucketIdFromFile(bucketIdStr); + LOG.debug("bucket ID for file " + oneSrc.getPath() + " = " + bucketId + + " for table " + table.getFullyQualifiedName()); + if (bucketId == -1) { + throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg( + "The file name is invalid : " + + oneSrc.getPath().toString() + " for table " + + table.getFullyQualifiedName())); + } + if (bucketId >= numBuckets) { + throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg( + "The file name corresponds to invalid bucketId : " + + oneSrc.getPath().toString()) + + ". Maximum number of buckets can be " + numBuckets + + " for table " + table.getFullyQualifiedName()); + } + if (bucketArray[bucketId]) { + throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg( + "Multiple files for same bucket : " + bucketId + + ". Only 1 file per bucket allowed in single load command. 
To load multiple files for same bucket, use multiple statements for table " + + table.getFullyQualifiedName())); + } + bucketArray[bucketId] = true; + } + } } catch (IOException e) { // Has to use full name to make sure it does not conflict with // org.apache.commons.lang.StringUtils diff --git a/ql/src/test/queries/clientnegative/bucket_mapjoin_mismatch1.q b/ql/src/test/queries/clientnegative/bucket_mapjoin_mismatch1.q index 5f653bc9bb..118db89375 100644 --- a/ql/src/test/queries/clientnegative/bucket_mapjoin_mismatch1.q +++ b/ql/src/test/queries/clientnegative/bucket_mapjoin_mismatch1.q @@ -14,9 +14,9 @@ load data local inpath '../../data/files/bmj/000002_0' CREATE TABLE srcbucket_mapjoin_part_2 (key int, value string) partitioned by (ds string) CLUSTERED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; -load data local inpath '../../data/files/bmj/000002_0' +load data local inpath '../../data/files/bmj/000000_0' INTO TABLE srcbucket_mapjoin_part_2 partition(ds='2008-04-08'); -load data local inpath '../../data/files/bmj/000003_0' +load data local inpath '../../data/files/bmj/000001_0' INTO TABLE srcbucket_mapjoin_part_2 partition(ds='2008-04-08'); -- The number of buckets in the 2 tables above (being joined later) dont match. diff --git a/ql/src/test/queries/clientpositive/auto_sortmerge_join_16.q b/ql/src/test/queries/clientpositive/auto_sortmerge_join_16.q index 8216b538c2..6d177aaaf0 100644 --- a/ql/src/test/queries/clientpositive/auto_sortmerge_join_16.q +++ b/ql/src/test/queries/clientpositive/auto_sortmerge_join_16.q @@ -1,12 +1,12 @@ set hive.strict.checks.bucketing=false; -set hive.auto.convert.join=true; +set hive.auto.convert.join=false; set hive.exec.dynamic.partition.mode=nonstrict; -set hive.auto.convert.sortmerge.join=true; +set hive.auto.convert.sortmerge.join=false; set hive.optimize.bucketmapjoin = true; set hive.optimize.bucketmapjoin.sortedmerge = true; @@ -58,39 +58,121 @@ where file_tag between 1 and 2; load data local inpath '../../data/files/auto_sortmerge_join/small/000000_0' overwrite into table stage_bucket_big partition (file_tag='1'); -insert overwrite table bucket_big partition(day,pri) -select -key, -value, -'day1' as day, -1 as pri -from -stage_bucket_big -where -file_tag='1'; - -select -a.key , -a.value , -b.value , -'day1' as day, -1 as pri -from -( -select -key, -value -from bucket_big where day='day1' -) a -left outer join -( -select -key, -value -from bucket_small -where pri between 1 and 2 -) b -on +insert overwrite table bucket_big partition(day,pri) + select key, value, 'day1' as day, 1 as pri + from stage_bucket_big + where file_tag='1'; + +explain select a.key , a.value , b.value , 'day1' as day, 1 as pri + from + ( select key, value + from bucket_big where day='day1' ) a + left outer join + ( select key, value + from bucket_small + where pri between 1 and 2 ) b + on + (a.key = b.key) +; + +select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a + left outer join + ( select key, value + from bucket_small + where pri between 1 and 2 ) b +on (a.key = b.key) -; +; + +set hive.auto.convert.join=true; +set hive.auto.convert.join.noconditionaltask.size=1; +set hive.auto.convert.sortmerge.join=true; + +explain select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a +left outer join +( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +; + +select a.key , a.value , b.value , 'day1' 
as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a +left outer join +( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +; + +drop table bucket_big; +drop table bucket_small; + +-- Test to make sure SMB is not kicked in when small table has more buckets than big table + +CREATE TABLE bucket_big +( +key BIGINT, +value STRING +) +PARTITIONED BY (day STRING, pri bigint) +clustered by (key) sorted by (key) into 12 buckets +stored as RCFile; + +CREATE TABLE bucket_small +( +key BIGINT, +value string +) +PARTITIONED BY (pri bigint) +clustered by (key) sorted by (key) into 24 buckets +stored as RCFile; + +insert overwrite table bucket_small partition(pri) +select +key, +value, +file_tag as pri +from +stage_bucket_small +where file_tag between 1 and 2; +insert overwrite table bucket_big partition(day,pri) +select key, value, 'day1' as day, 1 as pri +from stage_bucket_big +where file_tag='1'; + + +explain select a.key , a.value , b.value , 'day1' as day, 1 as pri + from + ( select key, value + from bucket_big where day='day1' ) a + left outer join + ( select key, value + from bucket_small + where pri between 1 and 2 ) b + on + (a.key = b.key) +; + +select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a +left outer join +( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +; diff --git a/ql/src/test/results/clientnegative/bucket_mapjoin_mismatch1.q.out b/ql/src/test/results/clientnegative/bucket_mapjoin_mismatch1.q.out index b9c2e6f827..6ca6c5d553 100644 --- a/ql/src/test/results/clientnegative/bucket_mapjoin_mismatch1.q.out +++ b/ql/src/test/results/clientnegative/bucket_mapjoin_mismatch1.q.out @@ -53,23 +53,23 @@ POSTHOOK: query: CREATE TABLE srcbucket_mapjoin_part_2 (key int, value string) POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@srcbucket_mapjoin_part_2 -PREHOOK: query: load data local inpath '../../data/files/bmj/000002_0' +PREHOOK: query: load data local inpath '../../data/files/bmj/000000_0' INTO TABLE srcbucket_mapjoin_part_2 partition(ds='2008-04-08') PREHOOK: type: LOAD #### A masked pattern was here #### PREHOOK: Output: default@srcbucket_mapjoin_part_2 -POSTHOOK: query: load data local inpath '../../data/files/bmj/000002_0' +POSTHOOK: query: load data local inpath '../../data/files/bmj/000000_0' INTO TABLE srcbucket_mapjoin_part_2 partition(ds='2008-04-08') POSTHOOK: type: LOAD #### A masked pattern was here #### POSTHOOK: Output: default@srcbucket_mapjoin_part_2 POSTHOOK: Output: default@srcbucket_mapjoin_part_2@ds=2008-04-08 -PREHOOK: query: load data local inpath '../../data/files/bmj/000003_0' +PREHOOK: query: load data local inpath '../../data/files/bmj/000001_0' INTO TABLE srcbucket_mapjoin_part_2 partition(ds='2008-04-08') PREHOOK: type: LOAD #### A masked pattern was here #### PREHOOK: Output: default@srcbucket_mapjoin_part_2@ds=2008-04-08 -POSTHOOK: query: load data local inpath '../../data/files/bmj/000003_0' +POSTHOOK: query: load data local inpath '../../data/files/bmj/000001_0' INTO TABLE srcbucket_mapjoin_part_2 partition(ds='2008-04-08') POSTHOOK: type: LOAD #### A masked pattern was here #### @@ -110,19 +110,19 @@ STAGE PLANS: value expressions: _col1 (type: string) TableScan alias: b - Statistics: Num rows: 78 Data size: 30620 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 70 Data size: 27500 Basic stats: COMPLETE Column 
stats: NONE Filter Operator predicate: key is not null (type: boolean) - Statistics: Num rows: 78 Data size: 30620 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 70 Data size: 27500 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: int), value (type: string) outputColumnNames: _col0, _col1 - Statistics: Num rows: 78 Data size: 30620 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 70 Data size: 27500 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: int) sort order: + Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 78 Data size: 30620 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 70 Data size: 27500 Basic stats: COMPLETE Column stats: NONE value expressions: _col1 (type: string) Reduce Operator Tree: Join Operator @@ -177,10 +177,10 @@ STAGE PLANS: b TableScan alias: b - Statistics: Num rows: 102 Data size: 30620 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 92 Data size: 27500 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) - Statistics: Num rows: 102 Data size: 30620 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 92 Data size: 27500 Basic stats: COMPLETE Column stats: NONE HashTable Sink Operator keys: 0 key (type: int) diff --git a/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_16.q.out b/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_16.q.out index 91408df129..b67deeaa26 100644 --- a/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_16.q.out +++ b/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_16.q.out @@ -136,58 +136,130 @@ POSTHOOK: type: LOAD #### A masked pattern was here #### POSTHOOK: Output: default@stage_bucket_big POSTHOOK: Output: default@stage_bucket_big@file_tag=1 -PREHOOK: query: insert overwrite table bucket_big partition(day,pri) -select -key, -value, -'day1' as day, -1 as pri -from -stage_bucket_big -where -file_tag='1' +PREHOOK: query: insert overwrite table bucket_big partition(day,pri) + select key, value, 'day1' as day, 1 as pri + from stage_bucket_big + where file_tag='1' PREHOOK: type: QUERY PREHOOK: Input: default@stage_bucket_big PREHOOK: Input: default@stage_bucket_big@file_tag=1 PREHOOK: Output: default@bucket_big -POSTHOOK: query: insert overwrite table bucket_big partition(day,pri) -select -key, -value, -'day1' as day, -1 as pri -from -stage_bucket_big -where -file_tag='1' +POSTHOOK: query: insert overwrite table bucket_big partition(day,pri) + select key, value, 'day1' as day, 1 as pri + from stage_bucket_big + where file_tag='1' POSTHOOK: type: QUERY POSTHOOK: Input: default@stage_bucket_big POSTHOOK: Input: default@stage_bucket_big@file_tag=1 POSTHOOK: Output: default@bucket_big@day=day1/pri=1 POSTHOOK: Lineage: bucket_big PARTITION(day=day1,pri=1).key SIMPLE [(stage_bucket_big)stage_bucket_big.FieldSchema(name:key, type:bigint, comment:null), ] POSTHOOK: Lineage: bucket_big PARTITION(day=day1,pri=1).value SIMPLE [(stage_bucket_big)stage_bucket_big.FieldSchema(name:value, type:string, comment:null), ] -PREHOOK: query: select -a.key , -a.value , -b.value , -'day1' as day, -1 as pri -from -( -select -key, -value -from bucket_big where day='day1' -) a -left outer join -( -select -key, -value -from bucket_small -where pri between 1 and 2 -) b -on +PREHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri + from + ( select key, value + from bucket_big 
where day='day1' ) a + left outer join + ( select key, value + from bucket_small + where pri between 1 and 2 ) b + on + (a.key = b.key) +PREHOOK: type: QUERY +POSTHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri + from + ( select key, value + from bucket_big where day='day1' ) a + left outer join + ( select key, value + from bucket_small + where pri between 1 and 2 ) b + on + (a.key = b.key) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: bucket_big + Statistics: Num rows: 5 Data size: 495 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: bigint), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 495 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: bigint) + sort order: + + Map-reduce partition columns: _col0 (type: bigint) + Statistics: Num rows: 5 Data size: 495 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: llap + LLAP IO: no inputs + Map 3 + Map Operator Tree: + TableScan + alias: bucket_small + Statistics: Num rows: 236 Data size: 23364 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: bigint), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 236 Data size: 23364 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: bigint) + sort order: + + Map-reduce partition columns: _col0 (type: bigint) + Statistics: Num rows: 236 Data size: 23364 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Left Outer Join 0 to 1 + keys: + 0 _col0 (type: bigint) + 1 _col0 (type: bigint) + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 15 Data size: 2850 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: bigint), _col1 (type: string), _col3 (type: string), 'day1' (type: string), 1 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 15 Data size: 4230 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 15 Data size: 4230 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a + left outer join + ( select key, value + from bucket_small + where pri between 1 and 2 ) b +on (a.key = b.key) PREHOOK: type: QUERY PREHOOK: Input: default@bucket_big @@ -196,28 +268,390 @@ PREHOOK: Input: default@bucket_small PREHOOK: Input: default@bucket_small@pri=1 PREHOOK: Input: default@bucket_small@pri=2 #### A masked pattern was here #### -POSTHOOK: query: 
select -a.key , -a.value , -b.value , -'day1' as day, -1 as pri -from -( -select -key, -value -from bucket_big where day='day1' -) a -left outer join -( -select -key, -value -from bucket_small -where pri between 1 and 2 -) b -on +POSTHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a + left outer join + ( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_big@day=day1/pri=1 +POSTHOOK: Input: default@bucket_small +POSTHOOK: Input: default@bucket_small@pri=1 +POSTHOOK: Input: default@bucket_small@pri=2 +#### A masked pattern was here #### +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +103 val_103 val_103 day1 1 +103 val_103 val_103 day1 1 +103 val_103 val_103 day1 1 +103 val_103 val_103 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +172 val_172 val_172 day1 1 +172 val_172 val_172 day1 1 +172 val_172 val_172 day1 1 +172 val_172 val_172 day1 1 +374 val_374 val_374 day1 1 +374 val_374 val_374 day1 1 +PREHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a +left outer join +( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +PREHOOK: type: QUERY +POSTHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a +left outer join +( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: bucket_small + Statistics: Num rows: 236 Data size: 23364 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: bigint), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 236 Data size: 23364 Basic stats: COMPLETE Column stats: COMPLETE + Map Operator Tree: + TableScan + alias: bucket_big + Statistics: Num rows: 5 Data size: 495 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: bigint), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 495 Basic stats: COMPLETE Column stats: COMPLETE + Merge Join Operator + condition map: + Left Outer Join 0 to 1 + keys: + 0 _col0 (type: bigint) + 1 _col0 (type: bigint) + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 15 Data size: 2850 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: bigint), _col1 (type: string), _col3 (type: string), 'day1' (type: string), 1 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 15 Data size: 4230 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 15 Data size: 4230 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: llap + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a +left outer join +( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big +PREHOOK: Input: default@bucket_big@day=day1/pri=1 +PREHOOK: Input: default@bucket_small +PREHOOK: Input: default@bucket_small@pri=1 +PREHOOK: Input: default@bucket_small@pri=2 +#### A masked pattern was here #### +POSTHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a +left outer join +( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_big@day=day1/pri=1 +POSTHOOK: Input: default@bucket_small +POSTHOOK: Input: default@bucket_small@pri=1 +POSTHOOK: Input: default@bucket_small@pri=2 +#### A masked pattern was here #### +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +103 val_103 val_103 day1 1 +103 val_103 val_103 day1 1 +103 val_103 val_103 day1 1 +103 val_103 val_103 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +172 val_172 val_172 day1 1 +172 val_172 val_172 day1 1 +172 val_172 val_172 day1 1 +172 val_172 val_172 day1 1 +374 val_374 val_374 day1 1 +374 val_374 val_374 day1 1 +PREHOOK: query: drop table bucket_big +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@bucket_big +PREHOOK: Output: default@bucket_big +POSTHOOK: query: drop table bucket_big +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@bucket_big +POSTHOOK: Output: default@bucket_big +PREHOOK: query: drop table bucket_small +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@bucket_small +PREHOOK: Output: default@bucket_small +POSTHOOK: query: drop table bucket_small +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@bucket_small +POSTHOOK: Output: default@bucket_small +PREHOOK: query: CREATE TABLE bucket_big +( +key BIGINT, +value STRING +) +PARTITIONED BY (day STRING, pri bigint) +clustered by (key) sorted by (key) into 12 buckets +stored as RCFile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@bucket_big +POSTHOOK: query: CREATE TABLE bucket_big +( +key BIGINT, +value STRING +) +PARTITIONED BY (day STRING, pri bigint) +clustered by (key) sorted by (key) into 12 buckets +stored as RCFile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@bucket_big +PREHOOK: query: CREATE TABLE bucket_small +( +key BIGINT, +value string +) +PARTITIONED BY (pri bigint) +clustered by (key) sorted by (key) into 24 buckets +stored as RCFile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@bucket_small +POSTHOOK: query: CREATE TABLE bucket_small +( +key BIGINT, +value string +) +PARTITIONED BY (pri bigint) +clustered by (key) sorted by (key) into 24 
buckets +stored as RCFile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@bucket_small +PREHOOK: query: insert overwrite table bucket_small partition(pri) +select +key, +value, +file_tag as pri +from +stage_bucket_small +where file_tag between 1 and 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@stage_bucket_small +PREHOOK: Input: default@stage_bucket_small@file_tag=1 +PREHOOK: Input: default@stage_bucket_small@file_tag=2 +PREHOOK: Output: default@bucket_small +POSTHOOK: query: insert overwrite table bucket_small partition(pri) +select +key, +value, +file_tag as pri +from +stage_bucket_small +where file_tag between 1 and 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@stage_bucket_small +POSTHOOK: Input: default@stage_bucket_small@file_tag=1 +POSTHOOK: Input: default@stage_bucket_small@file_tag=2 +POSTHOOK: Output: default@bucket_small@pri=1 +POSTHOOK: Output: default@bucket_small@pri=2 +POSTHOOK: Lineage: bucket_small PARTITION(pri=1).key SIMPLE [(stage_bucket_small)stage_bucket_small.FieldSchema(name:key, type:bigint, comment:null), ] +POSTHOOK: Lineage: bucket_small PARTITION(pri=1).value SIMPLE [(stage_bucket_small)stage_bucket_small.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: bucket_small PARTITION(pri=2).key SIMPLE [(stage_bucket_small)stage_bucket_small.FieldSchema(name:key, type:bigint, comment:null), ] +POSTHOOK: Lineage: bucket_small PARTITION(pri=2).value SIMPLE [(stage_bucket_small)stage_bucket_small.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: insert overwrite table bucket_big partition(day,pri) +select key, value, 'day1' as day, 1 as pri +from stage_bucket_big +where file_tag='1' +PREHOOK: type: QUERY +PREHOOK: Input: default@stage_bucket_big +PREHOOK: Input: default@stage_bucket_big@file_tag=1 +PREHOOK: Output: default@bucket_big +POSTHOOK: query: insert overwrite table bucket_big partition(day,pri) +select key, value, 'day1' as day, 1 as pri +from stage_bucket_big +where file_tag='1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@stage_bucket_big +POSTHOOK: Input: default@stage_bucket_big@file_tag=1 +POSTHOOK: Output: default@bucket_big@day=day1/pri=1 +POSTHOOK: Lineage: bucket_big PARTITION(day=day1,pri=1).key SIMPLE [(stage_bucket_big)stage_bucket_big.FieldSchema(name:key, type:bigint, comment:null), ] +POSTHOOK: Lineage: bucket_big PARTITION(day=day1,pri=1).value SIMPLE [(stage_bucket_big)stage_bucket_big.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri + from + ( select key, value + from bucket_big where day='day1' ) a + left outer join + ( select key, value + from bucket_small + where pri between 1 and 2 ) b + on + (a.key = b.key) +PREHOOK: type: QUERY +POSTHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri + from + ( select key, value + from bucket_big where day='day1' ) a + left outer join + ( select key, value + from bucket_small + where pri between 1 and 2 ) b + on + (a.key = b.key) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: bucket_small + Statistics: Num rows: 236 Data size: 23364 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: bigint), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: 
Num rows: 236 Data size: 23364 Basic stats: COMPLETE Column stats: COMPLETE + Map Operator Tree: + TableScan + alias: bucket_big + Statistics: Num rows: 5 Data size: 495 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: bigint), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 495 Basic stats: COMPLETE Column stats: COMPLETE + Merge Join Operator + condition map: + Left Outer Join 0 to 1 + keys: + 0 _col0 (type: bigint) + 1 _col0 (type: bigint) + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 15 Data size: 2850 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: bigint), _col1 (type: string), _col3 (type: string), 'day1' (type: string), 1 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 15 Data size: 4230 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 15 Data size: 4230 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: llap + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a +left outer join +( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big +PREHOOK: Input: default@bucket_big@day=day1/pri=1 +PREHOOK: Input: default@bucket_small +PREHOOK: Input: default@bucket_small@pri=1 +PREHOOK: Input: default@bucket_small@pri=2 +#### A masked pattern was here #### +POSTHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a +left outer join +( select key, value + from bucket_small + where pri between 1 and 2 ) b +on (a.key = b.key) POSTHOOK: type: QUERY POSTHOOK: Input: default@bucket_big diff --git a/ql/src/test/results/clientpositive/spark/auto_sortmerge_join_16.q.out b/ql/src/test/results/clientpositive/spark/auto_sortmerge_join_16.q.out index 9939e834bd..3e2aafca62 100644 --- a/ql/src/test/results/clientpositive/spark/auto_sortmerge_join_16.q.out +++ b/ql/src/test/results/clientpositive/spark/auto_sortmerge_join_16.q.out @@ -136,58 +136,124 @@ POSTHOOK: type: LOAD #### A masked pattern was here #### POSTHOOK: Output: default@stage_bucket_big POSTHOOK: Output: default@stage_bucket_big@file_tag=1 -PREHOOK: query: insert overwrite table bucket_big partition(day,pri) -select -key, -value, -'day1' as day, -1 as pri -from -stage_bucket_big -where -file_tag='1' +PREHOOK: query: insert overwrite table bucket_big partition(day,pri) + select key, value, 'day1' as day, 1 as pri + from stage_bucket_big + where file_tag='1' PREHOOK: type: QUERY PREHOOK: Input: default@stage_bucket_big PREHOOK: Input: default@stage_bucket_big@file_tag=1 PREHOOK: Output: default@bucket_big -POSTHOOK: query: insert overwrite table bucket_big partition(day,pri) -select -key, -value, -'day1' as day, -1 as pri -from -stage_bucket_big -where -file_tag='1' +POSTHOOK: query: insert overwrite table bucket_big partition(day,pri) + select key, value, 'day1' as day, 1 as pri + from stage_bucket_big + where file_tag='1' POSTHOOK: type: 
QUERY POSTHOOK: Input: default@stage_bucket_big POSTHOOK: Input: default@stage_bucket_big@file_tag=1 POSTHOOK: Output: default@bucket_big@day=day1/pri=1 POSTHOOK: Lineage: bucket_big PARTITION(day=day1,pri=1).key SIMPLE [(stage_bucket_big)stage_bucket_big.FieldSchema(name:key, type:bigint, comment:null), ] POSTHOOK: Lineage: bucket_big PARTITION(day=day1,pri=1).value SIMPLE [(stage_bucket_big)stage_bucket_big.FieldSchema(name:value, type:string, comment:null), ] -PREHOOK: query: select -a.key , -a.value , -b.value , -'day1' as day, -1 as pri -from -( -select -key, -value -from bucket_big where day='day1' -) a -left outer join -( -select -key, -value -from bucket_small -where pri between 1 and 2 -) b -on +PREHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri + from + ( select key, value + from bucket_big where day='day1' ) a + left outer join + ( select key, value + from bucket_small + where pri between 1 and 2 ) b + on + (a.key = b.key) +PREHOOK: type: QUERY +POSTHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri + from + ( select key, value + from bucket_big where day='day1' ) a + left outer join + ( select key, value + from bucket_small + where pri between 1 and 2 ) b + on + (a.key = b.key) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 4), Map 3 (PARTITION-LEVEL SORT, 4) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: bucket_big + Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: bigint), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: bigint) + sort order: + + Map-reduce partition columns: _col0 (type: bigint) + Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Map 3 + Map Operator Tree: + TableScan + alias: bucket_small + Statistics: Num rows: 236 Data size: 2244 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: bigint), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 236 Data size: 2244 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: bigint) + sort order: + + Map-reduce partition columns: _col0 (type: bigint) + Statistics: Num rows: 236 Data size: 2244 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Reducer 2 + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join 0 to 1 + keys: + 0 _col0 (type: bigint) + 1 _col0 (type: bigint) + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint), _col1 (type: string), _col3 (type: string), 'day1' (type: string), 1 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a + left outer join + ( select key, value + from bucket_small + where pri between 1 and 2 ) b +on (a.key = b.key) PREHOOK: type: QUERY PREHOOK: Input: default@bucket_big @@ -196,28 +262,372 @@ PREHOOK: Input: default@bucket_small PREHOOK: Input: default@bucket_small@pri=1 PREHOOK: Input: default@bucket_small@pri=2 PREHOOK: Output: hdfs://### HDFS PATH ### -POSTHOOK: query: select -a.key , -a.value , -b.value , -'day1' as day, -1 as pri -from -( -select -key, -value -from bucket_big where day='day1' -) a -left outer join -( -select -key, -value -from bucket_small -where pri between 1 and 2 -) b -on +POSTHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a + left outer join + ( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_big@day=day1/pri=1 +POSTHOOK: Input: default@bucket_small +POSTHOOK: Input: default@bucket_small@pri=1 +POSTHOOK: Input: default@bucket_small@pri=2 +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +103 val_103 val_103 day1 1 +103 val_103 val_103 day1 1 +103 val_103 val_103 day1 1 +103 val_103 val_103 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +172 val_172 val_172 day1 1 +172 val_172 val_172 day1 1 +172 val_172 val_172 day1 1 +172 val_172 val_172 day1 1 +374 val_374 val_374 day1 1 +374 val_374 val_374 day1 1 +PREHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a +left outer join +( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +PREHOOK: type: QUERY +POSTHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a +left outer join +( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: bucket_big + Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: bigint), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE + Sorted Merge Bucket Map Join Operator + condition map: + Left Outer Join 0 to 1 + keys: + 0 _col0 (type: bigint) + 1 _col0 (type: bigint) + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint), _col1 (type: string), _col3 (type: 
string), 'day1' (type: string), 1 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a +left outer join +( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big +PREHOOK: Input: default@bucket_big@day=day1/pri=1 +PREHOOK: Input: default@bucket_small +PREHOOK: Input: default@bucket_small@pri=1 +PREHOOK: Input: default@bucket_small@pri=2 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri +from +( select key, value + from bucket_big where day='day1' ) a +left outer join +( select key, value + from bucket_small + where pri between 1 and 2 ) b +on +(a.key = b.key) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_big@day=day1/pri=1 +POSTHOOK: Input: default@bucket_small +POSTHOOK: Input: default@bucket_small@pri=1 +POSTHOOK: Input: default@bucket_small@pri=2 +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +0 val_0 val_0 day1 1 +103 val_103 val_103 day1 1 +103 val_103 val_103 day1 1 +103 val_103 val_103 day1 1 +103 val_103 val_103 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +169 val_169 val_169 day1 1 +172 val_172 val_172 day1 1 +172 val_172 val_172 day1 1 +172 val_172 val_172 day1 1 +172 val_172 val_172 day1 1 +374 val_374 val_374 day1 1 +374 val_374 val_374 day1 1 +PREHOOK: query: drop table bucket_big +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@bucket_big +PREHOOK: Output: default@bucket_big +POSTHOOK: query: drop table bucket_big +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@bucket_big +POSTHOOK: Output: default@bucket_big +PREHOOK: query: drop table bucket_small +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@bucket_small +PREHOOK: Output: default@bucket_small +POSTHOOK: query: drop table bucket_small +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@bucket_small +POSTHOOK: Output: default@bucket_small +PREHOOK: query: CREATE TABLE bucket_big +( +key BIGINT, +value STRING +) +PARTITIONED BY (day STRING, pri bigint) +clustered by (key) sorted by (key) into 12 buckets +stored as RCFile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@bucket_big +POSTHOOK: query: CREATE TABLE bucket_big +( +key BIGINT, +value STRING +) +PARTITIONED BY (day STRING, pri bigint) +clustered by (key) sorted by (key) into 12 buckets +stored as RCFile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@bucket_big +PREHOOK: query: CREATE TABLE bucket_small +( +key BIGINT, +value string +) +PARTITIONED BY (pri 
bigint)
+clustered by (key) sorted by (key) into 24 buckets
+stored as RCFile
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: CREATE TABLE bucket_small
+(
+key BIGINT,
+value string
+)
+PARTITIONED BY (pri bigint)
+clustered by (key) sorted by (key) into 24 buckets
+stored as RCFile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: insert overwrite table bucket_small partition(pri)
+select
+key,
+value,
+file_tag as pri
+from
+stage_bucket_small
+where file_tag between 1 and 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@stage_bucket_small
+PREHOOK: Input: default@stage_bucket_small@file_tag=1
+PREHOOK: Input: default@stage_bucket_small@file_tag=2
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: insert overwrite table bucket_small partition(pri)
+select
+key,
+value,
+file_tag as pri
+from
+stage_bucket_small
+where file_tag between 1 and 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@stage_bucket_small
+POSTHOOK: Input: default@stage_bucket_small@file_tag=1
+POSTHOOK: Input: default@stage_bucket_small@file_tag=2
+POSTHOOK: Output: default@bucket_small@pri=1
+POSTHOOK: Output: default@bucket_small@pri=2
+POSTHOOK: Lineage: bucket_small PARTITION(pri=1).key SIMPLE [(stage_bucket_small)stage_bucket_small.FieldSchema(name:key, type:bigint, comment:null), ]
+POSTHOOK: Lineage: bucket_small PARTITION(pri=1).value SIMPLE [(stage_bucket_small)stage_bucket_small.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: bucket_small PARTITION(pri=2).key SIMPLE [(stage_bucket_small)stage_bucket_small.FieldSchema(name:key, type:bigint, comment:null), ]
+POSTHOOK: Lineage: bucket_small PARTITION(pri=2).value SIMPLE [(stage_bucket_small)stage_bucket_small.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: insert overwrite table bucket_big partition(day,pri)
+select key, value, 'day1' as day, 1 as pri
+from stage_bucket_big
+where file_tag='1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@stage_bucket_big
+PREHOOK: Input: default@stage_bucket_big@file_tag=1
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: insert overwrite table bucket_big partition(day,pri)
+select key, value, 'day1' as day, 1 as pri
+from stage_bucket_big
+where file_tag='1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@stage_bucket_big
+POSTHOOK: Input: default@stage_bucket_big@file_tag=1
+POSTHOOK: Output: default@bucket_big@day=day1/pri=1
+POSTHOOK: Lineage: bucket_big PARTITION(day=day1,pri=1).key SIMPLE [(stage_bucket_big)stage_bucket_big.FieldSchema(name:key, type:bigint, comment:null), ]
+POSTHOOK: Lineage: bucket_big PARTITION(day=day1,pri=1).value SIMPLE [(stage_bucket_big)stage_bucket_big.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri
+ from
+ ( select key, value
+ from bucket_big where day='day1' ) a
+ left outer join
+ ( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+ on
+ (a.key = b.key)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri
+ from
+ ( select key, value
+ from bucket_big where day='day1' ) a
+ left outer join
+ ( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+ on
+ (a.key = b.key)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: bucket_big
+                  Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: key (type: bigint), value (type: string)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE
+                    Sorted Merge Bucket Map Join Operator
+                      condition map:
+                           Left Outer Join 0 to 1
+                      keys:
+                        0 _col0 (type: bigint)
+                        1 _col0 (type: bigint)
+                      outputColumnNames: _col0, _col1, _col3
+                      Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE
+                      Select Operator
+                        expressions: _col0 (type: bigint), _col1 (type: string), _col3 (type: string), 'day1' (type: string), 1 (type: int)
+                        outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                        Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE
+                        File Output Operator
+                          compressed: false
+                          Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE
+                          table:
+                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri
+from
+( select key, value
+ from bucket_big where day='day1' ) a
+left outer join
+( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+on
+(a.key = b.key)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@day=day1/pri=1
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@pri=1
+PREHOOK: Input: default@bucket_small@pri=2
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri
+from
+( select key, value
+ from bucket_big where day='day1' ) a
+left outer join
+( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+on
 (a.key = b.key)
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@bucket_big
diff --git a/ql/src/test/results/clientpositive/spark/auto_sortmerge_join_16.q.out_spark b/ql/src/test/results/clientpositive/spark/auto_sortmerge_join_16.q.out_spark
index 91408df129..6aa60ba5a9 100644
--- a/ql/src/test/results/clientpositive/spark/auto_sortmerge_join_16.q.out_spark
+++ b/ql/src/test/results/clientpositive/spark/auto_sortmerge_join_16.q.out_spark
@@ -136,58 +136,124 @@ POSTHOOK: type: LOAD
 #### A masked pattern was here ####
 POSTHOOK: Output: default@stage_bucket_big
 POSTHOOK: Output: default@stage_bucket_big@file_tag=1
-PREHOOK: query: insert overwrite table bucket_big partition(day,pri)
-select
-key,
-value,
-'day1' as day,
-1 as pri
-from
-stage_bucket_big
-where
-file_tag='1'
+PREHOOK: query: insert overwrite table bucket_big partition(day,pri)
+ select key, value, 'day1' as day, 1 as pri
+ from stage_bucket_big
+ where file_tag='1'
 PREHOOK: type: QUERY
 PREHOOK: Input: default@stage_bucket_big
 PREHOOK: Input: default@stage_bucket_big@file_tag=1
 PREHOOK: Output: default@bucket_big
-POSTHOOK: query: insert overwrite table bucket_big partition(day,pri)
-select
-key,
-value,
-'day1' as day,
-1 as pri
-from
-stage_bucket_big
-where
-file_tag='1'
+POSTHOOK: query: insert overwrite table bucket_big partition(day,pri)
+ select key, value, 'day1' as day, 1 as pri
+ from stage_bucket_big
+ where file_tag='1'
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@stage_bucket_big
 POSTHOOK: Input: default@stage_bucket_big@file_tag=1
 POSTHOOK: Output: default@bucket_big@day=day1/pri=1
 POSTHOOK: Lineage: bucket_big PARTITION(day=day1,pri=1).key SIMPLE [(stage_bucket_big)stage_bucket_big.FieldSchema(name:key, type:bigint, comment:null), ]
 POSTHOOK: Lineage: bucket_big PARTITION(day=day1,pri=1).value SIMPLE [(stage_bucket_big)stage_bucket_big.FieldSchema(name:value, type:string, comment:null), ]
-PREHOOK: query: select
-a.key ,
-a.value ,
-b.value ,
-'day1' as day,
-1 as pri
-from
-(
-select
-key,
-value
-from bucket_big where day='day1'
-) a
-left outer join
-(
-select
-key,
-value
-from bucket_small
-where pri between 1 and 2
-) b
-on
+PREHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri
+ from
+ ( select key, value
+ from bucket_big where day='day1' ) a
+ left outer join
+ ( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+ on
+ (a.key = b.key)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri
+ from
+ ( select key, value
+ from bucket_big where day='day1' ) a
+ left outer join
+ ( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+ on
+ (a.key = b.key)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+      Edges:
+        Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 2), Map 3 (PARTITION-LEVEL SORT, 2)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: bucket_big
+                  Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: key (type: bigint), value (type: string)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: bigint)
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: bigint)
+                      Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE
+                      value expressions: _col1 (type: string)
+        Map 3
+            Map Operator Tree:
+                TableScan
+                  alias: bucket_small
+                  Statistics: Num rows: 236 Data size: 2244 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: key (type: bigint), value (type: string)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 236 Data size: 2244 Basic stats: COMPLETE Column stats: NONE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: bigint)
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: bigint)
+                      Statistics: Num rows: 236 Data size: 2244 Basic stats: COMPLETE Column stats: NONE
+                      value expressions: _col1 (type: string)
+        Reducer 2
+            Reduce Operator Tree:
+              Join Operator
+                condition map:
+                     Left Outer Join 0 to 1
+                keys:
+                  0 _col0 (type: bigint)
+                  1 _col0 (type: bigint)
+                outputColumnNames: _col0, _col1, _col3
+                Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: _col0 (type: bigint), _col1 (type: string), _col3 (type: string), 'day1' (type: string), 1 (type: int)
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                  Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri
+from
+( select key, value
+ from bucket_big where day='day1' ) a
+ left outer join
+ ( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+on
 (a.key = b.key)
 PREHOOK: type: QUERY
 PREHOOK: Input: default@bucket_big
@@ -196,28 +262,372 @@ PREHOOK: Input: default@bucket_small
 PREHOOK: Input: default@bucket_small@pri=1
 PREHOOK: Input: default@bucket_small@pri=2
 #### A masked pattern was here ####
-POSTHOOK: query: select
-a.key ,
-a.value ,
-b.value ,
-'day1' as day,
-1 as pri
-from
-(
-select
-key,
-value
-from bucket_big where day='day1'
-) a
-left outer join
-(
-select
-key,
-value
-from bucket_small
-where pri between 1 and 2
-) b
-on
+POSTHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri
+from
+( select key, value
+ from bucket_big where day='day1' ) a
+ left outer join
+ ( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+on
+(a.key = b.key)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@day=day1/pri=1
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@pri=1
+POSTHOOK: Input: default@bucket_small@pri=2
+#### A masked pattern was here ####
+0 val_0 val_0 day1 1
+0 val_0 val_0 day1 1
+0 val_0 val_0 day1 1
+0 val_0 val_0 day1 1
+0 val_0 val_0 day1 1
+0 val_0 val_0 day1 1
+103 val_103 val_103 day1 1
+103 val_103 val_103 day1 1
+103 val_103 val_103 day1 1
+103 val_103 val_103 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+172 val_172 val_172 day1 1
+172 val_172 val_172 day1 1
+172 val_172 val_172 day1 1
+172 val_172 val_172 day1 1
+374 val_374 val_374 day1 1
+374 val_374 val_374 day1 1
+PREHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri
+from
+( select key, value
+ from bucket_big where day='day1' ) a
+left outer join
+( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+on
+(a.key = b.key)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri
+from
+( select key, value
+ from bucket_big where day='day1' ) a
+left outer join
+( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+on
+(a.key = b.key)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: bucket_big
+                  Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: key (type: bigint), value (type: string)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE
+                    Sorted Merge Bucket Map Join Operator
+                      condition map:
+                           Left Outer Join 0 to 1
+                      keys:
+                        0 _col0 (type: bigint)
+                        1 _col0 (type: bigint)
+                      outputColumnNames: _col0, _col1, _col3
+                      Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE
+                      Select Operator
+                        expressions: _col0 (type: bigint), _col1 (type: string), _col3 (type: string), 'day1' (type: string), 1 (type: int)
+                        outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                        Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE
+                        File Output Operator
+                          compressed: false
+                          Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE
+                          table:
+                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri
+from
+( select key, value
+ from bucket_big where day='day1' ) a
+left outer join
+( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+on
+(a.key = b.key)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@day=day1/pri=1
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@pri=1
+PREHOOK: Input: default@bucket_small@pri=2
+#### A masked pattern was here ####
+POSTHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri
+from
+( select key, value
+ from bucket_big where day='day1' ) a
+left outer join
+( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+on
+(a.key = b.key)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@day=day1/pri=1
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@pri=1
+POSTHOOK: Input: default@bucket_small@pri=2
+#### A masked pattern was here ####
+0 val_0 val_0 day1 1
+0 val_0 val_0 day1 1
+0 val_0 val_0 day1 1
+0 val_0 val_0 day1 1
+0 val_0 val_0 day1 1
+0 val_0 val_0 day1 1
+103 val_103 val_103 day1 1
+103 val_103 val_103 day1 1
+103 val_103 val_103 day1 1
+103 val_103 val_103 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+169 val_169 val_169 day1 1
+172 val_172 val_172 day1 1
+172 val_172 val_172 day1 1
+172 val_172 val_172 day1 1
+172 val_172 val_172 day1 1
+374 val_374 val_374 day1 1
+374 val_374 val_374 day1 1
+PREHOOK: query: drop table bucket_big
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@bucket_big
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: drop table bucket_big
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: drop table bucket_small
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@bucket_small
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: drop table bucket_small
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: CREATE TABLE bucket_big
+(
+key BIGINT,
+value STRING
+)
+PARTITIONED BY (day STRING, pri bigint)
+clustered by (key) sorted by (key) into 12 buckets
+stored as RCFile
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: CREATE TABLE bucket_big
+(
+key BIGINT,
+value STRING
+)
+PARTITIONED BY (day STRING, pri bigint)
+clustered by (key) sorted by (key) into 12 buckets
+stored as RCFile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: CREATE TABLE bucket_small
+(
+key BIGINT,
+value string
+)
+PARTITIONED BY (pri bigint)
+clustered by (key) sorted by (key) into 24 buckets
+stored as RCFile
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: CREATE TABLE bucket_small
+(
+key BIGINT,
+value string
+)
+PARTITIONED BY (pri bigint)
+clustered by (key) sorted by (key) into 24 buckets
+stored as RCFile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: insert overwrite table bucket_small partition(pri)
+select
+key,
+value,
+file_tag as pri
+from
+stage_bucket_small
+where file_tag between 1 and 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@stage_bucket_small
+PREHOOK: Input: default@stage_bucket_small@file_tag=1
+PREHOOK: Input: default@stage_bucket_small@file_tag=2
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: insert overwrite table bucket_small partition(pri)
+select
+key,
+value,
+file_tag as pri
+from
+stage_bucket_small
+where file_tag between 1 and 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@stage_bucket_small
+POSTHOOK: Input: default@stage_bucket_small@file_tag=1
+POSTHOOK: Input: default@stage_bucket_small@file_tag=2
+POSTHOOK: Output: default@bucket_small@pri=1
+POSTHOOK: Output: default@bucket_small@pri=2
+POSTHOOK: Lineage: bucket_small PARTITION(pri=1).key SIMPLE [(stage_bucket_small)stage_bucket_small.FieldSchema(name:key, type:bigint, comment:null), ]
+POSTHOOK: Lineage: bucket_small PARTITION(pri=1).value SIMPLE [(stage_bucket_small)stage_bucket_small.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: bucket_small PARTITION(pri=2).key SIMPLE [(stage_bucket_small)stage_bucket_small.FieldSchema(name:key, type:bigint, comment:null), ]
+POSTHOOK: Lineage: bucket_small PARTITION(pri=2).value SIMPLE [(stage_bucket_small)stage_bucket_small.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: insert overwrite table bucket_big partition(day,pri)
+select key, value, 'day1' as day, 1 as pri
+from stage_bucket_big
+where file_tag='1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@stage_bucket_big
+PREHOOK: Input: default@stage_bucket_big@file_tag=1
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: insert overwrite table bucket_big partition(day,pri)
+select key, value, 'day1' as day, 1 as pri
+from stage_bucket_big
+where file_tag='1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@stage_bucket_big
+POSTHOOK: Input: default@stage_bucket_big@file_tag=1
+POSTHOOK: Output: default@bucket_big@day=day1/pri=1
+POSTHOOK: Lineage: bucket_big PARTITION(day=day1,pri=1).key SIMPLE [(stage_bucket_big)stage_bucket_big.FieldSchema(name:key, type:bigint, comment:null), ]
+POSTHOOK: Lineage: bucket_big PARTITION(day=day1,pri=1).value SIMPLE [(stage_bucket_big)stage_bucket_big.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri
+ from
+ ( select key, value
+ from bucket_big where day='day1' ) a
+ left outer join
+ ( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+ on
+ (a.key = b.key)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select a.key , a.value , b.value , 'day1' as day, 1 as pri
+ from
+ ( select key, value
+ from bucket_big where day='day1' ) a
+ left outer join
+ ( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+ on
+ (a.key = b.key)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: bucket_big
+                  Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: key (type: bigint), value (type: string)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 5 Data size: 46 Basic stats: COMPLETE Column stats: NONE
+                    Sorted Merge Bucket Map Join Operator
+                      condition map:
+                           Left Outer Join 0 to 1
+                      keys:
+                        0 _col0 (type: bigint)
+                        1 _col0 (type: bigint)
+                      outputColumnNames: _col0, _col1, _col3
+                      Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE
+                      Select Operator
+                        expressions: _col0 (type: bigint), _col1 (type: string), _col3 (type: string), 'day1' (type: string), 1 (type: int)
+                        outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                        Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE
+                        File Output Operator
+                          compressed: false
+                          Statistics: Num rows: 259 Data size: 2468 Basic stats: COMPLETE Column stats: NONE
+                          table:
+                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri
+from
+( select key, value
+ from bucket_big where day='day1' ) a
+left outer join
+( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+on
+(a.key = b.key)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@day=day1/pri=1
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@pri=1
+PREHOOK: Input: default@bucket_small@pri=2
+#### A masked pattern was here ####
+POSTHOOK: query: select a.key , a.value , b.value , 'day1' as day, 1 as pri
+from
+( select key, value
+ from bucket_big where day='day1' ) a
+left outer join
+( select key, value
+ from bucket_small
+ where pri between 1 and 2 ) b
+on
 (a.key = b.key)
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@bucket_big