diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java
index 24a812d..5d41fa1 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java
@@ -29,6 +29,7 @@
 import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator;
 import org.apache.hadoop.hive.ql.exec.vector.VectorReduceSinkOperator;
 import org.apache.hadoop.hive.ql.exec.vector.VectorSelectOperator;
+import org.apache.hadoop.hive.ql.exec.vector.VectorSMBMapJoinOperator;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.plan.CollectDesc;
@@ -121,6 +122,7 @@ public OpTuple(Class<T> descClass, Class<? extends Operator<?>> opClass) {
     vectorOpvec.add(new OpTuple<SelectDesc>(SelectDesc.class, VectorSelectOperator.class));
     vectorOpvec.add(new OpTuple<GroupByDesc>(GroupByDesc.class, VectorGroupByOperator.class));
     vectorOpvec.add(new OpTuple<MapJoinDesc>(MapJoinDesc.class, VectorMapJoinOperator.class));
+    vectorOpvec.add(new OpTuple<SMBJoinDesc>(SMBJoinDesc.class, VectorSMBMapJoinOperator.class));
     vectorOpvec.add(new OpTuple<ReduceSinkDesc>(ReduceSinkDesc.class, VectorReduceSinkOperator.class));
     vectorOpvec.add(new OpTuple<FileSinkDesc>(FileSinkDesc.class, VectorFileSinkOperator.class));
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java
index 81a1232..7a7dc88 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java
@@ -226,6 +226,11 @@ private byte tagForAlias(String alias) {
   public void cleanUpInputFileChangedOp() throws HiveException {
     inputFileChanged = true;
   }
+
+  protected List<Object> smbJoinComputeKeys(Object row, byte alias) throws HiveException {
+    return JoinUtil.computeKeys(row, joinKeys[alias],
+        joinKeysObjectInspectors[alias]);
+  }
 
   @Override
   public void processOp(Object row, int tag) throws HiveException {
@@ -260,8 +265,8 @@ public void processOp(Object row, int tag) throws HiveException {
     byte alias = (byte) tag;
 
     // compute keys and values as StandardObjects
-    ArrayList<Object> key = JoinUtil.computeKeys(row, joinKeys[alias],
-        joinKeysObjectInspectors[alias]);
+    List<Object> key = smbJoinComputeKeys(row, alias);
+
     List<Object> value = getFilteredValue(alias, row);
 
@@ -495,7 +500,7 @@ private void putDummyOrEmpty(Byte i) {
     return smallestOne == null ? null : result;
   }
 
-  private boolean processKey(byte alias, ArrayList<Object> key)
+  private boolean processKey(byte alias, List<Object> key)
       throws HiveException {
     List<Object> keyWritable = keyWritables[alias];
     if (keyWritable == null) {
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
index 19f7d79..a6e1a90 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
@@ -3077,5 +3077,19 @@ private static void createTmpDirs(Configuration conf,
       }
     }
   }
+
+  /**
+   * Returns true if a plan is both configured for vectorized execution
+   * and vectorization is allowed. The plan may be configured for vectorization
+   * but vectorization disallowed, e.g. for FetchOperator execution.
+   */
+  public static boolean isVectorMode(Configuration conf) {
+    if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED) &&
+        Utilities.getPlanPath(conf) != null && Utilities
+        .getMapRedWork(conf).getMapWork().getVectorMode()) {
+      return true;
+    }
+    return false;
+  }
 }
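The double guard in isVectorMode matters: HIVE_VECTORIZATION_ENABLED reflects the session setting, while MapWork.getVectorMode() reflects what the Vectorizer actually compiled into the plan; both must agree before a reader hands out batches. A minimal sketch of the intended call pattern, not part of the patch (the class and method names below are hypothetical, for illustration only):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.ql.exec.Utilities;

    public final class ReaderModeSketch {
      // Pick the reader family for a split: hand out batches only when the
      // compiled plan opted into vectorization AND the session flag is on.
      public static String readerKind(Configuration conf) {
        return Utilities.isVectorMode(conf) ? "vectorized-batch" : "row-by-row";
      }
    }
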
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java
new file mode 100644
index 0000000..619ef4f
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java
@@ -0,0 +1,307 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
+import org.apache.hadoop.hive.ql.exec.JoinUtil;
+import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+
+/**
+ * VectorSMBMapJoinOperator.
+ * Vectorized version of the SMB map join operator: rows from the big-table
+ * batch are joined one at a time by delegating to the row-mode superclass.
+ */
+public class VectorSMBMapJoinOperator extends SMBMapJoinOperator implements VectorizationContextRegion {
+
+  private static final Log LOG = LogFactory.getLog(
+      VectorSMBMapJoinOperator.class.getName());
+
+  private static final long serialVersionUID = 1L;
+
+  private int tagLen;
+
+  private transient VectorizedRowBatch outputBatch;
+  private transient VectorizationContext vOutContext = null;
+  private transient VectorizedRowBatchCtx vrbCtx = null;
+
+  private String fileKey;
+
+  private VectorExpression[] bigTableValueExpressions;
+
+  private VectorExpression[] bigTableFilterExpressions;
+
+  private VectorExpression[] keyExpressions;
+
+  private VectorExpressionWriter[] keyOutputWriters;
+
+  private transient VectorHashKeyWrapperBatch keyWrapperBatch;
+
+  private transient Map<ObjectInspector, VectorColumnAssign[]> outputVectorAssigners;
+
+  private transient int batchIndex = -1;
+
+  private transient VectorHashKeyWrapper[] keyValues;
+
+  private transient SMBJoinKeyEvaluator keyEvaluator;
+
+  private transient VectorExpressionWriter[] valueWriters;
+
+  private interface SMBJoinKeyEvaluator {
+    List<Object> evaluate(VectorHashKeyWrapper kw) throws HiveException;
+  }
+
+  public VectorSMBMapJoinOperator() {
+    super();
+  }
+
+  public VectorSMBMapJoinOperator(VectorizationContext vContext, OperatorDesc conf)
+      throws HiveException {
+    this();
+    SMBJoinDesc desc = (SMBJoinDesc) conf;
+    this.conf = desc;
+
+    order = desc.getTagOrder();
+    numAliases = desc.getExprs().size();
+    posBigTable = (byte) desc.getPosBigTable();
+    filterMaps = desc.getFilterMap();
+    tagLen = desc.getTagLength();
+    noOuterJoin = desc.isNoOuterJoin();
+
+    Map<Byte, List<ExprNodeDesc>> filterExpressions = desc.getFilters();
+    bigTableFilterExpressions = vContext.getVectorExpressions(filterExpressions.get(posBigTable),
+        VectorExpressionDescriptor.Mode.FILTER);
+
+    List<ExprNodeDesc> keyDesc = desc.getKeys().get(posBigTable);
+    keyExpressions = vContext.getVectorExpressions(keyDesc);
+    keyOutputWriters = VectorExpressionWriterFactory.getExpressionWriters(keyDesc);
+
+    Map<Byte, List<ExprNodeDesc>> exprs = desc.getExprs();
+    bigTableValueExpressions = vContext.getVectorExpressions(exprs.get(posBigTable));
+
+    List<String> outColNames = desc.getOutputColumnNames();
+
+    Map<String, Integer> mapOutCols = new HashMap<String, Integer>(outColNames.size());
+
+    int outColIndex = 0;
+    for (String outCol : outColNames) {
+      mapOutCols.put(outCol, outColIndex++);
+    }
+
+    vOutContext = new VectorizationContext(mapOutCols, outColIndex);
+    vOutContext.setFileKey(vContext.getFileKey() + "/SMB_JOIN_" + desc.getBigTableAlias());
+    this.fileKey = vOutContext.getFileKey();
+  }
+
+  @Override
+  protected List<Object> smbJoinComputeKeys(Object row, byte alias) throws HiveException {
+    if (alias == this.posBigTable) {
+      VectorizedRowBatch inBatch = (VectorizedRowBatch) row;
+      return keyEvaluator.evaluate(keyValues[batchIndex]);
+    } else {
+      return super.smbJoinComputeKeys(row, alias);
+    }
+  }
+
+  @Override
+  protected void initializeOp(Configuration hconf) throws HiveException {
+    super.initializeOp(hconf);
+
+    vrbCtx = new VectorizedRowBatchCtx();
+    vrbCtx.init(hconf, this.fileKey, (StructObjectInspector) this.outputObjInspector);
+
+    outputBatch = vrbCtx.createVectorizedRowBatch();
+
+    keyWrapperBatch = VectorHashKeyWrapperBatch.compileKeyWrapperBatch(keyExpressions);
+
+    outputVectorAssigners = new HashMap<ObjectInspector, VectorColumnAssign[]>();
+
+    // This key evaluator translates from the vectorized VectorHashKeyWrapper format
+    // into the row-mode MapJoinKey
+    keyEvaluator = new SMBJoinKeyEvaluator() {
+      private List<Object> key;
+
+      public SMBJoinKeyEvaluator init() {
+        key = new ArrayList<Object>();
+        for (int i = 0; i < keyExpressions.length; ++i) {
+          key.add(null);
+        }
+        return this;
+      }
+
+      @Override
+      public List<Object> evaluate(VectorHashKeyWrapper kw) throws HiveException {
+        for (int i = 0; i < keyExpressions.length; ++i) {
+          key.set(i, keyWrapperBatch.getWritableKeyValue(kw, i, keyOutputWriters[i]));
+        }
+        return key;
+      }
+    }.init();
+
+    Map<Byte, List<ExprNodeDesc>> valueExpressions = conf.getExprs();
+    List<ExprNodeDesc> bigTableExpressions = valueExpressions.get(posBigTable);
+
+    // We're hijacking the big table evaluators and replacing them with our own custom ones
+    // which are going to return values from the input batch vector expressions
+    List<ExprNodeEvaluator> vectorNodeEvaluators = new ArrayList<ExprNodeEvaluator>(bigTableExpressions.size());
+
+    VectorExpressionWriterFactory.processVectorExpressions(
+        bigTableExpressions,
+        new VectorExpressionWriterFactory.ListOIDClosure() {
+
+          @Override
+          public void assign(VectorExpressionWriter[] writers, List<ObjectInspector> oids) {
+            valueWriters = writers;
+            joinValuesObjectInspectors[posBigTable] = oids;
+          }
+        });
+
+    for (int i = 0; i < bigTableExpressions.size(); ++i) {
+      ExprNodeDesc desc = bigTableExpressions.get(i);
+      VectorExpression vectorExpr = bigTableValueExpressions[i];
+
+      // A batch-aware evaluator: reads the value from the evaluated vector
+      // expression's output column instead of from a row object
+      ExprNodeEvaluator eval = new ExprNodeEvaluator<ExprNodeDesc>(desc) {
+        int columnIndex;
+        int writerIndex;
+
+        public ExprNodeEvaluator initVectorExpr(int columnIndex, int writerIndex) {
+          this.columnIndex = columnIndex;
+          this.writerIndex = writerIndex;
+          return this;
+        }
+
+        @Override
+        public ObjectInspector initialize(ObjectInspector rowInspector) throws HiveException {
+          throw new HiveException("should never reach here");
+        }
+
+        @Override
+        protected Object _evaluate(Object row, int version) throws HiveException {
+          VectorizedRowBatch inBatch = (VectorizedRowBatch) row;
+          int rowIndex = inBatch.selectedInUse ? inBatch.selected[batchIndex] : batchIndex;
+          return valueWriters[writerIndex].writeValue(inBatch.cols[columnIndex], rowIndex);
+        }
+      }.initVectorExpr(vectorExpr.getOutputColumn(), i);
+      vectorNodeEvaluators.add(eval);
+    }
+    // Now replace the old evaluators with our own
+    joinValues[posBigTable] = vectorNodeEvaluators;
+
+  }
+
+  @Override
+  public void processOp(Object row, int tag) throws HiveException {
+    byte alias = (byte) tag;
+
+    if (alias != this.posBigTable) {
+      super.processOp(row, tag);
+    } else {
+
+      VectorizedRowBatch inBatch = (VectorizedRowBatch) row;
+
+      if (null != bigTableFilterExpressions) {
+        for (VectorExpression ve : bigTableFilterExpressions) {
+          ve.evaluate(inBatch);
+        }
+      }
+
+      if (null != bigTableValueExpressions) {
+        for (VectorExpression ve : bigTableValueExpressions) {
+          ve.evaluate(inBatch);
+        }
+      }
+
+      keyWrapperBatch.evaluateBatch(inBatch);
+      keyValues = keyWrapperBatch.getVectorHashKeyWrappers();
+
+      // This implementation of vectorized JOIN is delegating all the work
+      // to the row-mode implementation by hijacking the big table node evaluators
+      // and calling the row-mode join processOp for each row in the input batch.
+      // Since the JOIN operator is not fully vectorized anyway at the moment
+      // (due to the use of row-mode small tables) this is a reasonable trade-off.
+      //
+      for (batchIndex = 0; batchIndex < inBatch.size; ++batchIndex) {
+        super.processOp(row, tag);
+      }
+
+      // Set these two to invalid values so any attempt to use them
+      // outside the inner loop results in NPE/OutOfBounds errors
+      batchIndex = -1;
+      keyValues = null;
+    }
+  }
+
+  @Override
+  public void closeOp(boolean aborted) throws HiveException {
+    super.closeOp(aborted);
+    if (!aborted && 0 < outputBatch.size) {
+      flushOutput();
+    }
+  }
+
+  @Override
+  protected void internalForward(Object row, ObjectInspector outputOI) throws HiveException {
+    Object[] values = (Object[]) row;
+    VectorColumnAssign[] vcas = outputVectorAssigners.get(outputOI);
+    if (null == vcas) {
+      Map<String, Map<String, Integer>> allColumnMaps = Utilities.
+          getMapRedWork(hconf).getMapWork().getScratchColumnMap();
+      Map<String, Integer> columnMap = allColumnMaps.get(fileKey);
+      vcas = VectorColumnAssignFactory.buildAssigners(
+          outputBatch, outputOI, columnMap, conf.getOutputColumnNames());
+      outputVectorAssigners.put(outputOI, vcas);
+    }
+    for (int i = 0; i < values.length; ++i) {
+      vcas[i].assignObjectValue(values[i], outputBatch.size);
+    }
+    ++outputBatch.size;
+    if (outputBatch.size == VectorizedRowBatch.DEFAULT_SIZE) {
+      flushOutput();
+    }
+  }
+
+  private void flushOutput() throws HiveException {
+    forward(outputBatch, null);
+    outputBatch.reset();
+  }
+
+  @Override
+  public VectorizationContext getOuputVectorizationContext() {
+    return vOutContext;
+  }
+}
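Worth calling out: the hijacked evaluators above address the batch through batchIndex and honor the selection vector that upstream filters may have applied. A standalone sketch of that physical-row resolution (the helper class and method names are hypothetical):

    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

    final class BatchIndexSketch {
      // Map a logical position in [0, batch.size) to the physical row,
      // indirecting through batch.selected when a filter compacted the batch.
      static int physicalRow(VectorizedRowBatch batch, int logicalIndex) {
        return batch.selectedInUse ? batch.selected[logicalIndex] : logicalIndex;
      }
    }
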
[...]
 public boolean validateInput(FileSystem fs, HiveConf conf, ArrayList<FileStatus> files) throws IOException {
-    boolean vectorPath =
-        conf.getBoolean(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED.toString(), true);
+    boolean vectorPath = Utilities.isVectorMode(conf);
     if (vectorPath) {
       return vrcif.validateInput(fs, conf, files);
     } else {
@@ -133,7 +133,7 @@ public boolean validateInput(FileSystem fs, HiveConf conf, ArrayList<FileStatus> files)
   @Override
   public RecordReader getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
       throws IOException {
-    boolean vectorPath = conf.getBoolean(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED.toString(), true);
+    boolean vectorPath = Utilities.isVectorMode(conf);
     if (vectorPath) {
       RecordReader vrcrr = vrcif.getRecordReader(split, conf, reporter);
       return new CommonOrcRecordReader(vrcrr, null);
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index abdc165..0684dc9 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -268,11 +268,7 @@ public boolean validateInput(FileSystem fs, HiveConf conf,
   }
 
   private boolean isVectorMode(Configuration conf) {
-    if (Utilities.getPlanPath(conf) != null && Utilities
-        .getMapRedWork(conf).getMapWork().getVectorMode()) {
-      return true;
-    }
-    return false;
+    return Utilities.isVectorMode(conf);
   }
 
   /**
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 7859e56..45a8f29 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -43,6 +43,7 @@
 import org.apache.hadoop.hive.ql.exec.Operator;
 import org.apache.hadoop.hive.ql.exec.OperatorFactory;
 import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
 import org.apache.hadoop.hive.ql.exec.SelectOperator;
 import org.apache.hadoop.hive.ql.exec.TableScanOperator;
 import org.apache.hadoop.hive.ql.exec.Task;
@@ -76,6 +77,7 @@
 import org.apache.hadoop.hive.ql.plan.MapWork;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.PartitionDesc;
+import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
 import org.apache.hadoop.hive.ql.plan.TableScanDesc;
 import org.apache.hadoop.hive.ql.plan.api.OperatorType;
 import org.apache.hadoop.hive.ql.udf.UDFAcos;
@@ -535,6 +537,8 @@ boolean validateOperator(Operator<? extends OperatorDesc> op) {
       case MAPJOIN:
         if (op instanceof MapJoinOperator) {
           ret = validateMapJoinOperator((MapJoinOperator) op);
+        } else if (op instanceof SMBMapJoinOperator) {
+          ret = validateSMBMapJoinOperator((SMBMapJoinOperator) op);
         }
         break;
       case GROUPBY:
@@ -563,6 +567,12 @@ boolean validateOperator(Operator<? extends OperatorDesc> op) {
     return ret;
   }
 
+  private boolean validateSMBMapJoinOperator(SMBMapJoinOperator op) {
+    SMBJoinDesc desc = op.getConf();
+    // Validation is the same as for map join, since the 'small' tables are not vectorized
+    return validateMapJoinDesc(desc);
+  }
+
   private boolean validateTableScanOperator(TableScanOperator op) {
     TableScanDesc desc = op.getConf();
     return !desc.isGatherStats();
   }
@@ -570,6 +580,10 @@ private boolean validateTableScanOperator(TableScanOperator op) {
 
   private boolean validateMapJoinOperator(MapJoinOperator op) {
     MapJoinDesc desc = op.getConf();
+    return validateMapJoinDesc(desc);
+  }
+
+  private boolean validateMapJoinDesc(MapJoinDesc desc) {
     byte posBigTable = (byte) desc.getPosBigTable();
     List<ExprNodeDesc> filterExprs = desc.getFilters().get(posBigTable);
     List<ExprNodeDesc> keyExprs = desc.getKeys().get(posBigTable);
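Sharing validateMapJoinDesc works because SMBJoinDesc extends MapJoinDesc in Hive's plan model, so the same big-table filter/key/value checks apply to both operators; in either case only the big table is vectorized. A hedged sketch of that dispatch shape (the validate bodies here are placeholders, not the patch's actual checks):

    import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
    import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;

    final class JoinValidationSketch {
      // One descriptor-level check serves both join flavors.
      static boolean validateDesc(MapJoinDesc desc) {
        byte posBigTable = (byte) desc.getPosBigTable();
        return desc.getKeys().get(posBigTable) != null; // placeholder check
      }

      static boolean validateSmb(SMBJoinDesc desc) {
        return validateDesc(desc); // SMBJoinDesc is-a MapJoinDesc
      }
    }
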
diff --git ql/src/test/queries/clientpositive/vectorized_bucketmapjoin1.q ql/src/test/queries/clientpositive/vectorized_bucketmapjoin1.q
new file mode 100644
index 0000000..e309713
--- /dev/null
+++ ql/src/test/queries/clientpositive/vectorized_bucketmapjoin1.q
@@ -0,0 +1,46 @@
+create table vsmb_bucket_1(key int, value string)
+  CLUSTERED BY (key)
+  SORTED BY (key) INTO 1 BUCKETS
+  STORED AS ORC;
+create table vsmb_bucket_2(key int, value string)
+  CLUSTERED BY (key)
+  SORTED BY (key) INTO 1 BUCKETS
+  STORED AS ORC;
+
+create table vsmb_bucket_RC(key int, value string)
+  CLUSTERED BY (key)
+  SORTED BY (key) INTO 1 BUCKETS
+  STORED AS RCFILE;
+
+create table vsmb_bucket_TXT(key int, value string)
+  CLUSTERED BY (key)
+  SORTED BY (key) INTO 1 BUCKETS
+  STORED AS TEXTFILE;
+
+insert into table vsmb_bucket_1 select cint, cstring1 from alltypesorc limit 2;
+insert into table vsmb_bucket_2 select cint, cstring1 from alltypesorc limit 2;
+insert into table vsmb_bucket_RC select cint, cstring1 from alltypesorc limit 2;
+insert into table vsmb_bucket_TXT select cint, cstring1 from alltypesorc limit 2;
+
+set hive.vectorized.execution.enabled=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+set hive.auto.convert.sortmerge.join.noconditionaltask = true;
+set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+
+explain
+select /*+MAPJOIN(a)*/ * from vsmb_bucket_1 a join vsmb_bucket_2 b on a.key = b.key;
+select /*+MAPJOIN(a)*/ * from vsmb_bucket_1 a join vsmb_bucket_2 b on a.key = b.key;
+
+explain
+select /*+MAPJOIN(b)*/ * from vsmb_bucket_1 a join vsmb_bucket_RC b on a.key = b.key;
+select /*+MAPJOIN(b)*/ * from vsmb_bucket_1 a join vsmb_bucket_RC b on a.key = b.key;
+
+-- RC file does not yet provide the vectorized CommonRCFileformat out-of-the-box
+-- explain
+-- select /*+MAPJOIN(b)*/ * from vsmb_bucket_RC a join vsmb_bucket_2 b on a.key = b.key;
+-- select /*+MAPJOIN(b)*/ * from vsmb_bucket_RC a join vsmb_bucket_2 b on a.key = b.key;
+
+explain
+select /*+MAPJOIN(b)*/ * from vsmb_bucket_1 a join vsmb_bucket_TXT b on a.key = b.key;
+select /*+MAPJOIN(b)*/ * from vsmb_bucket_1 a join vsmb_bucket_TXT b on a.key = b.key;
diff --git ql/src/test/results/clientpositive/vectorized_bucketmapjoin1.q.out ql/src/test/results/clientpositive/vectorized_bucketmapjoin1.q.out
new file mode 100644
index 0000000..ba4dea5
--- /dev/null
+++ ql/src/test/results/clientpositive/vectorized_bucketmapjoin1.q.out
@@ -0,0 +1,370 @@
+PREHOOK: query: create table vsmb_bucket_1(key int, value string)
+  CLUSTERED BY (key)
+  SORTED BY (key) INTO 1 BUCKETS
+  STORED AS ORC
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table vsmb_bucket_1(key int, value string)
+  CLUSTERED BY (key)
+  SORTED BY (key) INTO 1 BUCKETS
+  STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@vsmb_bucket_1
+PREHOOK: query: create table vsmb_bucket_2(key int, value string)
+  CLUSTERED BY (key)
+  SORTED BY (key) INTO 1 BUCKETS
+  STORED AS ORC
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table vsmb_bucket_2(key int, value string)
+  CLUSTERED BY (key)
+  SORTED BY (key) INTO 1 BUCKETS
+  STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@vsmb_bucket_2
+PREHOOK: query: create table vsmb_bucket_RC(key int, value string)
+  CLUSTERED BY (key)
+  SORTED BY (key) INTO 1 BUCKETS
+  STORED AS RCFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table vsmb_bucket_RC(key int, value string)
+  CLUSTERED BY (key)
+  SORTED BY (key) INTO 1 BUCKETS
+  STORED AS RCFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@vsmb_bucket_RC
+PREHOOK: query: create table vsmb_bucket_TXT(key int, value string)
+  CLUSTERED BY (key)
+  SORTED BY (key) INTO 1 BUCKETS
+  STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table vsmb_bucket_TXT(key int, value string)
+  CLUSTERED BY (key)
+  SORTED BY (key) INTO 1 BUCKETS
+  STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@vsmb_bucket_TXT
+PREHOOK: query: insert into table vsmb_bucket_1 select cint, cstring1 from alltypesorc limit 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+PREHOOK: Output: default@vsmb_bucket_1
+POSTHOOK: query: insert into table vsmb_bucket_1 select cint, cstring1 from alltypesorc limit 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+POSTHOOK: Output: default@vsmb_bucket_1
+POSTHOOK: Lineage: vsmb_bucket_1.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_1.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+PREHOOK: query: insert into table vsmb_bucket_2 select cint, cstring1 from alltypesorc limit 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+PREHOOK: Output: default@vsmb_bucket_2
+POSTHOOK: query: insert into table vsmb_bucket_2 select cint, cstring1 from alltypesorc limit 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+POSTHOOK: Output: default@vsmb_bucket_2
+POSTHOOK: Lineage: vsmb_bucket_1.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_1.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+PREHOOK: query: insert into table vsmb_bucket_RC select cint, cstring1 from alltypesorc limit 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+PREHOOK: Output: default@vsmb_bucket_rc
+POSTHOOK: query: insert into table vsmb_bucket_RC select cint, cstring1 from alltypesorc limit 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+POSTHOOK: Output: default@vsmb_bucket_rc
+POSTHOOK: Lineage: vsmb_bucket_1.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_1.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+PREHOOK: query: insert into table vsmb_bucket_TXT select cint, cstring1 from alltypesorc limit 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+PREHOOK: Output: default@vsmb_bucket_txt
+POSTHOOK: query: insert into table vsmb_bucket_TXT select cint, cstring1 from alltypesorc limit 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+POSTHOOK: Output: default@vsmb_bucket_txt
+POSTHOOK: Lineage: vsmb_bucket_1.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_1.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+PREHOOK: query: explain
+select /*+MAPJOIN(a)*/ * from vsmb_bucket_1 a join vsmb_bucket_2 b on a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select /*+MAPJOIN(a)*/ * from vsmb_bucket_1 a join vsmb_bucket_2 b on a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: vsmb_bucket_1.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_1.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME vsmb_bucket_1) a) (TOK_TABREF (TOK_TABNAME vsmb_bucket_2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        b
+          TableScan
+            alias: b
+            Sorted Merge Bucket Map Join Operator
+              condition map:
+                   Inner Join 0 to 1
+              condition expressions:
+                0 {key} {value}
+                1 {key} {value}
+              handleSkewJoin: false
+              keys:
+                0 [Column[key]]
+                1 [Column[key]]
+              outputColumnNames: _col0, _col1, _col4, _col5
+              Position of Big Table: 1
+              Vectorized execution: true
+              Select Operator
+                expressions:
+                      expr: _col0
+                      type: int
+                      expr: _col1
+                      type: string
+                      expr: _col4
+                      type: int
+                      expr: _col5
+                      type: string
+                outputColumnNames: _col0, _col1, _col2, _col3
+                Vectorized execution: true
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  Vectorized execution: true
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+PREHOOK: query: select /*+MAPJOIN(a)*/ * from vsmb_bucket_1 a join vsmb_bucket_2 b on a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@vsmb_bucket_1
+PREHOOK: Input: default@vsmb_bucket_2
+#### A masked pattern was here ####
+POSTHOOK: query: select /*+MAPJOIN(a)*/ * from vsmb_bucket_1 a join vsmb_bucket_2 b on a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@vsmb_bucket_1
+POSTHOOK: Input: default@vsmb_bucket_2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: vsmb_bucket_1.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_1.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+528534767	cvLH6Eat2yFsyy7p	528534767	cvLH6Eat2yFsyy7p
+528534767	cvLH6Eat2yFsyy7p	528534767	cvLH6Eat2yFsyy7p
+528534767	cvLH6Eat2yFsyy7p	528534767	cvLH6Eat2yFsyy7p
+528534767	cvLH6Eat2yFsyy7p	528534767	cvLH6Eat2yFsyy7p
+PREHOOK: query: explain
+select /*+MAPJOIN(b)*/ * from vsmb_bucket_1 a join vsmb_bucket_RC b on a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select /*+MAPJOIN(b)*/ * from vsmb_bucket_1 a join vsmb_bucket_RC b on a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: vsmb_bucket_1.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_1.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME vsmb_bucket_1) a) (TOK_TABREF (TOK_TABNAME vsmb_bucket_RC) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST b))) (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        a
+          TableScan
+            alias: a
+            Sorted Merge Bucket Map Join Operator
+              condition map:
+                   Inner Join 0 to 1
+              condition expressions:
+                0 {key} {value}
+                1 {key} {value}
+              handleSkewJoin: false
+              keys:
+                0 [Column[key]]
+                1 [Column[key]]
+              outputColumnNames: _col0, _col1, _col4, _col5
+              Position of Big Table: 0
+              Vectorized execution: true
+              Select Operator
+                expressions:
+                      expr: _col0
+                      type: int
+                      expr: _col1
+                      type: string
+                      expr: _col4
+                      type: int
+                      expr: _col5
+                      type: string
+                outputColumnNames: _col0, _col1, _col2, _col3
+                Vectorized execution: true
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  Vectorized execution: true
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+PREHOOK: query: select /*+MAPJOIN(b)*/ * from vsmb_bucket_1 a join vsmb_bucket_RC b on a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@vsmb_bucket_1
+PREHOOK: Input: default@vsmb_bucket_rc
+#### A masked pattern was here ####
+POSTHOOK: query: select /*+MAPJOIN(b)*/ * from vsmb_bucket_1 a join vsmb_bucket_RC b on a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@vsmb_bucket_1
+POSTHOOK: Input: default@vsmb_bucket_rc
+#### A masked pattern was here ####
+POSTHOOK: Lineage: vsmb_bucket_1.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_1.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+528534767	cvLH6Eat2yFsyy7p	528534767	cvLH6Eat2yFsyy7p
+528534767	cvLH6Eat2yFsyy7p	528534767	cvLH6Eat2yFsyy7p
+528534767	cvLH6Eat2yFsyy7p	528534767	cvLH6Eat2yFsyy7p
+528534767	cvLH6Eat2yFsyy7p	528534767	cvLH6Eat2yFsyy7p
+PREHOOK: query: -- RC file does not yet provide the vectorized CommonRCFileformat out-of-the-box
+-- explain
+-- select /*+MAPJOIN(b)*/ * from vsmb_bucket_RC a join vsmb_bucket_2 b on a.key = b.key;
+-- select /*+MAPJOIN(b)*/ * from vsmb_bucket_RC a join vsmb_bucket_2 b on a.key = b.key;
+
+explain
+select /*+MAPJOIN(b)*/ * from vsmb_bucket_1 a join vsmb_bucket_TXT b on a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- RC file does not yet provide the vectorized CommonRCFileformat out-of-the-box
+-- explain
+-- select /*+MAPJOIN(b)*/ * from vsmb_bucket_RC a join vsmb_bucket_2 b on a.key = b.key;
+-- select /*+MAPJOIN(b)*/ * from vsmb_bucket_RC a join vsmb_bucket_2 b on a.key = b.key;
+
+explain
+select /*+MAPJOIN(b)*/ * from vsmb_bucket_1 a join vsmb_bucket_TXT b on a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: vsmb_bucket_1.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_1.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME vsmb_bucket_1) a) (TOK_TABREF (TOK_TABNAME vsmb_bucket_TXT) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST b))) (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        a
+          TableScan
+            alias: a
+            Sorted Merge Bucket Map Join Operator
+              condition map:
+                   Inner Join 0 to 1
+              condition expressions:
+                0 {key} {value}
+                1 {key} {value}
+              handleSkewJoin: false
+              keys:
+                0 [Column[key]]
+                1 [Column[key]]
+              outputColumnNames: _col0, _col1, _col4, _col5
+              Position of Big Table: 0
+              Vectorized execution: true
+              Select Operator
+                expressions:
+                      expr: _col0
+                      type: int
+                      expr: _col1
+                      type: string
+                      expr: _col4
+                      type: int
+                      expr: _col5
+                      type: string
+                outputColumnNames: _col0, _col1, _col2, _col3
+                Vectorized execution: true
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  Vectorized execution: true
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+PREHOOK: query: select /*+MAPJOIN(b)*/ * from vsmb_bucket_1 a join vsmb_bucket_TXT b on a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@vsmb_bucket_1
+PREHOOK: Input: default@vsmb_bucket_txt
+#### A masked pattern was here ####
+POSTHOOK: query: select /*+MAPJOIN(b)*/ * from vsmb_bucket_1 a join vsmb_bucket_TXT b on a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@vsmb_bucket_1
+POSTHOOK: Input: default@vsmb_bucket_txt
+#### A masked pattern was here ####
+POSTHOOK: Lineage: vsmb_bucket_1.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_1.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_2.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_rc.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.key SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: vsmb_bucket_txt.value SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:from deserializer), ]
+528534767	cvLH6Eat2yFsyy7p	528534767	cvLH6Eat2yFsyy7p
+528534767	cvLH6Eat2yFsyy7p	528534767	cvLH6Eat2yFsyy7p
+528534767	cvLH6Eat2yFsyy7p	528534767	cvLH6Eat2yFsyy7p
+528534767	cvLH6Eat2yFsyy7p	528534767	cvLH6Eat2yFsyy7p