diff --git common/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java common/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java
index 8101be9..ad09015 100644
--- common/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java
+++ common/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java
@@ -273,13 +273,4 @@ public static BigDecimal enforcePrecisionScale(BigDecimal bd, int maxPrecision,
 
     return bd;
   }
-
-  /**
-   * Sets the {@link BigDecimal} value in this object.
-   * @param bigDecimal
-   */
-  public void setNormalize(BigDecimal bigDecimal) {
-    BigDecimal value = normalize(bigDecimal, true);
-    this.bd = value;
-  }
 }
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSelectOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSelectOperator.java
index 8c66624..a65e594 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSelectOperator.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSelectOperator.java
@@ -38,7 +38,7 @@
 /**
  * Select operator implementation.
  */
-public class VectorSelectOperator extends SelectOperator {
+public class VectorSelectOperator extends SelectOperator implements VectorizationContextRegion {
 
   private static final long serialVersionUID = 1L;
 
@@ -48,6 +48,9 @@
 
   private transient VectorExpressionWriter [] valueWriters = null;
 
+  // Create a new outgoing vectorization context because the column name map will change.
+  private VectorizationContext vOutContext;
+
   public VectorSelectOperator(VectorizationContext vContext, OperatorDesc conf)
       throws HiveException {
     this.conf = (SelectDesc) conf;
@@ -59,14 +62,21 @@ public VectorSelectOperator(VectorizationContext vContext, OperatorDesc conf)
       vExpressions[i] = ve;
     }
 
-    Map<String, Integer> cMap = vContext.getColumnMap();
+    /**
+     * Create a new vectorization context to update the column map, but the same output column
+     * manager must be inherited to track the scratch columns.
+     */
+    vOutContext = new VectorizationContext(vContext);
+
+    // Set a fileKey, although this operator doesn't use it.
+    vOutContext.setFileKey(vContext.getFileKey() + "/_SELECT_");
+
+    // Update column map
+    vOutContext.getColumnMap().clear();
     for (int i=0; i < colList.size(); ++i) {
       String columnName = this.conf.getOutputColumnNames().get(i);
-      if (!cMap.containsKey(columnName)) {
-        VectorExpression ve = vExpressions[i];
-        // Update column map with output column names
-        vContext.addToColumnMap(columnName, ve.getOutputColumn());
-      }
+      VectorExpression ve = vExpressions[i];
+      vOutContext.addToColumnMap(columnName, ve.getOutputColumn());
     }
   }
 
@@ -153,4 +163,9 @@ public void setvExpressions(VectorExpression[] vExpressions) {
   public void setVExpressions(VectorExpression[] vExpressions) {
     this.vExpressions = vExpressions;
   }
+
+  @Override
+  public VectorizationContext getOuputVectorizationContext() {
+    return vOutContext;
+  }
 }
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
index eb8c4c5..70a236d 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
@@ -120,6 +120,11 @@
 
   //Map column number to type
   private final OutputColumnManager ocm;
+  // File key is used by operators to retrieve the scratch vectors
+  // from mapWork at runtime. Operators that modify the structure of
+  // a vector row batch need to allocate scratch vectors as well. Every
+  // operator that creates a new VectorizationContext should set a unique
+  // fileKey.
   private String fileKey = null;
 
   // Set of UDF classes for type casting data types in row-mode.
@@ -151,6 +156,19 @@ public VectorizationContext(Map<String, Integer> columnMap,
     vMap = new VectorExpressionDescriptor();
   }
 
+  /**
+   * This constructor inherits the OutputColumnManager from the 'parent' context;
+   * therefore it should be used only by operators that don't create a new
+   * vectorized row batch, i.e. operators that only want to modify the column name
+   * map without changing the row batch.
+   */
+  public VectorizationContext(VectorizationContext parent) {
+    this.columnMap = new HashMap<String, Integer>(parent.columnMap);
+    this.ocm = parent.ocm;
+    this.firstOutputColumnIndex = parent.firstOutputColumnIndex;
+    vMap = new VectorExpressionDescriptor();
+  }
+
   public String getFileKey() {
     return fileKey;
   }
@@ -170,7 +188,7 @@ protected int getInputColumnIndex(ExprNodeColumnDesc colExpr) {
     return columnMap.get(colExpr.getColumn());
   }
 
-  private class OutputColumnManager {
+  private static class OutputColumnManager {
     private final int initialOutputCol;
     private int outputColCount = 0;
 
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpressionWriterFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpressionWriterFactory.java
index 886154d..ae65c30 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpressionWriterFactory.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpressionWriterFactory.java
@@ -447,28 +447,24 @@ private static VectorExpressionWriter genVectorExpressionWritableDecimal(
       SettableHiveDecimalObjectInspector fieldObjInspector) throws HiveException {
     return new VectorExpressionWriterDecimal() {
-      private HiveDecimal hd;
       private Object obj;
 
       public VectorExpressionWriter init(SettableHiveDecimalObjectInspector objInspector)
          throws HiveException {
         super.init(objInspector);
-        hd = HiveDecimal.create(BigDecimal.ZERO);
         obj = initValue(null);
         return this;
       }
 
       @Override
       public Object writeValue(Decimal128 value) throws HiveException {
-        hd.setNormalize(value.toBigDecimal());
-        ((SettableHiveDecimalObjectInspector) this.objectInspector).set(obj, hd);
-        return obj;
+        return ((SettableHiveDecimalObjectInspector) this.objectInspector).set(obj,
+            HiveDecimal.create(value.toBigDecimal()));
       }
 
       @Override
       public Object setValue(Object field, Decimal128 value) {
-        hd.setNormalize(value.toBigDecimal());
-        ((SettableHiveDecimalObjectInspector) this.objectInspector).set(field, hd);
-        return field;
+        return ((SettableHiveDecimalObjectInspector) this.objectInspector).set(field,
+            HiveDecimal.create(value.toBigDecimal()));
       }
 
       @Override
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFYearLong.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFYearLong.java
index bb79cfd..41c9d5b 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFYearLong.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFYearLong.java
@@ -29,16 +29,16 @@
   private static final long serialVersionUID = 1L;
 
   /* year boundaries in nanoseconds */
-  static transient final long[] YEAR_BOUNDARIES;
-  static transient final int MIN_YEAR = 1901;
-  static transient final int MAX_YEAR = 2038;
+  private static transient final long[] YEAR_BOUNDARIES;
+  private static transient final int MIN_YEAR = 1678;
+  private static transient final int MAX_YEAR = 2300;
 
   static {
     YEAR_BOUNDARIES = new long[MAX_YEAR-MIN_YEAR];
     Calendar c = Calendar.getInstance();
     c.setTimeInMillis(0); // c.set doesn't reset millis
     /* 1901 Jan is not with in range */
-    for(int year=MIN_YEAR+1; year <= 2038; year++) {
+    for(int year=MIN_YEAR+1; year <= MAX_YEAR; year++) {
       c.set(year, Calendar.JANUARY, 1, 0, 0, 0);
       YEAR_BOUNDARIES[year-MIN_YEAR-1] = c.getTimeInMillis()*1000*1000;
     }
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 225e91b..c2240c0 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -33,7 +33,6 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.ql.exec.*;
 import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
 import org.apache.hadoop.hive.ql.exec.tez.TezTask;
@@ -53,11 +52,7 @@
 import org.apache.hadoop.hive.ql.lib.RuleRegExp;
 import org.apache.hadoop.hive.ql.lib.TaskGraphWalker;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
-import org.apache.hadoop.hive.ql.metadata.Partition;
-import org.apache.hadoop.hive.ql.metadata.Table;
 import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
-import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
-import org.apache.hadoop.hive.ql.parse.RowResolver;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.plan.AbstractOperatorDesc;
 import org.apache.hadoop.hive.ql.plan.AggregationDesc;
@@ -378,7 +373,9 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
   class VectorizationNodeProcessor implements NodeProcessor {
 
     private final MapWork mWork;
-    private final Map<String, VectorizationContext> vectorizationContexts =
+
+    // This is used to extract scratch column types for each file key
+    private final Map<String, VectorizationContext> scratchColumnContext =
         new HashMap<String, VectorizationContext>();
 
     private final Map<Operator<? extends OperatorDesc>, VectorizationContext> vContextsByTSOp =
@@ -394,8 +391,8 @@ public VectorizationNodeProcessor(MapWork mWork) {
     public Map<String, Map<Integer, String>> getScratchColumnVectorTypes() {
       Map<String, Map<Integer, String>> scratchColumnVectorTypes =
           new HashMap<String, Map<Integer, String>>();
-      for (String onefile : vectorizationContexts.keySet()) {
-        VectorizationContext vc = vectorizationContexts.get(onefile);
+      for (String onefile : scratchColumnContext.keySet()) {
+        VectorizationContext vc = scratchColumnContext.get(onefile);
         Map<Integer, String> cmap = vc.getOutputColumnTypeMap();
         scratchColumnVectorTypes.put(onefile, cmap);
       }
@@ -405,8 +402,8 @@ public VectorizationNodeProcessor(MapWork mWork) {
     public Map<String, Map<String, Integer>> getScratchColumnMap() {
       Map<String, Map<String, Integer>> scratchColumnMap =
          new HashMap<String, Map<String, Integer>>();
-      for(String oneFile: vectorizationContexts.keySet()) {
-        VectorizationContext vc = vectorizationContexts.get(oneFile);
+      for(String oneFile: scratchColumnContext.keySet()) {
+        VectorizationContext vc = scratchColumnContext.get(oneFile);
         Map<String, Integer> cmap = vc.getColumnMap();
         scratchColumnMap.put(oneFile, cmap);
       }
@@ -433,7 +430,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
             // Each partition gets a copy
            //
             vContext.setFileKey(onefile);
-            vectorizationContexts.put(onefile, vContext);
+            scratchColumnContext.put(onefile, vContext);
             break;
           }
         }
@@ -472,7 +469,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
           VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp;
           VectorizationContext vOutContext = vcRegion.getOuputVectorizationContext();
           vContextsByTSOp.put(op, vOutContext);
-          vectorizationContexts.put(vOutContext.getFileKey(), vOutContext);
+          scratchColumnContext.put(vOutContext.getFileKey(), vOutContext);
         }
       }
     } catch (HiveException e) {
diff --git ql/src/test/queries/clientpositive/vectorized_nested_mapjoin.q ql/src/test/queries/clientpositive/vectorized_nested_mapjoin.q
new file mode 100644
index 0000000..ce4227c
--- /dev/null
+++ ql/src/test/queries/clientpositive/vectorized_nested_mapjoin.q
@@ -0,0 +1,8 @@
+SET hive.vectorized.execution.enabled=true;
+SET hive.auto.convert.join=true;
+SET hive.auto.convert.join.nonconditionaltask=true;
+SET hive.auto.convert.join.nonconditionaltask.size=1000000000;
+
+explain select sum(t1.td) from (select v1.csmallint as tsi, v1.cdouble as td from alltypesorc v1, alltypesorc v2 where v1.ctinyint=v2.ctinyint) t1 join alltypesorc v3 on t1.tsi=v3.csmallint;
+
+select sum(t1.td) from (select v1.csmallint as tsi, v1.cdouble as td from alltypesorc v1, alltypesorc v2 where v1.ctinyint=v2.ctinyint) t1 join alltypesorc v3 on t1.tsi=v3.csmallint;
diff --git ql/src/test/results/clientpositive/vectorized_nested_mapjoin.q.out ql/src/test/results/clientpositive/vectorized_nested_mapjoin.q.out
new file mode 100644
index 0000000..c76fc15
--- /dev/null
+++ ql/src/test/results/clientpositive/vectorized_nested_mapjoin.q.out
@@ -0,0 +1,125 @@
+PREHOOK: query: explain select sum(t1.td) from (select v1.csmallint as tsi, v1.cdouble as td from alltypesorc v1, alltypesorc v2 where v1.ctinyint=v2.ctinyint) t1 join alltypesorc v3 on t1.tsi=v3.csmallint
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select sum(t1.td) from (select v1.csmallint as tsi, v1.cdouble as td from alltypesorc v1, alltypesorc v2 where v1.ctinyint=v2.ctinyint) t1 join alltypesorc v3 on t1.tsi=v3.csmallint
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-7 is a root stage
+  Stage-2 depends on stages: Stage-7
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-7
+    Map Reduce Local Work
+      Alias -> Map Local Tables:
+        t1:v1 
+          Fetch Operator
+            limit: -1
+        v3 
+          Fetch Operator
+            limit: -1
+      Alias -> Map Local Operator Tree:
+        t1:v1 
+          TableScan
+            alias: v1
+            Statistics: Num rows: 23577 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+            HashTable Sink Operator
+              condition expressions:
+                0 {ctinyint} {csmallint} {cdouble}
+                1 {ctinyint}
+              keys:
+                0 ctinyint (type: tinyint)
+                1 ctinyint (type: tinyint)
+        v3 
+          TableScan
+            alias: v3
+            Statistics: Num rows: 94309 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+            HashTable Sink Operator
+              condition expressions:
+                0 {_col1}
+                1 
+              keys:
+                0 _col0 (type: smallint)
+                1 csmallint (type: smallint)
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: v2
+            Statistics: Num rows: 94309 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+            Map Join Operator
+              condition map:
+                   Inner Join 0 to 1
+              condition expressions:
+                0 {ctinyint} {csmallint} {cdouble}
+                1 {ctinyint}
+              keys:
+                0 ctinyint (type: tinyint)
+                1 ctinyint (type: tinyint)
+              outputColumnNames: _col0, _col1, _col5, _col14
+              Statistics: Num rows: 103739 Data size: 414960 Basic stats: COMPLETE Column stats: NONE
+              Filter Operator
+                predicate: (_col0 = _col14) (type: boolean)
+                Statistics: Num rows: 51869 Data size: 207477 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: _col1 (type: smallint), _col5 (type: double)
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 51869 Data size: 207477 Basic stats: COMPLETE Column stats: NONE
+                  Map Join Operator
+                    condition map:
+                         Inner Join 0 to 1
+                    condition expressions:
+                      0 {_col1}
+                      1 
+                    keys:
+                      0 _col0 (type: smallint)
+                      1 csmallint (type: smallint)
+                    outputColumnNames: _col1
+                    Statistics: Num rows: 103739 Data size: 414960 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: _col1 (type: double)
+                      outputColumnNames: _col1
+                      Statistics: Num rows: 103739 Data size: 414960 Basic stats: COMPLETE Column stats: NONE
+                      Group By Operator
+                        aggregations: sum(_col1)
+                        mode: hash
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                        Reduce Output Operator
+                          sort order: 
+                          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                          value expressions: _col0 (type: double)
+      Local Work:
+        Map Reduce Local Work
+      Execution mode: vectorized
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: sum(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+          Select Operator
+            expressions: _col0 (type: double)
+            outputColumnNames: _col0
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+PREHOOK: query: select sum(t1.td) from (select v1.csmallint as tsi, v1.cdouble as td from alltypesorc v1, alltypesorc v2 where v1.ctinyint=v2.ctinyint) t1 join alltypesorc v3 on t1.tsi=v3.csmallint
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: select sum(t1.td) from (select v1.csmallint as tsi, v1.cdouble as td from alltypesorc v1, alltypesorc v2 where v1.ctinyint=v2.ctinyint) t1 join alltypesorc v3 on t1.tsi=v3.csmallint
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+6.065190932486892E11