diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java index 1e24710..e3de4b4 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java @@ -23,6 +23,8 @@ import java.util.LinkedList; import java.util.List; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveVarchar; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; @@ -50,6 +52,7 @@ import org.apache.hadoop.io.Text; public class VectorizedBatchUtil { + private static final Log LOG = LogFactory.getLog(VectorizedBatchUtil.class); /** * Sets the IsNull value for ColumnVector at specified index @@ -242,6 +245,15 @@ public static void addRowToBatchFrom(Object row, StructObjectInspector oi, PrimitiveObjectInspector poi = (PrimitiveObjectInspector) foi; Object writableCol = poi.getPrimitiveWritableObject(fieldData); + if (batch.cols[off + i] == null) { + // This means the column was not included in the projection from the underlying read + continue; + } + if (batch.isPartitionCol[off + i]) { + // The value will have already been set before we're called, so don't overwrite it + continue; + } + // NOTE: The default value for null fields in vectorization is 1 for int types, NaN for // float/double. String types have no default value for null. switch (poi.getPrimitiveCategory()) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java index 4364572..205d3be 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java @@ -45,6 +45,11 @@ public int[] projectedColumns; public int projectionSize; + /** + * Track which columns in this row batch are partition columns. + */ + public boolean[] isPartitionCol; + /* * If no filtering has been applied yet, selectedInUse is false, * meaning that all rows qualify. If it is true, then the selected[] array @@ -94,6 +99,7 @@ public VectorizedRowBatch(int numCols, int size) { for (int i = 0; i < numCols; i++) { projectedColumns[i] = i; } + isPartitionCol = new boolean[numCols]; } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java index 22f5f5e..1ba039a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java @@ -286,10 +286,13 @@ public VectorizedRowBatch createVectorizedRowBatch() throws HiveException // If the column is included in the include list or if the column is a // partition column then create the column vector. Also note that partition columns are not // in the included list. - if ((colsToInclude == null) || colsToInclude.contains(j) - || ((partitionValues != null) && - partitionValues.containsKey(fieldRefs.get(j).getFieldName()))) { + boolean isPartCol = partitionValues != null && + partitionValues.containsKey(fieldRefs.get(j).getFieldName()); + if ((colsToInclude == null) || colsToInclude.contains(j) || isPartCol) { ObjectInspector foi = fieldRefs.get(j).getFieldObjectInspector(); + if (isPartCol) { + result.isPartitionCol[j] = true; + } switch (foi.getCategory()) { case PRIMITIVE: { PrimitiveObjectInspector poi = (PrimitiveObjectInspector) foi; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java index ca90fc5..abfe60c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java @@ -48,7 +48,6 @@ private final OrcStruct value; private final VectorizedRowBatchCtx rowBatchCtx; private final ObjectInspector objectInspector; - private boolean needToSetPartition = true; private final DataOutputBuffer buffer = new DataOutputBuffer(); VectorizedOrcAcidRowReader(AcidInputFormat.RowReader inner, @@ -83,13 +82,10 @@ public boolean next(NullWritable nullWritable, if (!innerReader.next(key, value)) { return false; } - if (needToSetPartition) { - try { - rowBatchCtx.addPartitionColsToBatch(vectorizedRowBatch); - } catch (HiveException e) { - throw new IOException("Problem adding partition column", e); - } - needToSetPartition = false; + try { + rowBatchCtx.addPartitionColsToBatch(vectorizedRowBatch); + } catch (HiveException e) { + throw new IOException("Problem adding partition column", e); } try { VectorizedBatchUtil.addRowToBatch(value, diff --git ql/src/test/queries/clientpositive/acid_vectorization_partition.q ql/src/test/queries/clientpositive/acid_vectorization_partition.q new file mode 100644 index 0000000..9348d05 --- /dev/null +++ ql/src/test/queries/clientpositive/acid_vectorization_partition.q @@ -0,0 +1,10 @@ +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.enforce.bucketing=true; +set hive.exec.dynamic.partition.mode=nonstrict; + +CREATE TABLE acid_vectorized_part(a INT, b STRING) partitioned by (ds string) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true'); +insert into table acid_vectorized_part partition (ds = 'today') select cint, cstring1 from alltypesorc where cint is not null order by cint limit 10; +insert into table acid_vectorized_part partition (ds = 'tomorrow') select cint, cstring1 from alltypesorc where cint is not null order by cint limit 10; +set hive.vectorized.execution.enabled=true; +select * from acid_vectorized_part order by a, b; diff --git ql/src/test/queries/clientpositive/acid_vectorization_project.q ql/src/test/queries/clientpositive/acid_vectorization_project.q new file mode 100644 index 0000000..a44b57a --- /dev/null +++ ql/src/test/queries/clientpositive/acid_vectorization_project.q @@ -0,0 +1,11 @@ +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.enforce.bucketing=true; +set hive.exec.dynamic.partition.mode=nonstrict; + +CREATE TABLE acid_vectorized(a INT, b STRING, c float) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true'); +insert into table acid_vectorized select cint, cstring1, cfloat from alltypesorc where cint is not null order by cint limit 10; +set hive.vectorized.execution.enabled=true; +select a,b from acid_vectorized order by a; +select a,c from acid_vectorized order by a; +select b,c from acid_vectorized order by b; diff --git ql/src/test/results/clientpositive/acid_vectorization_partition.q.out ql/src/test/results/clientpositive/acid_vectorization_partition.q.out new file mode 100644 index 0000000..ee97cc9 --- /dev/null +++ ql/src/test/results/clientpositive/acid_vectorization_partition.q.out @@ -0,0 +1,60 @@ +PREHOOK: query: CREATE TABLE acid_vectorized_part(a INT, b STRING) partitioned by (ds string) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_vectorized_part +POSTHOOK: query: CREATE TABLE acid_vectorized_part(a INT, b STRING) partitioned by (ds string) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_vectorized_part +PREHOOK: query: insert into table acid_vectorized_part partition (ds = 'today') select cint, cstring1 from alltypesorc where cint is not null order by cint limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +PREHOOK: Output: default@acid_vectorized_part@ds=today +POSTHOOK: query: insert into table acid_vectorized_part partition (ds = 'today') select cint, cstring1 from alltypesorc where cint is not null order by cint limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +POSTHOOK: Output: default@acid_vectorized_part@ds=today +POSTHOOK: Lineage: acid_vectorized_part PARTITION(ds=today).a SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ] +POSTHOOK: Lineage: acid_vectorized_part PARTITION(ds=today).b SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ] +PREHOOK: query: insert into table acid_vectorized_part partition (ds = 'tomorrow') select cint, cstring1 from alltypesorc where cint is not null order by cint limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +PREHOOK: Output: default@acid_vectorized_part@ds=tomorrow +POSTHOOK: query: insert into table acid_vectorized_part partition (ds = 'tomorrow') select cint, cstring1 from alltypesorc where cint is not null order by cint limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +POSTHOOK: Output: default@acid_vectorized_part@ds=tomorrow +POSTHOOK: Lineage: acid_vectorized_part PARTITION(ds=tomorrow).a SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ] +POSTHOOK: Lineage: acid_vectorized_part PARTITION(ds=tomorrow).b SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ] +PREHOOK: query: select * from acid_vectorized_part order by a, b +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_vectorized_part +PREHOOK: Input: default@acid_vectorized_part@ds=today +PREHOOK: Input: default@acid_vectorized_part@ds=tomorrow +#### A masked pattern was here #### +POSTHOOK: query: select * from acid_vectorized_part order by a, b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_vectorized_part +POSTHOOK: Input: default@acid_vectorized_part@ds=today +POSTHOOK: Input: default@acid_vectorized_part@ds=tomorrow +#### A masked pattern was here #### +-1073279343 oj1YrV5Wa today +-1073279343 oj1YrV5Wa tomorrow +-1073051226 A34p7oRr2WvUJNf tomorrow +-1073051226 A34p7oRr2WvUJNf today +-1072910839 0iqrc5 tomorrow +-1072910839 0iqrc5 today +-1072081801 dPkN74F7 today +-1072081801 dPkN74F7 tomorrow +-1072076362 2uLyD28144vklju213J1mr today +-1072076362 2uLyD28144vklju213J1mr tomorrow +-1071480828 aw724t8c5558x2xneC624 tomorrow +-1071480828 aw724t8c5558x2xneC624 today +-1071363017 Anj0oF today +-1071363017 Anj0oF tomorrow +-1070883071 0ruyd6Y50JpdGRf6HqD tomorrow +-1070883071 0ruyd6Y50JpdGRf6HqD today +-1070551679 iUR3Q today +-1070551679 iUR3Q tomorrow +-1069736047 k17Am8uPHWk02cEf1jet tomorrow +-1069736047 k17Am8uPHWk02cEf1jet today diff --git ql/src/test/results/clientpositive/acid_vectorization_project.q.out ql/src/test/results/clientpositive/acid_vectorization_project.q.out new file mode 100644 index 0000000..1bdacb9 --- /dev/null +++ ql/src/test/results/clientpositive/acid_vectorization_project.q.out @@ -0,0 +1,73 @@ +PREHOOK: query: CREATE TABLE acid_vectorized(a INT, b STRING, c float) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_vectorized +POSTHOOK: query: CREATE TABLE acid_vectorized(a INT, b STRING, c float) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_vectorized +PREHOOK: query: insert into table acid_vectorized select cint, cstring1, cfloat from alltypesorc where cint is not null order by cint limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +PREHOOK: Output: default@acid_vectorized +POSTHOOK: query: insert into table acid_vectorized select cint, cstring1, cfloat from alltypesorc where cint is not null order by cint limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +POSTHOOK: Output: default@acid_vectorized +POSTHOOK: Lineage: acid_vectorized.a SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ] +POSTHOOK: Lineage: acid_vectorized.b SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ] +POSTHOOK: Lineage: acid_vectorized.c SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cfloat, type:float, comment:null), ] +PREHOOK: query: select a,b from acid_vectorized order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_vectorized +#### A masked pattern was here #### +POSTHOOK: query: select a,b from acid_vectorized order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_vectorized +#### A masked pattern was here #### +-1073279343 oj1YrV5Wa +-1073051226 A34p7oRr2WvUJNf +-1072910839 0iqrc5 +-1072081801 dPkN74F7 +-1072076362 2uLyD28144vklju213J1mr +-1071480828 aw724t8c5558x2xneC624 +-1071363017 Anj0oF +-1070883071 0ruyd6Y50JpdGRf6HqD +-1070551679 iUR3Q +-1069736047 k17Am8uPHWk02cEf1jet +PREHOOK: query: select a,c from acid_vectorized order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_vectorized +#### A masked pattern was here #### +POSTHOOK: query: select a,c from acid_vectorized order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_vectorized +#### A masked pattern was here #### +-1073279343 11.0 +-1073051226 NULL +-1072910839 11.0 +-1072081801 NULL +-1072076362 NULL +-1071480828 -51.0 +-1071363017 8.0 +-1070883071 NULL +-1070551679 NULL +-1069736047 11.0 +PREHOOK: query: select b,c from acid_vectorized order by b +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_vectorized +#### A masked pattern was here #### +POSTHOOK: query: select b,c from acid_vectorized order by b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_vectorized +#### A masked pattern was here #### +0iqrc5 11.0 +0ruyd6Y50JpdGRf6HqD NULL +2uLyD28144vklju213J1mr NULL +A34p7oRr2WvUJNf NULL +Anj0oF 8.0 +aw724t8c5558x2xneC624 -51.0 +dPkN74F7 NULL +iUR3Q NULL +k17Am8uPHWk02cEf1jet 11.0 +oj1YrV5Wa 11.0