diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java index 1dde78e..d4e61d8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java @@ -92,8 +92,8 @@ protected transient ListBucketingCtx lbCtx; protected transient boolean isSkewedStoredAsSubDirectories; protected transient boolean statsCollectRawDataSize; - private transient boolean[] statsFromRecordWriter; - private transient boolean isCollectRWStats; + protected transient boolean[] statsFromRecordWriter; + protected transient boolean isCollectRWStats; private transient FSPaths prevFsp; private transient FSPaths fpaths; private transient ObjectInspector keyOI; @@ -626,7 +626,7 @@ public void processOp(Object row, int tag) throws HiveException { } } - private boolean areAllTrue(boolean[] statsFromRW) { + protected boolean areAllTrue(boolean[] statsFromRW) { for(boolean b : statsFromRW) { if (!b) { return false; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFileSinkOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFileSinkOperator.java index c6a7c00..e546dd1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFileSinkOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFileSinkOperator.java @@ -145,7 +145,11 @@ public void processOp(Object data, int tag) throws HiveException { } rowOutWriters = fpaths.getOutWriters(); - if (conf.isGatherStats()) { + // check if all record writers implement statistics. if atleast one RW + // doesn't implement stats interface we will fallback to conventional way + // of gathering stats + isCollectRWStats = areAllTrue(statsFromRecordWriter); + if (conf.isGatherStats() && !isCollectRWStats) { if (statsCollectRawDataSize) { SerDeStats stats = serializer.getSerDeStats(); if (stats != null) { diff --git ql/src/test/results/clientpositive/tez/vectorization_part_project.q.out ql/src/test/results/clientpositive/tez/vectorization_part_project.q.out index 8d28abd..d3e3923 100644 --- ql/src/test/results/clientpositive/tez/vectorization_part_project.q.out +++ ql/src/test/results/clientpositive/tez/vectorization_part_project.q.out @@ -65,28 +65,28 @@ STAGE PLANS: Map Operator Tree: TableScan alias: alltypesorc_part - Statistics: Num rows: 200 Data size: 4068 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 200 Data size: 41576 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: (cdouble + 2) (type: double) outputColumnNames: _col0 - Statistics: Num rows: 200 Data size: 4068 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 200 Data size: 41576 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: double) sort order: + - Statistics: Num rows: 200 Data size: 4068 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 200 Data size: 41576 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Reducer 2 Reduce Operator Tree: Select Operator expressions: KEY.reducesinkkey0 (type: double) outputColumnNames: _col0 - Statistics: Num rows: 200 Data size: 4068 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 200 Data size: 41576 Basic stats: COMPLETE Column stats: NONE Limit Number of rows: 10 - Statistics: Num rows: 10 Data size: 200 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 10 Data size: 2070 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 10 Data size: 200 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 10 Data size: 2070 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat diff --git ql/src/test/results/clientpositive/tez/vectorized_timestamp_funcs.q.out ql/src/test/results/clientpositive/tez/vectorized_timestamp_funcs.q.out index 8b39f9c..b8e46e9 100644 --- ql/src/test/results/clientpositive/tez/vectorized_timestamp_funcs.q.out +++ ql/src/test/results/clientpositive/tez/vectorized_timestamp_funcs.q.out @@ -106,15 +106,15 @@ STAGE PLANS: Map Operator Tree: TableScan alias: alltypesorc_string - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: to_unix_timestamp(ctimestamp1) (type: bigint), year(ctimestamp1) (type: int), month(ctimestamp1) (type: int), day(ctimestamp1) (type: int), dayofmonth(ctimestamp1) (type: int), weekofyear(ctimestamp1) (type: int), hour(ctimestamp1) (type: int), minute(ctimestamp1) (type: int), second(ctimestamp1) (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: bigint) sort order: + - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE value expressions: _col1 (type: int), _col2 (type: int), _col3 (type: int), _col4 (type: int), _col5 (type: int), _col6 (type: int), _col7 (type: int), _col8 (type: int) Execution mode: vectorized Reducer 2 @@ -122,10 +122,10 @@ STAGE PLANS: Select Operator expressions: KEY.reducesinkkey0 (type: bigint), VALUE._col0 (type: int), VALUE._col1 (type: int), VALUE._col2 (type: int), VALUE._col3 (type: int), VALUE._col4 (type: int), VALUE._col5 (type: int), VALUE._col6 (type: int), VALUE._col7 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -249,15 +249,15 @@ STAGE PLANS: Map Operator Tree: TableScan alias: alltypesorc_string - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: to_unix_timestamp(stimestamp1) (type: bigint), year(stimestamp1) (type: int), month(stimestamp1) (type: int), day(stimestamp1) (type: int), dayofmonth(stimestamp1) (type: int), weekofyear(stimestamp1) (type: int), hour(stimestamp1) (type: int), minute(stimestamp1) (type: int), second(stimestamp1) (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: bigint) sort order: + - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE value expressions: _col1 (type: int), _col2 (type: int), _col3 (type: int), _col4 (type: int), _col5 (type: int), _col6 (type: int), _col7 (type: int), _col8 (type: int) Execution mode: vectorized Reducer 2 @@ -265,10 +265,10 @@ STAGE PLANS: Select Operator expressions: KEY.reducesinkkey0 (type: bigint), VALUE._col0 (type: int), VALUE._col1 (type: int), VALUE._col2 (type: int), VALUE._col3 (type: int), VALUE._col4 (type: int), VALUE._col5 (type: int), VALUE._col6 (type: int), VALUE._col7 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -392,15 +392,15 @@ STAGE PLANS: Map Operator Tree: TableScan alias: alltypesorc_string - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: (to_unix_timestamp(ctimestamp1) = to_unix_timestamp(stimestamp1)) (type: boolean), (year(ctimestamp1) = year(stimestamp1)) (type: boolean), (month(ctimestamp1) = month(stimestamp1)) (type: boolean), (day(ctimestamp1) = day(stimestamp1)) (type: boolean), (dayofmonth(ctimestamp1) = dayofmonth(stimestamp1)) (type: boolean), (weekofyear(ctimestamp1) = weekofyear(stimestamp1)) (type: boolean), (hour(ctimestamp1) = hour(stimestamp1)) (type: boolean), (minute(ctimestamp1) = minute(stimestamp1)) (type: boolean), (second(ctimestamp1) = second(stimestamp1)) (type: boolean) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: boolean) sort order: + - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE value expressions: _col1 (type: boolean), _col2 (type: boolean), _col3 (type: boolean), _col4 (type: boolean), _col5 (type: boolean), _col6 (type: boolean), _col7 (type: boolean), _col8 (type: boolean) Execution mode: vectorized Reducer 2 @@ -408,10 +408,10 @@ STAGE PLANS: Select Operator expressions: KEY.reducesinkkey0 (type: boolean), VALUE._col0 (type: boolean), VALUE._col1 (type: boolean), VALUE._col2 (type: boolean), VALUE._col3 (type: boolean), VALUE._col4 (type: boolean), VALUE._col5 (type: boolean), VALUE._col6 (type: boolean), VALUE._col7 (type: boolean) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 40 Data size: 622 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 40 Data size: 5694 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -539,15 +539,15 @@ STAGE PLANS: Map Operator Tree: TableScan alias: alltypesorc_wrong - Statistics: Num rows: 1 Data size: 698 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 103 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: to_unix_timestamp(stimestamp1) (type: bigint), year(stimestamp1) (type: int), month(stimestamp1) (type: int), day(stimestamp1) (type: int), dayofmonth(stimestamp1) (type: int), weekofyear(stimestamp1) (type: int), hour(stimestamp1) (type: int), minute(stimestamp1) (type: int), second(stimestamp1) (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 - Statistics: Num rows: 1 Data size: 698 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 103 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: bigint) sort order: + - Statistics: Num rows: 1 Data size: 698 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 103 Basic stats: COMPLETE Column stats: NONE value expressions: _col1 (type: int), _col2 (type: int), _col3 (type: int), _col4 (type: int), _col5 (type: int), _col6 (type: int), _col7 (type: int), _col8 (type: int) Execution mode: vectorized Reducer 2 @@ -555,10 +555,10 @@ STAGE PLANS: Select Operator expressions: KEY.reducesinkkey0 (type: bigint), VALUE._col0 (type: int), VALUE._col1 (type: int), VALUE._col2 (type: int), VALUE._col3 (type: int), VALUE._col4 (type: int), VALUE._col5 (type: int), VALUE._col6 (type: int), VALUE._col7 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 - Statistics: Num rows: 1 Data size: 698 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 103 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 698 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 103 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat