commit d46898bb56c7be3b27c27519d5d50350a2d2522a
Author: Vihang Karajgaonkar
Date:   Thu Nov 2 15:57:41 2017 -0700

    HIVE-17961 : NPE during initialization of VectorizedParquetRecordReader when input split is null

diff --git a/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/storage/ColumnarStorageBench.java b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/storage/ColumnarStorageBench.java
index a14b7900afb00a7d304b0dc4f6482a2b87716919..e4d11fdb126e7687db70770179f8dcaf8d2f5b7b 100644
--- a/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/storage/ColumnarStorageBench.java
+++ b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/storage/ColumnarStorageBench.java
@@ -28,6 +28,7 @@
 import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
 import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
 import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
+import org.apache.hadoop.hive.ql.io.parquet.VectorizedColumnReaderTestBase;
 import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
 import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector;
 import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
@@ -332,15 +333,7 @@ public RecordReader getVectorizedRecordReader(Path inputPath) throws Exception {
       // types.
       conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
       conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,1,2,3,6");
-      conf.set(ReadSupport.PARQUET_READ_SCHEMA, "test schema");
-      HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
-      HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
-      Job vectorJob = new Job(conf, "read vector");
-      ParquetInputFormat.setInputPaths(vectorJob, inputPath);
-      ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class);
-      InputSplit split = (InputSplit) parquetInputFormat.getSplits(vectorJob).get(0);
-      initialVectorizedRowBatchCtx(conf);
-      return new VectorizedParquetRecordReader(split, new JobConf(conf));
+      return VectorizedColumnReaderTestBase.createTestParquetReader("test schema", conf);
     }
   }
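
Note: the benchmark no longer assembles a Parquet split by hand; it delegates to the shared
helper this patch introduces in VectorizedColumnReaderTestBase (see below). A minimal sketch
of the resulting call pattern -- the driver class and method here are illustrative only; the
helper and reader APIs are the ones shown in this patch:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.hadoop.hive.ql.io.parquet.VectorizedColumnReaderTestBase;
    import org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader;

    public class VectorizedReadSketch {
      // Hypothetical driver: obtain a reader from the shared helper and drain it batch by batch.
      public static long countRows(Configuration conf, String readSchema) throws Exception {
        VectorizedParquetRecordReader reader =
            VectorizedColumnReaderTestBase.createTestParquetReader(readSchema, conf);
        VectorizedRowBatch batch = reader.createValue();
        long rows = 0;
        while (reader.next(reader.createKey(), batch)) {
          rows += batch.size;        // each successful next() fills one vectorized batch
        }
        reader.close();
        return rows;
      }
    }
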
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
index dd1e1a976c672b2d0314d21c69f8e090cc70fe44..1d9dba7842835b8024410ae8d15e9b6d2f159675 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
@@ -118,20 +118,6 @@
    */
   protected long totalRowCount = 0;
 
-  @VisibleForTesting
-  public VectorizedParquetRecordReader(
-      InputSplit inputSplit,
-      JobConf conf) {
-    try {
-      serDeStats = new SerDeStats();
-      projectionPusher = new ProjectionPusher();
-      initialize(inputSplit, conf);
-    } catch (Throwable e) {
-      LOG.error("Failed to create the vectorized reader due to exception " + e);
-      throw new RuntimeException(e);
-    }
-  }
-
   public VectorizedParquetRecordReader(
       org.apache.hadoop.mapred.InputSplit oldInputSplit, JobConf conf) {
     this(oldInputSplit, conf, null, null, null);
@@ -146,6 +132,10 @@ public VectorizedParquetRecordReader(
     this.cacheConf = cacheConf;
     serDeStats = new SerDeStats();
     projectionPusher = new ProjectionPusher();
+    colsToInclude = ColumnProjectionUtils.getReadColumnIDs(conf);
+    //initialize the rowbatchContext
+    jobConf = conf;
+    rbCtx = Utilities.getVectorizedRowBatchCtx(jobConf);
     ParquetInputSplit inputSplit = getSplit(oldInputSplit, conf);
     if (inputSplit != null) {
       initialize(inputSplit, conf);
@@ -171,10 +161,6 @@ private void initPartitionValues(FileSplit fileSplit, JobConf conf) throws IOExc
 
   public void initialize(
     InputSplit oldSplit, JobConf configuration) throws IOException, InterruptedException {
-    colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
-    //initialize the rowbatchContext
-    jobConf = configuration;
-    rbCtx = Utilities.getVectorizedRowBatchCtx(jobConf);
     // the oldSplit may be null during the split phase
     if (oldSplit == null) {
       return;
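
Note on the fix itself: colsToInclude, jobConf and rbCtx are now populated in the constructor
instead of in initialize(). When getSplit() resolves the incoming split to null (the new qtest
below queries a partition value with no data), initialize() is never invoked, so previously
that state stayed null and later use of the reader failed with a NullPointerException. A minimal
sketch of the behaviour the patch establishes -- an illustrative helper, not code from this patch:

    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader;
    import org.apache.hadoop.mapred.InputSplit;
    import org.apache.hadoop.mapred.JobConf;

    public class NullSplitBehaviourSketch {
      // Hypothetical check: even when the resolved ParquetInputSplit is null, the reader
      // can still build a batch (rbCtx is set in the constructor) and simply reports no rows.
      static boolean readsAnyRows(InputSplit oldSplit, JobConf conf) throws Exception {
        VectorizedParquetRecordReader reader = new VectorizedParquetRecordReader(oldSplit, conf);
        VectorizedRowBatch batch = reader.createValue();  // needs rbCtx; now set unconditionally
        try {
          return reader.next(reader.createKey(), batch);  // false when there is nothing to read
        } finally {
          reader.close();
        }
      }
    }
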
getSplit() method", testReader.getSplit(fsplit, jobConf)); } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/VectorizedColumnReaderTestBase.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/VectorizedColumnReaderTestBase.java index f537ceee505c5f41d513df3c89b63453012c9979..1a5d0952fa0ed1b85a1cba843d9537ef5b5d0c3c 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/VectorizedColumnReaderTestBase.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/VectorizedColumnReaderTestBase.java @@ -41,6 +41,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapreduce.Job; @@ -213,18 +214,22 @@ protected static boolean isNull(int index) { return (index % NULL_FREQUENCY == 0); } - protected VectorizedParquetRecordReader createParquetReader(String schemaString, Configuration conf) + public static VectorizedParquetRecordReader createTestParquetReader(String schemaString, Configuration conf) throws IOException, InterruptedException, HiveException { conf.set(PARQUET_READ_SCHEMA, schemaString); HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true); HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp"); - Job vectorJob = new Job(conf, "read vector"); ParquetInputFormat.setInputPaths(vectorJob, file); + initialVectorizedRowBatchCtx(conf); + return new VectorizedParquetRecordReader(getFileSplit(vectorJob),new JobConf(conf)); + } + + protected static FileSplit getFileSplit(Job vectorJob) throws IOException, InterruptedException { ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class); InputSplit split = (InputSplit) parquetInputFormat.getSplits(vectorJob).get(0); - initialVectorizedRowBatchCtx(conf); - return new VectorizedParquetRecordReader(split, new JobConf(conf)); + FileSplit fsplit = new FileSplit(file,0L,split.getLength(),split.getLocations()); + return fsplit; } protected static void writeData(ParquetWriter writer, boolean isDictionaryEncoding) throws IOException { @@ -295,7 +300,7 @@ protected static void writeData(ParquetWriter writer, boolean isDictionar writer.close(); } - protected void initialVectorizedRowBatchCtx(Configuration conf) throws HiveException { + protected static void initialVectorizedRowBatchCtx(Configuration conf) throws HiveException { MapWork mapWork = new MapWork(); VectorizedRowBatchCtx rbCtx = new VectorizedRowBatchCtx(); rbCtx.init(createStructObjectInspector(conf), new String[0]); @@ -304,7 +309,7 @@ protected void initialVectorizedRowBatchCtx(Configuration conf) throws HiveExcep Utilities.setMapWork(conf, mapWork); } - private StructObjectInspector createStructObjectInspector(Configuration conf) { + private static StructObjectInspector createStructObjectInspector(Configuration conf) { // Create row related objects String columnNames = conf.get(IOConstants.COLUMNS); List columnNamesList = DataWritableReadSupport.getColumnNames(columnNames); @@ -321,7 +326,7 @@ protected void intRead(boolean isDictionaryEncoding) throws InterruptedException conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); VectorizedParquetRecordReader reader = - createParquetReader("message test { required int32 int32_field;}", conf); + createTestParquetReader("message test { 
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/VectorizedColumnReaderTestBase.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/VectorizedColumnReaderTestBase.java
index f537ceee505c5f41d513df3c89b63453012c9979..1a5d0952fa0ed1b85a1cba843d9537ef5b5d0c3c 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/VectorizedColumnReaderTestBase.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/VectorizedColumnReaderTestBase.java
@@ -41,6 +41,7 @@
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
 import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.FileSplit;
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapreduce.Job;
@@ -213,18 +214,22 @@ protected static boolean isNull(int index) {
     return (index % NULL_FREQUENCY == 0);
   }
 
-  protected VectorizedParquetRecordReader createParquetReader(String schemaString, Configuration conf)
+  public static VectorizedParquetRecordReader createTestParquetReader(String schemaString, Configuration conf)
       throws IOException, InterruptedException, HiveException {
     conf.set(PARQUET_READ_SCHEMA, schemaString);
     HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
     HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
-
     Job vectorJob = new Job(conf, "read vector");
     ParquetInputFormat.setInputPaths(vectorJob, file);
+    initialVectorizedRowBatchCtx(conf);
+    return new VectorizedParquetRecordReader(getFileSplit(vectorJob),new JobConf(conf));
+  }
+
+  protected static FileSplit getFileSplit(Job vectorJob) throws IOException, InterruptedException {
     ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class);
     InputSplit split = (InputSplit) parquetInputFormat.getSplits(vectorJob).get(0);
-    initialVectorizedRowBatchCtx(conf);
-    return new VectorizedParquetRecordReader(split, new JobConf(conf));
+    FileSplit fsplit = new FileSplit(file,0L,split.getLength(),split.getLocations());
+    return fsplit;
   }
 
   protected static void writeData(ParquetWriter writer, boolean isDictionaryEncoding) throws IOException {
@@ -295,7 +300,7 @@ protected static void writeData(ParquetWriter writer, boolean isDictionar
     writer.close();
   }
 
-  protected void initialVectorizedRowBatchCtx(Configuration conf) throws HiveException {
+  protected static void initialVectorizedRowBatchCtx(Configuration conf) throws HiveException {
     MapWork mapWork = new MapWork();
     VectorizedRowBatchCtx rbCtx = new VectorizedRowBatchCtx();
     rbCtx.init(createStructObjectInspector(conf), new String[0]);
@@ -304,7 +309,7 @@ protected void initialVectorizedRowBatchCtx(Configuration conf) throws HiveExcep
     Utilities.setMapWork(conf, mapWork);
   }
 
-  private StructObjectInspector createStructObjectInspector(Configuration conf) {
+  private static StructObjectInspector createStructObjectInspector(Configuration conf) {
     // Create row related objects
     String columnNames = conf.get(IOConstants.COLUMNS);
     List columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
@@ -321,7 +326,7 @@ protected void intRead(boolean isDictionaryEncoding) throws InterruptedException
     conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
     conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
     VectorizedParquetRecordReader reader =
-        createParquetReader("message test { required int32 int32_field;}", conf);
+        createTestParquetReader("message test { required int32 int32_field;}", conf);
     VectorizedRowBatch previous = reader.createValue();
     try {
       int c = 0;
@@ -350,7 +355,7 @@ protected void longRead(boolean isDictionaryEncoding) throws Exception {
     conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
     conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
     VectorizedParquetRecordReader reader =
-        createParquetReader("message test { required int64 int64_field;}", conf);
+        createTestParquetReader("message test { required int64 int64_field;}", conf);
     VectorizedRowBatch previous = reader.createValue();
     try {
       int c = 0;
@@ -379,7 +384,7 @@ protected void doubleRead(boolean isDictionaryEncoding) throws Exception {
     conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
     conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
     VectorizedParquetRecordReader reader =
-        createParquetReader("message test { required double double_field;}", conf);
+        createTestParquetReader("message test { required double double_field;}", conf);
     VectorizedRowBatch previous = reader.createValue();
     try {
       int c = 0;
@@ -409,7 +414,7 @@ protected void floatRead(boolean isDictionaryEncoding) throws Exception {
     conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
     conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
     VectorizedParquetRecordReader reader =
-        createParquetReader("message test { required float float_field;}", conf);
+        createTestParquetReader("message test { required float float_field;}", conf);
     VectorizedRowBatch previous = reader.createValue();
     try {
       int c = 0;
@@ -439,7 +444,7 @@ protected void booleanRead() throws Exception {
     conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
     conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
     VectorizedParquetRecordReader reader =
-        createParquetReader("message test { required boolean boolean_field;}", conf);
+        createTestParquetReader("message test { required boolean boolean_field;}", conf);
     VectorizedRowBatch previous = reader.createValue();
     try {
       int c = 0;
@@ -468,7 +473,7 @@ protected void binaryRead(boolean isDictionaryEncoding) throws Exception {
     conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
     conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
     VectorizedParquetRecordReader reader =
-        createParquetReader("message test { required binary binary_field_some_null;}", conf);
+        createTestParquetReader("message test { required binary binary_field_some_null;}", conf);
     VectorizedRowBatch previous = reader.createValue();
     int c = 0;
     try {
@@ -511,7 +516,7 @@ protected void structRead(boolean isDictionaryEncoding) throws Exception {
         + " optional double b;\n"
         + "}\n"
         + "}\n";
-    VectorizedParquetRecordReader reader = createParquetReader(schema, conf);
+    VectorizedParquetRecordReader reader = createTestParquetReader(schema, conf);
     VectorizedRowBatch previous = reader.createValue();
     int c = 0;
     try {
@@ -551,7 +556,7 @@ protected void nestedStructRead0(boolean isDictionaryEncoding) throws Exception
         + " }"
        + "optional double e;\n"
        + "}\n";
-    VectorizedParquetRecordReader reader = createParquetReader(schema, conf);
+    VectorizedParquetRecordReader reader = createTestParquetReader(schema, conf);
     VectorizedRowBatch previous = reader.createValue();
     int c = 0;
     try {
@@ -592,7 +597,7 @@ protected void nestedStructRead1(boolean isDictionaryEncoding) throws Exception
        + " optional int32 c;\n"
        + " }"
        + "}\n";
-    VectorizedParquetRecordReader reader = createParquetReader(schema, conf);
+    VectorizedParquetRecordReader reader = createTestParquetReader(schema, conf);
     VectorizedRowBatch previous = reader.createValue();
     int c = 0;
     try {
@@ -628,7 +633,7 @@ protected void structReadSomeNull(boolean isDictionaryEncoding) throws Exception
        + " optional int32 f;\n"
        + " optional double g;\n"
        + "}\n";
-    VectorizedParquetRecordReader reader = createParquetReader(schema, conf);
+    VectorizedParquetRecordReader reader = createTestParquetReader(schema, conf);
     VectorizedRowBatch previous = reader.createValue();
     int c = 0;
     try {
@@ -669,7 +674,7 @@ protected void decimalRead(boolean isDictionaryEncoding) throws Exception {
     conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
     conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
     VectorizedParquetRecordReader reader =
-        createParquetReader("message hive_schema { required value (DECIMAL(5,2));}", conf);
+        createTestParquetReader("message hive_schema { required value (DECIMAL(5,2));}", conf);
     VectorizedRowBatch previous = reader.createValue();
     try {
       int c = 0;
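
Note: createParquetReader becomes the public static createTestParquetReader so the JMH benchmark
and the tests share one construction path, and the new getFileSplit helper bridges an API gap:
ParquetInputFormat.getSplits() returns new-API (org.apache.hadoop.mapreduce) splits, while the
surviving VectorizedParquetRecordReader constructor takes an old-API (org.apache.hadoop.mapred)
split. A self-contained sketch of that conversion, assuming the caller already has the target
file's Path; the helper name here is illustrative:

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapred.FileSplit;
    import org.apache.hadoop.mapreduce.InputSplit;

    final class SplitConversionSketch {
      // Wrap the extent of a new-API split into an old-API FileSplit over the given file,
      // mirroring what getFileSplit() does for the test base's single data file.
      static FileSplit toMapredSplit(Path file, InputSplit newApiSplit) throws Exception {
        return new FileSplit(file, 0L, newApiSplit.getLength(), newApiSplit.getLocations());
      }
    }
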
diff --git a/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q b/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q
index 8865c797419bc2b85bc301007ddaa2bb3643822c..56f8909991197434187d06278a7ab8e447849e0d 100644
--- a/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q
+++ b/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q
@@ -77,3 +77,20 @@ group by m1["color"];
 select m1["color"], count(*) from parquet_project_test
 group by m1["color"];
+
+
+create table if not exists parquet_nullsplit(key string, val string) partitioned by (len int)
+stored as parquet;
+
+insert into table parquet_nullsplit partition(len=1)
+values ('one', 'red');
+
+explain vectorization select count(*) from parquet_nullsplit where len = 1;
+select count(*) from parquet_nullsplit where len = 1;
+
+explain vectorization select count(*) from parquet_nullsplit where len = 99;
+select count(*) from parquet_nullsplit where len = 99;
+
+drop table parquet_nullsplit;
+drop table parquet_project_test;
+drop table parquet_types_staging;
diff --git a/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out b/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out
index faa22f9dcce4dc43d686143db3a05c6fef1061b0..3af758fa36d425cbec63f8f9415749e946013d02 100644
--- a/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out
+++ b/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out
@@ -456,3 +456,214 @@ POSTHOOK: Input: default@parquet_project_test
 blue 7
 green 7
 red 8
+PREHOOK: query: create table if not exists parquet_nullsplit(key string, val string) partitioned by (len int)
+stored as parquet
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_nullsplit
+POSTHOOK: query: create table if not exists parquet_nullsplit(key string, val string) partitioned by (len int)
+stored as parquet
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_nullsplit
+PREHOOK: query: insert into table parquet_nullsplit partition(len=1)
+values ('one', 'red')
+PREHOOK: type: QUERY
+PREHOOK: Output: default@parquet_nullsplit@len=1
+POSTHOOK: query: insert into table parquet_nullsplit partition(len=1)
+values ('one', 'red')
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@parquet_nullsplit@len=1
+POSTHOOK: Lineage: parquet_nullsplit PARTITION(len=1).key SIMPLE
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: parquet_nullsplit PARTITION(len=1).val SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 1 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquet_nullsplit + Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reducer 2 + Execution mode: vectorized + Reduce Vectorization: + enabled: true + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from parquet_nullsplit where len = 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_nullsplit +PREHOOK: Input: default@parquet_nullsplit@len=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from parquet_nullsplit where len = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_nullsplit +POSTHOOK: Input: default@parquet_nullsplit@len=1 +#### A masked pattern was here #### +1 +PREHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 99 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 99 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + 
Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquet_nullsplit + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (len = 99) (type: boolean) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reducer 2 + Execution mode: vectorized + Reduce Vectorization: + enabled: true + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from parquet_nullsplit where len = 99 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_nullsplit +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from parquet_nullsplit where len = 99 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_nullsplit +#### A masked pattern was here #### +0 +PREHOOK: query: drop table parquet_nullsplit +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@parquet_nullsplit +PREHOOK: Output: default@parquet_nullsplit +POSTHOOK: query: drop table parquet_nullsplit +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@parquet_nullsplit +POSTHOOK: Output: default@parquet_nullsplit +PREHOOK: query: drop table parquet_project_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@parquet_project_test +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: drop table parquet_project_test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@parquet_project_test +POSTHOOK: Output: default@parquet_project_test +PREHOOK: query: drop table parquet_types_staging +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@parquet_types_staging +PREHOOK: Output: default@parquet_types_staging +POSTHOOK: query: drop table parquet_types_staging +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@parquet_types_staging +POSTHOOK: Output: default@parquet_types_staging diff --git a/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out b/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out index bd8fbb15de8a6fa097eec317a633af060219f500..2b4c801b06724a7f0c28554115b8ec80118cb729 100644 --- 
a/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out +++ b/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out @@ -426,3 +426,196 @@ POSTHOOK: Input: default@parquet_project_test blue 7 green 7 red 8 +PREHOOK: query: create table if not exists parquet_nullsplit(key string, val string) partitioned by (len int) +stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_nullsplit +POSTHOOK: query: create table if not exists parquet_nullsplit(key string, val string) partitioned by (len int) +stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_nullsplit +PREHOOK: query: insert into table parquet_nullsplit partition(len=1) +values ('one', 'red') +PREHOOK: type: QUERY +PREHOOK: Output: default@parquet_nullsplit@len=1 +POSTHOOK: query: insert into table parquet_nullsplit partition(len=1) +values ('one', 'red') +POSTHOOK: type: QUERY +POSTHOOK: Output: default@parquet_nullsplit@len=1 +POSTHOOK: Lineage: parquet_nullsplit PARTITION(len=1).key SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: parquet_nullsplit PARTITION(len=1).val SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 1 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parquet_nullsplit + Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 
-1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from parquet_nullsplit where len = 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_nullsplit +PREHOOK: Input: default@parquet_nullsplit@len=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from parquet_nullsplit where len = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_nullsplit +POSTHOOK: Input: default@parquet_nullsplit@len=1 +#### A masked pattern was here #### +1 +PREHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 99 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 99 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parquet_nullsplit + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (len = 99) (type: boolean) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from parquet_nullsplit where len = 99 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_nullsplit +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from parquet_nullsplit where len = 99 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_nullsplit +#### A masked pattern was here #### +0 +PREHOOK: query: drop table parquet_nullsplit +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@parquet_nullsplit +PREHOOK: Output: default@parquet_nullsplit +POSTHOOK: query: drop table parquet_nullsplit +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@parquet_nullsplit +POSTHOOK: Output: default@parquet_nullsplit +PREHOOK: query: drop table parquet_project_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@parquet_project_test +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: drop table parquet_project_test +POSTHOOK: 
type: DROPTABLE +POSTHOOK: Input: default@parquet_project_test +POSTHOOK: Output: default@parquet_project_test +PREHOOK: query: drop table parquet_types_staging +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@parquet_types_staging +PREHOOK: Output: default@parquet_types_staging +POSTHOOK: query: drop table parquet_types_staging +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@parquet_types_staging +POSTHOOK: Output: default@parquet_types_staging