diff --git data/files/customer_demographics.txt data/files/customer_demographics.txt
new file mode 100644
index 0000000..90ab999
--- /dev/null
+++ data/files/customer_demographics.txt
@@ -0,0 +1,200 @@
+1|M|M|Primary|500|Good|0|0|0|
+2|F|M|Primary|500|Good|0|0|0|
+3|M|S|Primary|500|Good|0|0|0|
+4|F|S|Primary|500|Good|0|0|0|
+5|M|D|Primary|500|Good|0|0|0|
+6|F|D|Primary|500|Good|0|0|0|
+7|M|W|Primary|500|Good|0|0|0|
+8|F|W|Primary|500|Good|0|0|0|
+9|M|U|Primary|500|Good|0|0|0|
+10|F|U|Primary|500|Good|0|0|0|
+11|M|M|Secondary|500|Good|0|0|0|
+12|F|M|Secondary|500|Good|0|0|0|
+13|M|S|Secondary|500|Good|0|0|0|
+14|F|S|Secondary|500|Good|0|0|0|
+15|M|D|Secondary|500|Good|0|0|0|
+16|F|D|Secondary|500|Good|0|0|0|
+17|M|W|Secondary|500|Good|0|0|0|
+18|F|W|Secondary|500|Good|0|0|0|
+19|M|U|Secondary|500|Good|0|0|0|
+20|F|U|Secondary|500|Good|0|0|0|
+21|M|M|College|500|Good|0|0|0|
+22|F|M|College|500|Good|0|0|0|
+23|M|S|College|500|Good|0|0|0|
+24|F|S|College|500|Good|0|0|0|
+25|M|D|College|500|Good|0|0|0|
+26|F|D|College|500|Good|0|0|0|
+27|M|W|College|500|Good|0|0|0|
+28|F|W|College|500|Good|0|0|0|
+29|M|U|College|500|Good|0|0|0|
+30|F|U|College|500|Good|0|0|0|
+31|M|M|2 yr Degree|500|Good|0|0|0|
+32|F|M|2 yr Degree|500|Good|0|0|0|
+33|M|S|2 yr Degree|500|Good|0|0|0|
+34|F|S|2 yr Degree|500|Good|0|0|0|
+35|M|D|2 yr Degree|500|Good|0|0|0|
+36|F|D|2 yr Degree|500|Good|0|0|0|
+37|M|W|2 yr Degree|500|Good|0|0|0|
+38|F|W|2 yr Degree|500|Good|0|0|0|
+39|M|U|2 yr Degree|500|Good|0|0|0|
+40|F|U|2 yr Degree|500|Good|0|0|0|
+41|M|M|4 yr Degree|500|Good|0|0|0|
+42|F|M|4 yr Degree|500|Good|0|0|0|
+43|M|S|4 yr Degree|500|Good|0|0|0|
+44|F|S|4 yr Degree|500|Good|0|0|0|
+45|M|D|4 yr Degree|500|Good|0|0|0|
+46|F|D|4 yr Degree|500|Good|0|0|0|
+47|M|W|4 yr Degree|500|Good|0|0|0|
+48|F|W|4 yr Degree|500|Good|0|0|0|
+49|M|U|4 yr Degree|500|Good|0|0|0|
+50|F|U|4 yr Degree|500|Good|0|0|0|
+51|M|M|Advanced Degree|500|Good|0|0|0|
+52|F|M|Advanced Degree|500|Good|0|0|0|
+53|M|S|Advanced Degree|500|Good|0|0|0|
+54|F|S|Advanced Degree|500|Good|0|0|0|
+55|M|D|Advanced Degree|500|Good|0|0|0|
+56|F|D|Advanced Degree|500|Good|0|0|0|
+57|M|W|Advanced Degree|500|Good|0|0|0|
+58|F|W|Advanced Degree|500|Good|0|0|0|
+59|M|U|Advanced Degree|500|Good|0|0|0|
+60|F|U|Advanced Degree|500|Good|0|0|0|
+61|M|M|Unknown|500|Good|0|0|0|
+62|F|M|Unknown|500|Good|0|0|0|
+63|M|S|Unknown|500|Good|0|0|0|
+64|F|S|Unknown|500|Good|0|0|0|
+65|M|D|Unknown|500|Good|0|0|0|
+66|F|D|Unknown|500|Good|0|0|0|
+67|M|W|Unknown|500|Good|0|0|0|
+68|F|W|Unknown|500|Good|0|0|0|
+69|M|U|Unknown|500|Good|0|0|0|
+70|F|U|Unknown|500|Good|0|0|0|
+71|M|M|Primary|1000|Good|0|0|0|
+72|F|M|Primary|1000|Good|0|0|0|
+73|M|S|Primary|1000|Good|0|0|0|
+74|F|S|Primary|1000|Good|0|0|0|
+75|M|D|Primary|1000|Good|0|0|0|
+76|F|D|Primary|1000|Good|0|0|0|
+77|M|W|Primary|1000|Good|0|0|0|
+78|F|W|Primary|1000|Good|0|0|0|
+79|M|U|Primary|1000|Good|0|0|0|
+80|F|U|Primary|1000|Good|0|0|0|
+81|M|M|Secondary|1000|Good|0|0|0|
+82|F|M|Secondary|1000|Good|0|0|0|
+83|M|S|Secondary|1000|Good|0|0|0|
+84|F|S|Secondary|1000|Good|0|0|0|
+85|M|D|Secondary|1000|Good|0|0|0|
+86|F|D|Secondary|1000|Good|0|0|0|
+87|M|W|Secondary|1000|Good|0|0|0|
+88|F|W|Secondary|1000|Good|0|0|0|
+89|M|U|Secondary|1000|Good|0|0|0|
+90|F|U|Secondary|1000|Good|0|0|0|
+91|M|M|College|1000|Good|0|0|0|
+92|F|M|College|1000|Good|0|0|0|
+93|M|S|College|1000|Good|0|0|0|
+94|F|S|College|1000|Good|0|0|0|
+95|M|D|College|1000|Good|0|0|0|
+96|F|D|College|1000|Good|0|0|0|
+97|M|W|College|1000|Good|0|0|0|
+98|F|W|College|1000|Good|0|0|0|
+99|M|U|College|1000|Good|0|0|0|
+100|F|U|College|1000|Good|0|0|0|
+101|M|M|2 yr Degree|1000|Good|0|0|0|
+102|F|M|2 yr Degree|1000|Good|0|0|0|
+103|M|S|2 yr Degree|1000|Good|0|0|0|
+104|F|S|2 yr Degree|1000|Good|0|0|0|
+105|M|D|2 yr Degree|1000|Good|0|0|0|
+106|F|D|2 yr Degree|1000|Good|0|0|0|
+107|M|W|2 yr Degree|1000|Good|0|0|0|
+108|F|W|2 yr Degree|1000|Good|0|0|0|
+109|M|U|2 yr Degree|1000|Good|0|0|0|
+110|F|U|2 yr Degree|1000|Good|0|0|0|
+111|M|M|4 yr Degree|1000|Good|0|0|0|
+112|F|M|4 yr Degree|1000|Good|0|0|0|
+113|M|S|4 yr Degree|1000|Good|0|0|0|
+114|F|S|4 yr Degree|1000|Good|0|0|0|
+115|M|D|4 yr Degree|1000|Good|0|0|0|
+116|F|D|4 yr Degree|1000|Good|0|0|0|
+117|M|W|4 yr Degree|1000|Good|0|0|0|
+118|F|W|4 yr Degree|1000|Good|0|0|0|
+119|M|U|4 yr Degree|1000|Good|0|0|0|
+120|F|U|4 yr Degree|1000|Good|0|0|0|
+121|M|M|Advanced Degree|1000|Good|0|0|0|
+122|F|M|Advanced Degree|1000|Good|0|0|0|
+123|M|S|Advanced Degree|1000|Good|0|0|0|
+124|F|S|Advanced Degree|1000|Good|0|0|0|
+125|M|D|Advanced Degree|1000|Good|0|0|0|
+126|F|D|Advanced Degree|1000|Good|0|0|0|
+127|M|W|Advanced Degree|1000|Good|0|0|0|
+128|F|W|Advanced Degree|1000|Good|0|0|0|
+129|M|U|Advanced Degree|1000|Good|0|0|0|
+130|F|U|Advanced Degree|1000|Good|0|0|0|
+131|M|M|Unknown|1000|Good|0|0|0|
+132|F|M|Unknown|1000|Good|0|0|0|
+133|M|S|Unknown|1000|Good|0|0|0|
+134|F|S|Unknown|1000|Good|0|0|0|
+135|M|D|Unknown|1000|Good|0|0|0|
+136|F|D|Unknown|1000|Good|0|0|0|
+137|M|W|Unknown|1000|Good|0|0|0|
+138|F|W|Unknown|1000|Good|0|0|0|
+139|M|U|Unknown|1000|Good|0|0|0|
+140|F|U|Unknown|1000|Good|0|0|0|
+141|M|M|Primary|1500|Good|0|0|0|
+142|F|M|Primary|1500|Good|0|0|0|
+143|M|S|Primary|1500|Good|0|0|0|
+144|F|S|Primary|1500|Good|0|0|0|
+145|M|D|Primary|1500|Good|0|0|0|
+146|F|D|Primary|1500|Good|0|0|0|
+147|M|W|Primary|1500|Good|0|0|0|
+148|F|W|Primary|1500|Good|0|0|0|
+149|M|U|Primary|1500|Good|0|0|0|
+150|F|U|Primary|1500|Good|0|0|0|
+151|M|M|Secondary|1500|Good|0|0|0|
+152|F|M|Secondary|1500|Good|0|0|0|
+153|M|S|Secondary|1500|Good|0|0|0|
+154|F|S|Secondary|1500|Good|0|0|0|
+155|M|D|Secondary|1500|Good|0|0|0|
+156|F|D|Secondary|1500|Good|0|0|0|
+157|M|W|Secondary|1500|Good|0|0|0|
+158|F|W|Secondary|1500|Good|0|0|0|
+159|M|U|Secondary|1500|Good|0|0|0|
+160|F|U|Secondary|1500|Good|0|0|0|
+161|M|M|College|1500|Good|0|0|0|
+162|F|M|College|1500|Good|0|0|0|
+163|M|S|College|1500|Good|0|0|0|
+164|F|S|College|1500|Good|0|0|0|
+165|M|D|College|1500|Good|0|0|0|
+166|F|D|College|1500|Good|0|0|0|
+167|M|W|College|1500|Good|0|0|0|
+168|F|W|College|1500|Good|0|0|0|
+169|M|U|College|1500|Good|0|0|0|
+170|F|U|College|1500|Good|0|0|0|
+171|M|M|2 yr Degree|1500|Good|0|0|0|
+172|F|M|2 yr Degree|1500|Good|0|0|0|
+173|M|S|2 yr Degree|1500|Good|0|0|0|
+174|F|S|2 yr Degree|1500|Good|0|0|0|
+175|M|D|2 yr Degree|1500|Good|0|0|0|
+176|F|D|2 yr Degree|1500|Good|0|0|0|
+177|M|W|2 yr Degree|1500|Good|0|0|0|
+178|F|W|2 yr Degree|1500|Good|0|0|0|
+179|M|U|2 yr Degree|1500|Good|0|0|0|
+180|F|U|2 yr Degree|1500|Good|0|0|0|
+181|M|M|4 yr Degree|1500|Good|0|0|0|
+182|F|M|4 yr Degree|1500|Good|0|0|0|
+183|M|S|4 yr Degree|1500|Good|0|0|0|
+184|F|S|4 yr Degree|1500|Good|0|0|0|
+185|M|D|4 yr Degree|1500|Good|0|0|0|
+186|F|D|4 yr Degree|1500|Good|0|0|0|
+187|M|W|4 yr Degree|1500|Good|0|0|0|
+188|F|W|4 yr Degree|1500|Good|0|0|0|
+189|M|U|4 yr Degree|1500|Good|0|0|0|
+190|F|U|4 yr Degree|1500|Good|0|0|0|
+191|M|M|Advanced Degree|1500|Good|0|0|0|
+192|F|M|Advanced Degree|1500|Good|0|0|0|
+193|M|S|Advanced Degree|1500|Good|0|0|0|
+194|F|S|Advanced Degree|1500|Good|0|0|0| +195|M|D|Advanced Degree|1500|Good|0|0|0| +196|F|D|Advanced Degree|1500|Good|0|0|0| +197|M|W|Advanced Degree|1500|Good|0|0|0| +198|F|W|Advanced Degree|1500|Good|0|0|0| +199|M|U|Advanced Degree|1500|Good|0|0|0| +200|F|U|Advanced Degree|1500|Good|0|0|0| diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index 46350a3..6cd13a7 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -300,6 +300,7 @@ minitez.query.files.shared=acid_globallimit.q,\ vector_groupby_mapjoin.q,\ vector_groupby_reduce.q,\ vector_grouping_sets.q,\ + vector_include_no_sel.q,\ vector_if_expr.q,\ vector_inner_join.q,\ vector_interval_1.q,\ @@ -515,6 +516,7 @@ minillap.shared.query.files=bucket_map_join_tez1.q,\ tez_union_group_by.q,\ tez_smb_main.q,\ tez_smb_1.q,\ + vector_include_no_sel.q,\ vector_join_part_col_char.q,\ vectorized_dynamic_partition_pruning.q,\ tez_multi_union.q,\ diff --git llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java index 298f788..9798170 100644 --- llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java +++ llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java @@ -131,7 +131,6 @@ private final SearchArgument sarg; private final String[] columnNames; private final VectorizedRowBatchCtx rbCtx; - private final boolean[] columnsToIncludeTruncated; private final Object[] partitionValues; private final LinkedList pendingData = new LinkedList(); @@ -173,8 +172,6 @@ public LlapRecordReader( MapWork mapWork = Utilities.getMapWork(job); rbCtx = mapWork.getVectorizedRowBatchCtx(); - columnsToIncludeTruncated = rbCtx.getColumnsToIncludeTruncated(job); - int partitionColumnCount = rbCtx.getPartitionColumnCount(); if (partitionColumnCount > 0) { partitionValues = new Object[partitionColumnCount]; @@ -294,7 +291,7 @@ public NullWritable createKey() { @Override public VectorizedRowBatch createValue() { - return rbCtx.createVectorizedRowBatch(columnsToIncludeTruncated); + return rbCtx.createVectorizedRowBatch(); } @Override diff --git orc/src/java/org/apache/orc/impl/TreeReaderFactory.java orc/src/java/org/apache/orc/impl/TreeReaderFactory.java index 5901c8c..ba175a3 100644 --- orc/src/java/org/apache/orc/impl/TreeReaderFactory.java +++ orc/src/java/org/apache/orc/impl/TreeReaderFactory.java @@ -1732,9 +1732,12 @@ public void nextBatch(VectorizedRowBatch batch, int batchSize) throws IOException { for(int i=0; i < fields.length && (vectorColumnCount == -1 || i < vectorColumnCount); ++i) { - batch.cols[i].reset(); - batch.cols[i].ensureSize((int) batchSize, false); - fields[i].nextVector(batch.cols[i], null, batchSize); + ColumnVector colVector = batch.cols[i]; + if (colVector != null) { + batch.cols[i].reset(); + batch.cols[i].ensureSize((int) batchSize, false); + fields[i].nextVector(batch.cols[i], null, batchSize); + } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java index 894ef59..9c84937 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java @@ -335,11 +335,6 @@ public int initConversion(TypeInfo[] sourceTypeInfos, TypeInfo[] targetTypeInfos */ public void 
assignRowColumn(VectorizedRowBatch batch, int batchIndex, int logicalColumnIndex, Object object) { - final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; - if (object == null) { - VectorizedBatchUtil.setNullColIsNullValue(batch.cols[projectionColumnNum], batchIndex); - return; - } Category targetCategory = targetCategories[logicalColumnIndex]; if (targetCategory == null) { /* @@ -347,6 +342,11 @@ public void assignRowColumn(VectorizedRowBatch batch, int batchIndex, int logica */ return; } + final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(batch.cols[projectionColumnNum], batchIndex); + return; + } switch (targetCategory) { case PRIMITIVE: { @@ -493,19 +493,19 @@ public void assignRowColumn(VectorizedRowBatch batch, int batchIndex, int logica public void assignConvertRowColumn(VectorizedRowBatch batch, int batchIndex, int logicalColumnIndex, Object object) { Preconditions.checkState(isConvert[logicalColumnIndex]); + Category targetCategory = targetCategories[logicalColumnIndex]; + if (targetCategory == null) { + /* + * This is a column that we don't want (i.e. not included) -- we are done. + */ + return; + } final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; if (object == null) { VectorizedBatchUtil.setNullColIsNullValue(batch.cols[projectionColumnNum], batchIndex); return; } try { - Category targetCategory = targetCategories[logicalColumnIndex]; - if (targetCategory == null) { - /* - * This is a column that we don't want (i.e. not included) -- we are done. - */ - return; - } switch (targetCategory) { case PRIMITIVE: PrimitiveCategory targetPrimitiveCategory = targetPrimitiveCategories[logicalColumnIndex]; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java index 2e8331a..fca1882 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java @@ -342,21 +342,24 @@ public void init() throws HiveException { */ private void deserializeRowColumn(VectorizedRowBatch batch, int batchIndex, int logicalColumnIndex) throws IOException { - final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; - if (deserializeRead.readCheckNull()) { - VectorizedBatchUtil.setNullColIsNullValue(batch.cols[projectionColumnNum], batchIndex); - return; - } - - // We have a value for the row column. Category sourceCategory = sourceCategories[logicalColumnIndex]; if (sourceCategory == null) { /* * This is a column that we don't want (i.e. not included). - * The deserializeRead.readCheckNull() has read the field, so we are done. + * The deserializeRead.readCheckNull() will read the field. */ + boolean isNull = deserializeRead.readCheckNull(); + Preconditions.checkState(isNull); return; } + + final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(batch.cols[projectionColumnNum], batchIndex); + return; + } + + // We have a value for the row column. 
switch (sourceCategory) { case PRIMITIVE: { @@ -480,21 +483,24 @@ private void deserializeRowColumn(VectorizedRowBatch batch, int batchIndex, */ private void deserializeConvertRowColumn(VectorizedRowBatch batch, int batchIndex, int logicalColumnIndex) throws IOException { - final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; - if (deserializeRead.readCheckNull()) { - VectorizedBatchUtil.setNullColIsNullValue(batch.cols[projectionColumnNum], batchIndex); - return; - } - - // We have a value for the row column. Category sourceCategory = sourceCategories[logicalColumnIndex]; if (sourceCategory == null) { /* * This is a column that we don't want (i.e. not included). - * The deserializeRead.readCheckNull() has read the field, so we are done. + * The deserializeRead.readCheckNull() will read the field. */ + boolean isNull = deserializeRead.readCheckNull(); + Preconditions.checkState(isNull); return; } + + final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(batch.cols[projectionColumnNum], batchIndex); + return; + } + + // We have a value for the row column. Writable convertSourceWritable = convertSourceWritables[logicalColumnIndex]; switch (sourceCategory) { case PRIMITIVE: diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java index b7b5ae8..e6dc9ec 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java @@ -183,8 +183,8 @@ public Object extractRowColumn(VectorizedRowBatch batch, int batchIndex, int log final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; ColumnVector colVector = batch.cols[projectionColumnNum]; if (colVector == null) { - // In rare cases, the planner will not include columns for reading but other parts of - // execution will ask for but not use them.. + // The planner will not include unneeded columns for reading but other parts of execution + // may ask for them.. return null; } int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java index 6979956..d39757e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java @@ -100,6 +100,10 @@ private transient TypeInfo[] tableRowTypeInfos; + private transient int[] dataColumnNums; + + private transient StandardStructObjectInspector neededStandardStructObjectInspector; + private transient VectorizedRowBatchCtx batchContext; // The context for creating the VectorizedRowBatch for this Map node that // the Vectorizer class determined. @@ -122,7 +126,7 @@ private transient int partitionColumnCount; private transient Object[] partitionValues; - private transient boolean[] columnsToIncludeTruncated; + private transient boolean[] dataColumnsToIncludeTruncated; /* * The following members have context information for the current partition file being read. @@ -264,7 +268,7 @@ public void init(Configuration hconf) // Initialize with data row type conversion parameters. 
readerColumnCount = - vectorDeserializeRow.initConversion(tableRowTypeInfos, columnsToIncludeTruncated); + vectorDeserializeRow.initConversion(tableRowTypeInfos, dataColumnsToIncludeTruncated); deserializeRead = lazySimpleDeserializeRead; } @@ -280,7 +284,7 @@ public void init(Configuration hconf) // Initialize with data row type conversion parameters. readerColumnCount = - vectorDeserializeRow.initConversion(tableRowTypeInfos, columnsToIncludeTruncated); + vectorDeserializeRow.initConversion(tableRowTypeInfos, dataColumnsToIncludeTruncated); deserializeRead = lazyBinaryDeserializeRead; } @@ -351,7 +355,7 @@ public void init(Configuration hconf) // Initialize with data type conversion parameters. readerColumnCount = - vectorAssign.initConversion(dataTypeInfos, tableRowTypeInfos, columnsToIncludeTruncated); + vectorAssign.initConversion(dataTypeInfos, tableRowTypeInfos, dataColumnsToIncludeTruncated); } } @@ -396,40 +400,24 @@ public VectorPartitionContext createAndInitPartitionContext(PartitionDesc partDe return vectorPartitionContext; } - private void determineColumnsToInclude(Configuration hconf) { - - columnsToIncludeTruncated = null; + private void determineDataColumnsToIncludeTruncated() { - List columnsToIncludeTruncatedList = ColumnProjectionUtils.getReadColumnIDs(hconf); - if (columnsToIncludeTruncatedList != null && - columnsToIncludeTruncatedList.size() > 0 && columnsToIncludeTruncatedList.size() < dataColumnCount ) { - - // Partitioned columns will not be in the include list. - - boolean[] columnsToInclude = new boolean[dataColumnCount]; - Arrays.fill(columnsToInclude, false); - for (int columnNum : columnsToIncludeTruncatedList) { - columnsToInclude[columnNum] = true; - } + Preconditions.checkState(batchContext != null); + Preconditions.checkState(dataColumnNums != null); - // Work backwards to find the highest wanted column. + boolean[] columnsToInclude = new boolean[dataColumnCount];; + final int count = dataColumnNums.length; + int columnNum = -1; + for (int i = 0; i < count; i++) { + columnNum = dataColumnNums[i]; + Preconditions.checkState(columnNum < dataColumnCount); + columnsToInclude[columnNum] = true; + } - int highestWantedColumnNum = -1; - for (int i = dataColumnCount - 1; i >= 0; i--) { - if (columnsToInclude[i]) { - highestWantedColumnNum = i; - break; - } - } - if (highestWantedColumnNum == -1) { - throw new RuntimeException("No columns to include?"); - } - int newColumnCount = highestWantedColumnNum + 1; - if (newColumnCount == dataColumnCount) { - columnsToIncludeTruncated = columnsToInclude; - } else { - columnsToIncludeTruncated = Arrays.copyOf(columnsToInclude, newColumnCount); - } + if (columnNum == -1) { + dataColumnsToIncludeTruncated = new boolean[0]; + } else { + dataColumnsToIncludeTruncated = Arrays.copyOf(columnsToInclude, columnNum + 1); } } @@ -479,23 +467,20 @@ private void internalSetChildren(Configuration hconf) throws Exception { // so set it here to none. currentReadType = VectorMapOperatorReadType.NONE; - determineColumnsToInclude(hconf); - batchContext = conf.getVectorizedRowBatchCtx(); - /* * Use a different batch for vectorized Input File Format readers so they can do their work * overlapped with work of the row collection that vector/row deserialization does. This allows * the partitions to mix modes (e.g. for us to flush the previously batched rows on file change). 
*/ vectorizedInputFileFormatBatch = - batchContext.createVectorizedRowBatch(columnsToIncludeTruncated); + batchContext.createVectorizedRowBatch(); conf.setVectorizedRowBatch(vectorizedInputFileFormatBatch); /* * This batch is used by vector/row deserializer readers. */ - deserializerBatch = batchContext.createVectorizedRowBatch(columnsToIncludeTruncated); + deserializerBatch = batchContext.createVectorizedRowBatch(); batchCounter = 0; @@ -503,13 +488,21 @@ private void internalSetChildren(Configuration hconf) throws Exception { partitionColumnCount = batchContext.getPartitionColumnCount(); partitionValues = new Object[partitionColumnCount]; + dataColumnNums = batchContext.getDataColumnNums(); + Preconditions.checkState(dataColumnNums != null); + + // Form a truncated boolean include array for our vector/row deserializers. + determineDataColumnsToIncludeTruncated(); + /* * Create table related objects */ + final String[] rowColumnNames = batchContext.getRowColumnNames(); + final TypeInfo[] rowColumnTypeInfos = batchContext.getRowColumnTypeInfos(); tableStructTypeInfo = TypeInfoFactory.getStructTypeInfo( - Arrays.asList(batchContext.getRowColumnNames()), - Arrays.asList(batchContext.getRowColumnTypeInfos())); + Arrays.asList(rowColumnNames), + Arrays.asList(rowColumnTypeInfos)); tableStandardStructObjectInspector = (StandardStructObjectInspector) TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(tableStructTypeInfo); @@ -517,6 +510,20 @@ private void internalSetChildren(Configuration hconf) throws Exception { tableRowTypeInfos = batchContext.getRowColumnTypeInfos(); /* + * NOTE: We do not alter the projectedColumns / projectionSize of the batches to just be + * the included columns (+ partition columns). + * + * For now, we need to model the object inspector rows because there are still several + * vectorized operators that use them. + * + * We need to continue to model the Object[] as having null objects for not included columns + * until the following has been fixed: + * o When we have to output a STRUCT for AVG we switch to row GroupBy operators. + * o Some variations of VectorMapOperator, VectorReduceSinkOperator, VectorFileSinkOperator + * use the row super class to process rows. + */ + + /* * The Vectorizer class enforces that there is only one TableScanOperator, so * we don't need the more complicated multiple root operator mapping that MapOperator has. */ @@ -657,13 +664,15 @@ private void setupPartitionContextVars(String nominalPath) throws HiveException if (currentDataColumnCount < dataColumnCount) { /* - * Default any additional data columns to NULL once for the file. + * Default any additional data columns to NULL once for the file (if they are present). */ for (int i = currentDataColumnCount; i < dataColumnCount; i++) { ColumnVector colVector = deserializerBatch.cols[i]; - colVector.isNull[0] = true; - colVector.noNulls = false; - colVector.isRepeating = true; + if (colVector != null) { + colVector.isNull[0] = true; + colVector.noNulls = false; + colVector.isRepeating = true; + } } } @@ -788,8 +797,11 @@ public void process(Writable value) throws HiveException { * because they are not present in the partition, and not partition columns. 
*/ for (int c = 0; c < currentDataColumnCount; c++) { - deserializerBatch.cols[c].reset(); - deserializerBatch.cols[c].init(); + ColumnVector colVector = deserializerBatch.cols[c]; + if (colVector != null) { + deserializerBatch.cols[c].reset(); + deserializerBatch.cols[c].init(); + } } deserializerBatch.selectedInUse = false; deserializerBatch.size = 0; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index c4f47e1..321ae79 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -186,6 +186,37 @@ public VectorizationContext(String contextName, List initialColumnNames) vMap = new VectorExpressionDescriptor(); } + public VectorizationContext(String contextName, List tableColumnNames, + int dataColumnCount, List dataColumnNums) { + this.contextName = contextName; + level = 0; + initialColumnNames = tableColumnNames; + + // Make a copy since will be adding partition columns below. + projectedColumns = new ArrayList(dataColumnNums); + + projectionColumnNames = new ArrayList(); + projectionColumnMap = new HashMap(); + for (int dataColumnNum : dataColumnNums) { + String columnName = tableColumnNames.get(dataColumnNum); + projectionColumnNames.add(columnName); + projectionColumnMap.put(columnName, dataColumnNum); + } + + final int endColumnNum = tableColumnNames.size(); + for (int partColumnNum = dataColumnCount; partColumnNum < endColumnNum; partColumnNum++) { + projectedColumns.add(partColumnNum); + String columnName = tableColumnNames.get(partColumnNum); + projectionColumnNames.add(columnName); + projectionColumnMap.put(columnName, partColumnNum); + } + + int firstOutputColumnIndex = tableColumnNames.size(); + this.ocm = new OutputColumnManager(firstOutputColumnIndex); + this.firstOutputColumnIndex = firstOutputColumnIndex; + vMap = new VectorExpressionDescriptor(); + } + // Constructor to with the individual addInitialColumn method // followed by a call to finishedAddingInitialColumns. public VectorizationContext(String contextName) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java index 82a97e0..e06349d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java @@ -50,6 +50,8 @@ import org.apache.hadoop.mapred.FileSplit; import org.apache.hive.common.util.DateUtils; +import com.google.common.base.Preconditions; + /** * Context for Vectorized row batch. this class does eager deserialization of row data using serde * in the RecordReader layer. @@ -68,6 +70,7 @@ // It will be stored in MapWork and ReduceWork. 
private String[] rowColumnNames; private TypeInfo[] rowColumnTypeInfos; + private int[] dataColumnNums; private int dataColumnCount; private int partitionColumnCount; @@ -80,9 +83,10 @@ public VectorizedRowBatchCtx() { } public VectorizedRowBatchCtx(String[] rowColumnNames, TypeInfo[] rowColumnTypeInfos, - int partitionColumnCount, String[] scratchColumnTypeNames) { + int[] dataColumnNums, int partitionColumnCount, String[] scratchColumnTypeNames) { this.rowColumnNames = rowColumnNames; this.rowColumnTypeInfos = rowColumnTypeInfos; + this.dataColumnNums = dataColumnNums; this.partitionColumnCount = partitionColumnCount; this.scratchColumnTypeNames = scratchColumnTypeNames; @@ -97,6 +101,10 @@ public VectorizedRowBatchCtx(String[] rowColumnNames, TypeInfo[] rowColumnTypeIn return rowColumnTypeInfos; } + public int[] getDataColumnNums() { + return dataColumnNums; + } + public int getDataColumnCount() { return dataColumnCount; } @@ -123,6 +131,7 @@ public void init(StructObjectInspector structObjectInspector, String[] scratchCo // Row column information. rowColumnNames = VectorizedBatchUtil.columnNamesFromStructObjectInspector(structObjectInspector); rowColumnTypeInfos = VectorizedBatchUtil.typeInfosFromStructObjectInspector(structObjectInspector); + dataColumnNums = null; partitionColumnCount = 0; dataColumnCount = rowColumnTypeInfos.length; @@ -185,44 +194,30 @@ public static void getPartitionValues(VectorizedRowBatchCtx vrbCtx, PartitionDes */ public VectorizedRowBatch createVectorizedRowBatch() { - int totalColumnCount = rowColumnTypeInfos.length + scratchColumnTypeNames.length; - VectorizedRowBatch result = new VectorizedRowBatch(totalColumnCount); - - LOG.info("createVectorizedRowBatch columnsToIncludeTruncated NONE"); - for (int i = 0; i < rowColumnTypeInfos.length; i++) { - TypeInfo typeInfo = rowColumnTypeInfos[i]; - result.cols[i] = VectorizedBatchUtil.createColumnVector(typeInfo); - } - - for (int i = 0; i < scratchColumnTypeNames.length; i++) { - String typeName = scratchColumnTypeNames[i]; - result.cols[rowColumnTypeInfos.length + i] = - VectorizedBatchUtil.createColumnVector(typeName); - } - - result.setPartitionInfo(dataColumnCount, partitionColumnCount); - - result.reset(); - return result; - } - - public VectorizedRowBatch createVectorizedRowBatch(boolean[] columnsToIncludeTruncated) - { - if (columnsToIncludeTruncated == null) { - return createVectorizedRowBatch(); - } - - LOG.info("createVectorizedRowBatch columnsToIncludeTruncated " + Arrays.toString(columnsToIncludeTruncated)); - int totalColumnCount = rowColumnTypeInfos.length + scratchColumnTypeNames.length; + final int dataAndPartColumnCount = rowColumnTypeInfos.length; + final int totalColumnCount = dataAndPartColumnCount + scratchColumnTypeNames.length; VectorizedRowBatch result = new VectorizedRowBatch(totalColumnCount); - for (int i = 0; i < dataColumnCount; i++) { - TypeInfo typeInfo = rowColumnTypeInfos[i]; - result.cols[i] = VectorizedBatchUtil.createColumnVector(typeInfo); - } - for (int i = dataColumnCount; i < dataColumnCount + partitionColumnCount; i++) { - TypeInfo typeInfo = rowColumnTypeInfos[i]; - result.cols[i] = VectorizedBatchUtil.createColumnVector(typeInfo); + if (dataColumnNums == null) { + // All data and partition columns. + for (int i = 0; i < dataAndPartColumnCount; i++) { + TypeInfo typeInfo = rowColumnTypeInfos[i]; + result.cols[i] = VectorizedBatchUtil.createColumnVector(typeInfo); + } + } else { + // Create only needed/included columns data columns. 
+ for (int i = 0; i < dataColumnNums.length; i++) { + int columnNum = dataColumnNums[i]; + Preconditions.checkState(columnNum < dataAndPartColumnCount); + TypeInfo typeInfo = rowColumnTypeInfos[columnNum]; + result.cols[columnNum] = VectorizedBatchUtil.createColumnVector(typeInfo); + } + // Always create partition columns. + final int endColumnNum = dataColumnCount + partitionColumnCount; + for (int partitionColumnNum = dataColumnCount; partitionColumnNum < endColumnNum; partitionColumnNum++) { + TypeInfo typeInfo = rowColumnTypeInfos[partitionColumnNum]; + result.cols[partitionColumnNum] = VectorizedBatchUtil.createColumnVector(typeInfo); + } } for (int i = 0; i < scratchColumnTypeNames.length; i++) { @@ -237,45 +232,6 @@ public VectorizedRowBatch createVectorizedRowBatch(boolean[] columnsToIncludeTru return result; } - public boolean[] getColumnsToIncludeTruncated(Configuration conf) { - boolean[] columnsToIncludeTruncated = null; - - List columnsToIncludeTruncatedList = ColumnProjectionUtils.getReadColumnIDs(conf); - if (columnsToIncludeTruncatedList != null && columnsToIncludeTruncatedList.size() > 0 ) { - - // Partitioned columns will not be in the include list. - - boolean[] columnsToInclude = new boolean[dataColumnCount]; - Arrays.fill(columnsToInclude, false); - for (int columnNum : columnsToIncludeTruncatedList) { - if (columnNum < dataColumnCount) { - columnsToInclude[columnNum] = true; - } - } - - // Work backwards to find the highest wanted column. - - int highestWantedColumnNum = -1; - for (int i = dataColumnCount - 1; i >= 0; i--) { - if (columnsToInclude[i]) { - highestWantedColumnNum = i; - break; - } - } - if (highestWantedColumnNum == -1) { - throw new RuntimeException("No columns to include?"); - } - int newColumnCount = highestWantedColumnNum + 1; - if (newColumnCount == dataColumnCount) { - // Didn't trim any columns off the end. Use the original. - columnsToIncludeTruncated = columnsToInclude; - } else { - columnsToIncludeTruncated = Arrays.copyOf(columnsToInclude, newColumnCount); - } - } - return columnsToIncludeTruncated; - } - /** * Add the partition values to the batch * @@ -299,72 +255,72 @@ public void addPartitionColsToBatch(VectorizedRowBatch batch, Object[] partition lcv.noNulls = false; lcv.isNull[0] = true; lcv.isRepeating = true; - } else { + } else { lcv.fill((Boolean) value == true ? 
1 : 0); lcv.isNull[0] = false; } } - break; - + break; + case BYTE: { LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex]; if (value == null) { lcv.noNulls = false; lcv.isNull[0] = true; lcv.isRepeating = true; - } else { + } else { lcv.fill((Byte) value); lcv.isNull[0] = false; } } - break; - + break; + case SHORT: { LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex]; if (value == null) { lcv.noNulls = false; lcv.isNull[0] = true; lcv.isRepeating = true; - } else { + } else { lcv.fill((Short) value); lcv.isNull[0] = false; } } break; - + case INT: { LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex]; if (value == null) { lcv.noNulls = false; lcv.isNull[0] = true; lcv.isRepeating = true; - } else { + } else { lcv.fill((Integer) value); lcv.isNull[0] = false; - } + } } break; - + case LONG: { LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex]; if (value == null) { lcv.noNulls = false; lcv.isNull[0] = true; lcv.isRepeating = true; - } else { + } else { lcv.fill((Long) value); lcv.isNull[0] = false; - } + } } break; - + case DATE: { LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex]; if (value == null) { lcv.noNulls = false; lcv.isNull[0] = true; lcv.isRepeating = true; - } else { + } else { lcv.fill(DateWritable.dateToDays((Date) value)); lcv.isNull[0] = false; } @@ -417,10 +373,10 @@ public void addPartitionColsToBatch(VectorizedRowBatch batch, Object[] partition } else { dcv.fill((Float) value); dcv.isNull[0] = false; - } + } } break; - + case DOUBLE: { DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[colIndex]; if (value == null) { @@ -433,7 +389,7 @@ public void addPartitionColsToBatch(VectorizedRowBatch batch, Object[] partition } } break; - + case DECIMAL: { DecimalColumnVector dv = (DecimalColumnVector) batch.cols[colIndex]; if (value == null) { diff --git ql/src/java/org/apache/hadoop/hive/ql/io/NullRowsInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/NullRowsInputFormat.java index 80858a9..bca5096 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/NullRowsInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/NullRowsInputFormat.java @@ -67,7 +67,6 @@ public DummyInputSplit(String path) { private int counter; protected final VectorizedRowBatchCtx rbCtx; - private final boolean[] columnsToIncludeTruncated; private final Object[] partitionValues; private boolean addPartitionCols = true; @@ -78,7 +77,6 @@ public NullRowsRecordReader(Configuration conf, InputSplit split) throws IOExcep } if (isVectorMode) { rbCtx = Utilities.getVectorizedRowBatchCtx(conf); - columnsToIncludeTruncated = rbCtx.getColumnsToIncludeTruncated(conf); int partitionColumnCount = rbCtx.getPartitionColumnCount(); if (partitionColumnCount > 0) { partitionValues = new Object[partitionColumnCount]; @@ -88,7 +86,6 @@ public NullRowsRecordReader(Configuration conf, InputSplit split) throws IOExcep } } else { rbCtx = null; - columnsToIncludeTruncated = null; partitionValues = null; } } @@ -105,7 +102,7 @@ public NullWritable createKey() { @Override public Object createValue() { return rbCtx == null ? 
NullWritable.get() : - rbCtx.createVectorizedRowBatch(columnsToIncludeTruncated); + rbCtx.createVectorizedRowBatch(); } @Override @@ -143,11 +140,10 @@ protected void makeNullVrb(Object value, int size) { vrb.size = size; vrb.selectedInUse = false; for (int i = 0; i < rbCtx.getDataColumnCount(); i++) { - if (columnsToIncludeTruncated != null - && (columnsToIncludeTruncated.length <= i || !columnsToIncludeTruncated[i])) { + ColumnVector cv = vrb.cols[i]; + if (cv == null) { continue; } - ColumnVector cv = vrb.cols[i]; cv.noNulls = false; cv.isRepeating = true; cv.isNull[0] = true; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java index e4d2e6e..214b84f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java @@ -59,7 +59,6 @@ private final long length; private float progress = 0.0f; private VectorizedRowBatchCtx rbCtx; - private final boolean[] columnsToIncludeTruncated; private final Object[] partitionValues; private boolean addPartitionCols = true; @@ -102,8 +101,6 @@ this.reader = file.rowsOptions(options); - columnsToIncludeTruncated = rbCtx.getColumnsToIncludeTruncated(conf); - int partitionColumnCount = rbCtx.getPartitionColumnCount(); if (partitionColumnCount > 0) { partitionValues = new Object[partitionColumnCount]; @@ -145,7 +142,7 @@ public NullWritable createKey() { @Override public VectorizedRowBatch createValue() { - return rbCtx.createVectorizedRowBatch(columnsToIncludeTruncated); + return rbCtx.createVectorizedRowBatch(); } @Override diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 00203ae..e8cb9d6 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -34,6 +34,7 @@ import java.util.regex.Pattern; import org.apache.calcite.util.Pair; +import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -92,6 +93,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.GroupByDesc; import org.apache.hadoop.hive.ql.plan.JoinDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; @@ -160,6 +162,7 @@ import org.apache.hadoop.hive.ql.udf.UDFYear; import org.apache.hadoop.hive.ql.udf.generic.*; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.hive.serde2.Deserializer; import org.apache.hadoop.hive.serde2.NullStructSerDe; import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; @@ -352,8 +355,10 @@ public Vectorizer() { } private class VectorTaskColumnInfo { - List columnNames; - List typeInfos; + List allColumnNames; + List allTypeInfos; + List dataColumnNums; + int partitionColumnCount; boolean useVectorizedInputFileFormat; @@ -361,15 +366,20 @@ public Vectorizer() { Set> nonVectorizedOps; + TableScanOperator tableScanOperator; + VectorTaskColumnInfo() { partitionColumnCount = 0; } - public void setColumnNames(List columnNames) { - this.columnNames = columnNames; 
+ public void setAllColumnNames(List allColumnNames) { + this.allColumnNames = allColumnNames; + } + public void setAllTypeInfos(List allTypeInfos) { + this.allTypeInfos = allTypeInfos; } - public void setTypeInfos(List typeInfos) { - this.typeInfos = typeInfos; + public void setDataColumnNums(List dataColumnNums) { + this.dataColumnNums = dataColumnNums; } public void setPartitionColumnCount(int partitionColumnCount) { this.partitionColumnCount = partitionColumnCount; @@ -383,6 +393,9 @@ public void setUseVectorizedInputFileFormat(boolean useVectorizedInputFileFormat public void setNonVectorizedOps(Set> nonVectorizedOps) { this.nonVectorizedOps = nonVectorizedOps; } + public void setTableScanOperator(TableScanOperator tableScanOperator) { + this.tableScanOperator = tableScanOperator; + } public Set> getNonVectorizedOps() { return nonVectorizedOps; @@ -390,13 +403,20 @@ public void setNonVectorizedOps(Set> nonVectori public void transferToBaseWork(BaseWork baseWork) { - String[] columnNameArray = columnNames.toArray(new String[0]); - TypeInfo[] typeInfoArray = typeInfos.toArray(new TypeInfo[0]); + String[] allColumnNameArray = allColumnNames.toArray(new String[0]); + TypeInfo[] allTypeInfoArray = allTypeInfos.toArray(new TypeInfo[0]); + int[] dataColumnNumsArray; + if (dataColumnNums != null) { + dataColumnNumsArray = ArrayUtils.toPrimitive(dataColumnNums.toArray(new Integer[0])); + } else { + dataColumnNumsArray = null; + } VectorizedRowBatchCtx vectorizedRowBatchCtx = new VectorizedRowBatchCtx( - columnNameArray, - typeInfoArray, + allColumnNameArray, + allTypeInfoArray, + dataColumnNumsArray, partitionColumnCount, scratchTypeNameArray); baseWork.setVectorizedRowBatchCtx(vectorizedRowBatchCtx); @@ -515,6 +535,22 @@ private void getTableScanOperatorSchemaInfo(TableScanOperator tableScanOperator, } } + private void determineDataColumnNums(TableScanOperator tableScanOperator, + List allColumnNameList, int dataColumnCount, List dataColumnNums) { + + /* + * The TableScanOperator's needed columns are just the data columns. 
+ */ + Set neededColumns = new HashSet(tableScanOperator.getNeededColumns()); + + for (int dataColumnNum = 0; dataColumnNum < dataColumnCount; dataColumnNum++) { + String columnName = allColumnNameList.get(dataColumnNum); + if (neededColumns.contains(columnName)) { + dataColumnNums.add(dataColumnNum); + } + } + } + private String getHiveOptionsString() { StringBuilder sb = new StringBuilder(); sb.append(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname); @@ -656,6 +692,9 @@ private boolean validateInputFormatAndSchemaEvolution(MapWork mapWork, String al final List allTypeInfoList = new ArrayList(); getTableScanOperatorSchemaInfo(tableScanOperator, allColumnNameList, allTypeInfoList); + + final List dataColumnNums = new ArrayList(); + final int allColumnCount = allColumnNameList.size(); /* @@ -705,6 +744,9 @@ private boolean validateInputFormatAndSchemaEvolution(MapWork mapWork, String al dataColumnCount = allColumnCount; } + determineDataColumnNums(tableScanOperator, allColumnNameList, dataColumnCount, + dataColumnNums); + tableDataColumnList = allColumnNameList.subList(0, dataColumnCount); tableDataTypeInfoList = allTypeInfoList.subList(0, dataColumnCount); @@ -778,11 +820,15 @@ private boolean validateInputFormatAndSchemaEvolution(MapWork mapWork, String al vectorPartDesc.setDataTypeInfos(nextDataTypeInfoList); } - vectorTaskColumnInfo.setColumnNames(allColumnNameList); - vectorTaskColumnInfo.setTypeInfos(allTypeInfoList); + vectorTaskColumnInfo.setAllColumnNames(allColumnNameList); + vectorTaskColumnInfo.setAllTypeInfos(allTypeInfoList); + vectorTaskColumnInfo.setDataColumnNums(dataColumnNums); vectorTaskColumnInfo.setPartitionColumnCount(partitionColumnCount); vectorTaskColumnInfo.setUseVectorizedInputFileFormat(useVectorizedInputFileFormat); + // Helps to keep this for debugging. 
+ vectorTaskColumnInfo.setTableScanOperator(tableScanOperator); + return true; } @@ -903,8 +949,8 @@ private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork, throw new SemanticException(e); } - vectorTaskColumnInfo.setColumnNames(reduceColumnNames); - vectorTaskColumnInfo.setTypeInfos(reduceTypeInfos); + vectorTaskColumnInfo.setAllColumnNames(reduceColumnNames); + vectorTaskColumnInfo.setAllTypeInfos(reduceTypeInfos); return true; } @@ -1241,9 +1287,9 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, boolean saveRootVectorOp = false; if (op.getParentOperators().size() == 0) { - LOG.info("ReduceWorkVectorizationNodeProcessor process reduceColumnNames " + vectorTaskColumnInfo.columnNames.toString()); + LOG.info("ReduceWorkVectorizationNodeProcessor process reduceColumnNames " + vectorTaskColumnInfo.allColumnNames.toString()); - vContext = new VectorizationContext("__Reduce_Shuffle__", vectorTaskColumnInfo.columnNames); + vContext = new VectorizationContext("__Reduce_Shuffle__", vectorTaskColumnInfo.allColumnNames); taskVectorizationContext = vContext; saveRootVectorOp = true; @@ -1894,7 +1940,11 @@ private boolean validateDataType(String type, VectorExpressionDescriptor.Mode mo private VectorizationContext getVectorizationContext(String contextName, VectorTaskColumnInfo vectorTaskColumnInfo) { - VectorizationContext vContext = new VectorizationContext(contextName, vectorTaskColumnInfo.columnNames); + final int dataColumnCount = + vectorTaskColumnInfo.allColumnNames.size() - vectorTaskColumnInfo.partitionColumnCount; + VectorizationContext vContext = + new VectorizationContext(contextName, vectorTaskColumnInfo.allColumnNames, + dataColumnCount, vectorTaskColumnInfo.dataColumnNums); return vContext; } @@ -2459,12 +2509,12 @@ public void debugDisplayAllMaps(BaseWork work) { VectorizedRowBatchCtx vectorizedRowBatchCtx = work.getVectorizedRowBatchCtx(); - String[] columnNames = vectorizedRowBatchCtx.getRowColumnNames(); + String[] allColumnNames = vectorizedRowBatchCtx.getRowColumnNames(); Object columnTypeInfos = vectorizedRowBatchCtx.getRowColumnTypeInfos(); int partitionColumnCount = vectorizedRowBatchCtx.getPartitionColumnCount(); String[] scratchColumnTypeNames =vectorizedRowBatchCtx.getScratchColumnTypeNames(); - LOG.debug("debugDisplayAllMaps columnNames " + Arrays.toString(columnNames)); + LOG.debug("debugDisplayAllMaps allColumnNames " + Arrays.toString(allColumnNames)); LOG.debug("debugDisplayAllMaps columnTypeInfos " + Arrays.deepToString((Object[]) columnTypeInfos)); LOG.debug("debugDisplayAllMaps partitionColumnCount " + partitionColumnCount); LOG.debug("debugDisplayAllMaps scratchColumnTypeNames " + Arrays.toString(scratchColumnTypeNames)); diff --git ql/src/test/queries/clientpositive/vector_include_no_sel.q ql/src/test/queries/clientpositive/vector_include_no_sel.q new file mode 100644 index 0000000..b78b149 --- /dev/null +++ ql/src/test/queries/clientpositive/vector_include_no_sel.q @@ -0,0 +1,78 @@ +set hive.explain.user=false; +SET hive.vectorized.execution.enabled=true; +SET hive.vectorized.execution.reducesink.new.enabled=false; +set hive.cbo.enable=false; +SET hive.auto.convert.join=true; +SET hive.auto.convert.join.noconditionaltask=true; +SET hive.auto.convert.join.noconditionaltask.size=1000000000; + +-- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. 
+-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. + +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile; + +LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt; + +create table store_sales stored as orc as select * from store_sales_txt; + + +create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile; + +LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt; + +create table customer_demographics stored as orc as select * from customer_demographics_txt; + +explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')); + +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')); \ No newline at end of file diff --git ql/src/test/results/clientpositive/llap/vector_include_no_sel.q.out ql/src/test/results/clientpositive/llap/vector_include_no_sel.q.out new file mode 100644 index 0000000..e939c67 --- /dev/null +++ ql/src/test/results/clientpositive/llap/vector_include_no_sel.q.out @@ -0,0 +1,286 @@ +PREHOOK: query: -- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. +-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. 
+ +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales_txt +POSTHOOK: query: -- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. +-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. + +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales_txt +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@store_sales_txt +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@store_sales_txt +PREHOOK: query: create table store_sales stored as orc as select * from store_sales_txt +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@store_sales_txt +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales +POSTHOOK: query: create table store_sales stored as orc as select * from store_sales_txt +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@store_sales_txt +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales +POSTHOOK: Lineage: store_sales.ss_addr_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_addr_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_cdemo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_cdemo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_coupon_amt SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_coupon_amt, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_customer_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_customer_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_discount_amt SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_discount_amt, type:float, comment:null), ] 
+POSTHOOK: Lineage: store_sales.ss_ext_list_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_list_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_sales_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_sales_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_tax SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_tax, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_wholesale_cost SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_wholesale_cost, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_hdemo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_hdemo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_item_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_item_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_list_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_list_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_paid SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_paid, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_paid_inc_tax SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_paid_inc_tax, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_profit SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_profit, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_promo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_promo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_quantity SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_quantity, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sales_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sales_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sold_date_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sold_date_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sold_time_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sold_time_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_store_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_store_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ticket_number SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ticket_number, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_wholesale_cost SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_wholesale_cost, type:float, comment:null), ] +PREHOOK: query: create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@customer_demographics_txt +POSTHOOK: query: create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: 
Output: database:default +POSTHOOK: Output: default@customer_demographics_txt +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@customer_demographics_txt +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@customer_demographics_txt +PREHOOK: query: create table customer_demographics stored as orc as select * from customer_demographics_txt +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@customer_demographics_txt +PREHOOK: Output: database:default +PREHOOK: Output: default@customer_demographics +POSTHOOK: query: create table customer_demographics stored as orc as select * from customer_demographics_txt +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@customer_demographics_txt +POSTHOOK: Output: database:default +POSTHOOK: Output: default@customer_demographics +POSTHOOK: Lineage: customer_demographics.cd_credit_rating SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_credit_rating, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_demo_sk SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_demo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_college_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_college_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_employed_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_employed_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_education_status SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_education_status, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_gender SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_gender, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_marital_status SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_marital_status, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_purchase_estimate SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_purchase_estimate, type:int, comment:null), ] +Warning: Map Join MAPJOIN[13][bigTable=store_sales] in task 'Map 2' is a cross product +PREHOOK: query: explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +POSTHOOK: type: QUERY 
+STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 2 <- Map 1 (BROADCAST_EDGE) + Reducer 3 <- Map 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: customer_demographics + Statistics: Num rows: 200 Data size: 74200 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 200 Data size: 74200 Basic stats: COMPLETE Column stats: NONE + value expressions: cd_demo_sk (type: int), cd_marital_status (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 2 + Map Operator Tree: + TableScan + alias: store_sales + Statistics: Num rows: 1000 Data size: 88276 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col2, _col16 + input vertices: + 0 Map 1 + Statistics: Num rows: 200000 Data size: 92055200 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (((_col0 = _col16) and (_col2 = 'M')) or ((_col0 = _col16) and (_col2 = 'U'))) (type: boolean) + Statistics: Num rows: 100000 Data size: 46027600 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 100000 Data size: 46027600 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Map Join MAPJOIN[13][bigTable=store_sales] in task 'Map 2' is a cross product +PREHOOK: query: select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +PREHOOK: type: QUERY +PREHOOK: Input: default@customer_demographics +PREHOOK: Input: default@store_sales +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@customer_demographics +POSTHOOK: Input: default@store_sales +#### A masked pattern was here #### +0 diff --git ql/src/test/results/clientpositive/tez/vector_include_no_sel.q.out 
ql/src/test/results/clientpositive/tez/vector_include_no_sel.q.out new file mode 100644 index 0000000..be991b2 --- /dev/null +++ ql/src/test/results/clientpositive/tez/vector_include_no_sel.q.out @@ -0,0 +1,284 @@ +PREHOOK: query: -- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. +-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. + +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales_txt +POSTHOOK: query: -- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. +-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. 
+ +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales_txt +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@store_sales_txt +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@store_sales_txt +PREHOOK: query: create table store_sales stored as orc as select * from store_sales_txt +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@store_sales_txt +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales +POSTHOOK: query: create table store_sales stored as orc as select * from store_sales_txt +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@store_sales_txt +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales +POSTHOOK: Lineage: store_sales.ss_addr_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_addr_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_cdemo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_cdemo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_coupon_amt SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_coupon_amt, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_customer_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_customer_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_discount_amt SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_discount_amt, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_list_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_list_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_sales_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_sales_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_tax SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_tax, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_wholesale_cost SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_wholesale_cost, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_hdemo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_hdemo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_item_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_item_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_list_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_list_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_paid SIMPLE 
[(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_paid, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_paid_inc_tax SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_paid_inc_tax, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_profit SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_profit, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_promo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_promo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_quantity SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_quantity, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sales_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sales_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sold_date_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sold_date_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sold_time_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sold_time_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_store_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_store_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ticket_number SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ticket_number, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_wholesale_cost SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_wholesale_cost, type:float, comment:null), ] +PREHOOK: query: create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@customer_demographics_txt +POSTHOOK: query: create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@customer_demographics_txt +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@customer_demographics_txt +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@customer_demographics_txt +PREHOOK: query: create table customer_demographics stored as orc as select * from customer_demographics_txt +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@customer_demographics_txt +PREHOOK: Output: database:default +PREHOOK: Output: default@customer_demographics +POSTHOOK: query: create table customer_demographics stored as orc as select * from customer_demographics_txt +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@customer_demographics_txt +POSTHOOK: Output: database:default +POSTHOOK: Output: 
default@customer_demographics +POSTHOOK: Lineage: customer_demographics.cd_credit_rating SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_credit_rating, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_demo_sk SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_demo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_college_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_college_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_employed_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_employed_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_education_status SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_education_status, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_gender SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_gender, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_marital_status SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_marital_status, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_purchase_estimate SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_purchase_estimate, type:int, comment:null), ] +Warning: Map Join MAPJOIN[13][bigTable=store_sales] in task 'Map 2' is a cross product +PREHOOK: query: explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 2 <- Map 1 (BROADCAST_EDGE) + Reducer 3 <- Map 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: customer_demographics + Statistics: Num rows: 200 Data size: 74200 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 200 Data size: 74200 Basic stats: COMPLETE Column stats: NONE + value expressions: cd_demo_sk (type: int), cd_marital_status (type: string) + Execution mode: vectorized + Map 2 + Map Operator Tree: + TableScan + alias: store_sales + Statistics: Num rows: 1000 Data size: 88276 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col2, _col16 + input vertices: + 0 Map 1 + Statistics: Num rows: 200000 Data size: 92055200 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (((_col0 = 
_col16) and (_col2 = 'M')) or ((_col0 = _col16) and (_col2 = 'U'))) (type: boolean) + Statistics: Num rows: 100000 Data size: 46027600 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 100000 Data size: 46027600 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Reducer 3 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Map Join MAPJOIN[13][bigTable=store_sales] in task 'Map 2' is a cross product +PREHOOK: query: select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +PREHOOK: type: QUERY +PREHOOK: Input: default@customer_demographics +PREHOOK: Input: default@store_sales +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@customer_demographics +POSTHOOK: Input: default@store_sales +#### A masked pattern was here #### +0 diff --git ql/src/test/results/clientpositive/vector_include_no_sel.q.out ql/src/test/results/clientpositive/vector_include_no_sel.q.out new file mode 100644 index 0000000..697d422 --- /dev/null +++ ql/src/test/results/clientpositive/vector_include_no_sel.q.out @@ -0,0 +1,282 @@ +PREHOOK: query: -- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. +-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. 
+ +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales_txt +POSTHOOK: query: -- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. +-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. + +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales_txt +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@store_sales_txt +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@store_sales_txt +PREHOOK: query: create table store_sales stored as orc as select * from store_sales_txt +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@store_sales_txt +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales +POSTHOOK: query: create table store_sales stored as orc as select * from store_sales_txt +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@store_sales_txt +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales +POSTHOOK: Lineage: store_sales.ss_addr_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_addr_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_cdemo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_cdemo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_coupon_amt SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_coupon_amt, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_customer_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_customer_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_discount_amt SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_discount_amt, type:float, comment:null), ] 
+POSTHOOK: Lineage: store_sales.ss_ext_list_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_list_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_sales_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_sales_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_tax SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_tax, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_wholesale_cost SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_wholesale_cost, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_hdemo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_hdemo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_item_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_item_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_list_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_list_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_paid SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_paid, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_paid_inc_tax SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_paid_inc_tax, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_profit SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_profit, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_promo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_promo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_quantity SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_quantity, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sales_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sales_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sold_date_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sold_date_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sold_time_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sold_time_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_store_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_store_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ticket_number SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ticket_number, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_wholesale_cost SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_wholesale_cost, type:float, comment:null), ] +PREHOOK: query: create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@customer_demographics_txt +POSTHOOK: query: create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: 
Output: database:default +POSTHOOK: Output: default@customer_demographics_txt +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@customer_demographics_txt +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@customer_demographics_txt +PREHOOK: query: create table customer_demographics stored as orc as select * from customer_demographics_txt +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@customer_demographics_txt +PREHOOK: Output: database:default +PREHOOK: Output: default@customer_demographics +POSTHOOK: query: create table customer_demographics stored as orc as select * from customer_demographics_txt +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@customer_demographics_txt +POSTHOOK: Output: database:default +POSTHOOK: Output: default@customer_demographics +POSTHOOK: Lineage: customer_demographics.cd_credit_rating SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_credit_rating, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_demo_sk SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_demo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_college_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_college_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_employed_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_employed_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_education_status SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_education_status, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_gender SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_gender, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_marital_status SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_marital_status, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_purchase_estimate SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_purchase_estimate, type:int, comment:null), ] +Warning: Map Join MAPJOIN[15][bigTable=store_sales] in task 'Stage-2:MAPRED' is a cross product +PREHOOK: query: explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +POSTHOOK: type: 
QUERY +STAGE DEPENDENCIES: + Stage-5 is a root stage + Stage-2 depends on stages: Stage-5 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-5 + Map Reduce Local Work + Alias -> Map Local Tables: + customer_demographics + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + customer_demographics + TableScan + alias: customer_demographics + Statistics: Num rows: 200 Data size: 74200 Basic stats: COMPLETE Column stats: NONE + HashTable Sink Operator + keys: + 0 + 1 + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: store_sales + Statistics: Num rows: 1000 Data size: 88276 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col2, _col16 + Statistics: Num rows: 200000 Data size: 92055200 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (((_col0 = _col16) and (_col2 = 'M')) or ((_col0 = _col16) and (_col2 = 'U'))) (type: boolean) + Statistics: Num rows: 100000 Data size: 46027600 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 100000 Data size: 46027600 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Map Join MAPJOIN[15][bigTable=store_sales] in task 'Stage-2:MAPRED' is a cross product +PREHOOK: query: select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +PREHOOK: type: QUERY +PREHOOK: Input: default@customer_demographics +PREHOOK: Input: default@store_sales +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@customer_demographics +POSTHOOK: Input: default@store_sales +#### A masked pattern was here #### +0
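
All three plans above (LLAP, Tez, and plain MapReduce) flag the map join as a cross product because each OR'd conjunct carries its own copy of the cd_demo_sk = ss_cdemo_sk equality; the join is therefore planned with empty key lists (the blank "keys: 0 / 1" entries) and the whole predicate is evaluated as a post-join Filter. As a point of comparison only (not part of this patch), a minimal HiveQL sketch of an equivalent formulation that factors the shared equality out of the OR would let the planner use it as a real join key and avoid the cross product:

    -- Equivalent to the test query:
    --   ((a = b and c = 'M') or (a = b and c = 'U'))  ==  (a = b and c in ('M', 'U'))
    select count(1)
    from customer_demographics
    join store_sales
      on customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk
    where customer_demographics.cd_marital_status in ('M', 'U');

The test keeps the disjunctive form deliberately: with CBO disabled, the resulting plan has the small table's TableScan feeding a Reduce Output Operator (Tez/LLAP) or HashTable Sink (MapReduce) with no intervening Select, which is exactly the operator shape that previously made the Vectorizer keep columns that are not included, leading to the ReduceSink NPE described in HIVE-13872. The session settings below are assumptions about what the corresponding vector_include_no_sel.q file enables; the .q file itself is not shown in this diff, so treat this only as a reproduction sketch:

    -- Assumed settings; check the actual vector_include_no_sel.q for the exact list.
    set hive.cbo.enable=false;                   -- the issue only appears on the non-CBO path
    set hive.vectorized.execution.enabled=true;  -- exercise the Vectorizer
    set hive.auto.convert.join=true;             -- allow the broadcast/map joins seen in the plans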