diff --git data/files/customer_demographics.txt data/files/customer_demographics.txt
new file mode 100644
index 0000000..90ab999
--- /dev/null
+++ data/files/customer_demographics.txt
@@ -0,0 +1,200 @@
+1|M|M|Primary|500|Good|0|0|0|
+2|F|M|Primary|500|Good|0|0|0|
+3|M|S|Primary|500|Good|0|0|0|
+4|F|S|Primary|500|Good|0|0|0|
+5|M|D|Primary|500|Good|0|0|0|
+6|F|D|Primary|500|Good|0|0|0|
+7|M|W|Primary|500|Good|0|0|0|
+8|F|W|Primary|500|Good|0|0|0|
+9|M|U|Primary|500|Good|0|0|0|
+10|F|U|Primary|500|Good|0|0|0|
+11|M|M|Secondary|500|Good|0|0|0|
+12|F|M|Secondary|500|Good|0|0|0|
+13|M|S|Secondary|500|Good|0|0|0|
+14|F|S|Secondary|500|Good|0|0|0|
+15|M|D|Secondary|500|Good|0|0|0|
+16|F|D|Secondary|500|Good|0|0|0|
+17|M|W|Secondary|500|Good|0|0|0|
+18|F|W|Secondary|500|Good|0|0|0|
+19|M|U|Secondary|500|Good|0|0|0|
+20|F|U|Secondary|500|Good|0|0|0|
+21|M|M|College|500|Good|0|0|0|
+22|F|M|College|500|Good|0|0|0|
+23|M|S|College|500|Good|0|0|0|
+24|F|S|College|500|Good|0|0|0|
+25|M|D|College|500|Good|0|0|0|
+26|F|D|College|500|Good|0|0|0|
+27|M|W|College|500|Good|0|0|0|
+28|F|W|College|500|Good|0|0|0|
+29|M|U|College|500|Good|0|0|0|
+30|F|U|College|500|Good|0|0|0|
+31|M|M|2 yr Degree|500|Good|0|0|0|
+32|F|M|2 yr Degree|500|Good|0|0|0|
+33|M|S|2 yr Degree|500|Good|0|0|0|
+34|F|S|2 yr Degree|500|Good|0|0|0|
+35|M|D|2 yr Degree|500|Good|0|0|0|
+36|F|D|2 yr Degree|500|Good|0|0|0|
+37|M|W|2 yr Degree|500|Good|0|0|0|
+38|F|W|2 yr Degree|500|Good|0|0|0|
+39|M|U|2 yr Degree|500|Good|0|0|0|
+40|F|U|2 yr Degree|500|Good|0|0|0|
+41|M|M|4 yr Degree|500|Good|0|0|0|
+42|F|M|4 yr Degree|500|Good|0|0|0|
+43|M|S|4 yr Degree|500|Good|0|0|0|
+44|F|S|4 yr Degree|500|Good|0|0|0|
+45|M|D|4 yr Degree|500|Good|0|0|0|
+46|F|D|4 yr Degree|500|Good|0|0|0|
+47|M|W|4 yr Degree|500|Good|0|0|0|
+48|F|W|4 yr Degree|500|Good|0|0|0|
+49|M|U|4 yr Degree|500|Good|0|0|0|
+50|F|U|4 yr Degree|500|Good|0|0|0|
+51|M|M|Advanced Degree|500|Good|0|0|0|
+52|F|M|Advanced Degree|500|Good|0|0|0|
+53|M|S|Advanced Degree|500|Good|0|0|0|
+54|F|S|Advanced Degree|500|Good|0|0|0|
+55|M|D|Advanced Degree|500|Good|0|0|0|
+56|F|D|Advanced Degree|500|Good|0|0|0|
+57|M|W|Advanced Degree|500|Good|0|0|0|
+58|F|W|Advanced Degree|500|Good|0|0|0|
+59|M|U|Advanced Degree|500|Good|0|0|0|
+60|F|U|Advanced Degree|500|Good|0|0|0|
+61|M|M|Unknown|500|Good|0|0|0|
+62|F|M|Unknown|500|Good|0|0|0|
+63|M|S|Unknown|500|Good|0|0|0|
+64|F|S|Unknown|500|Good|0|0|0|
+65|M|D|Unknown|500|Good|0|0|0|
+66|F|D|Unknown|500|Good|0|0|0|
+67|M|W|Unknown|500|Good|0|0|0|
+68|F|W|Unknown|500|Good|0|0|0|
+69|M|U|Unknown|500|Good|0|0|0|
+70|F|U|Unknown|500|Good|0|0|0|
+71|M|M|Primary|1000|Good|0|0|0|
+72|F|M|Primary|1000|Good|0|0|0|
+73|M|S|Primary|1000|Good|0|0|0|
+74|F|S|Primary|1000|Good|0|0|0|
+75|M|D|Primary|1000|Good|0|0|0|
+76|F|D|Primary|1000|Good|0|0|0|
+77|M|W|Primary|1000|Good|0|0|0|
+78|F|W|Primary|1000|Good|0|0|0|
+79|M|U|Primary|1000|Good|0|0|0|
+80|F|U|Primary|1000|Good|0|0|0|
+81|M|M|Secondary|1000|Good|0|0|0|
+82|F|M|Secondary|1000|Good|0|0|0|
+83|M|S|Secondary|1000|Good|0|0|0|
+84|F|S|Secondary|1000|Good|0|0|0|
+85|M|D|Secondary|1000|Good|0|0|0|
+86|F|D|Secondary|1000|Good|0|0|0|
+87|M|W|Secondary|1000|Good|0|0|0|
+88|F|W|Secondary|1000|Good|0|0|0|
+89|M|U|Secondary|1000|Good|0|0|0|
+90|F|U|Secondary|1000|Good|0|0|0|
+91|M|M|College|1000|Good|0|0|0|
+92|F|M|College|1000|Good|0|0|0|
+93|M|S|College|1000|Good|0|0|0|
+94|F|S|College|1000|Good|0|0|0|
+95|M|D|College|1000|Good|0|0|0|
+96|F|D|College|1000|Good|0|0|0|
+97|M|W|College|1000|Good|0|0|0|
+98|F|W|College|1000|Good|0|0|0|
+99|M|U|College|1000|Good|0|0|0|
+100|F|U|College|1000|Good|0|0|0|
+101|M|M|2 yr Degree|1000|Good|0|0|0|
+102|F|M|2 yr Degree|1000|Good|0|0|0|
+103|M|S|2 yr Degree|1000|Good|0|0|0|
+104|F|S|2 yr Degree|1000|Good|0|0|0|
+105|M|D|2 yr Degree|1000|Good|0|0|0|
+106|F|D|2 yr Degree|1000|Good|0|0|0|
+107|M|W|2 yr Degree|1000|Good|0|0|0|
+108|F|W|2 yr Degree|1000|Good|0|0|0|
+109|M|U|2 yr Degree|1000|Good|0|0|0|
+110|F|U|2 yr Degree|1000|Good|0|0|0|
+111|M|M|4 yr Degree|1000|Good|0|0|0|
+112|F|M|4 yr Degree|1000|Good|0|0|0|
+113|M|S|4 yr Degree|1000|Good|0|0|0|
+114|F|S|4 yr Degree|1000|Good|0|0|0|
+115|M|D|4 yr Degree|1000|Good|0|0|0|
+116|F|D|4 yr Degree|1000|Good|0|0|0|
+117|M|W|4 yr Degree|1000|Good|0|0|0|
+118|F|W|4 yr Degree|1000|Good|0|0|0|
+119|M|U|4 yr Degree|1000|Good|0|0|0|
+120|F|U|4 yr Degree|1000|Good|0|0|0|
+121|M|M|Advanced Degree|1000|Good|0|0|0|
+122|F|M|Advanced Degree|1000|Good|0|0|0|
+123|M|S|Advanced Degree|1000|Good|0|0|0|
+124|F|S|Advanced Degree|1000|Good|0|0|0|
+125|M|D|Advanced Degree|1000|Good|0|0|0|
+126|F|D|Advanced Degree|1000|Good|0|0|0|
+127|M|W|Advanced Degree|1000|Good|0|0|0|
+128|F|W|Advanced Degree|1000|Good|0|0|0|
+129|M|U|Advanced Degree|1000|Good|0|0|0|
+130|F|U|Advanced Degree|1000|Good|0|0|0|
+131|M|M|Unknown|1000|Good|0|0|0|
+132|F|M|Unknown|1000|Good|0|0|0|
+133|M|S|Unknown|1000|Good|0|0|0|
+134|F|S|Unknown|1000|Good|0|0|0|
+135|M|D|Unknown|1000|Good|0|0|0|
+136|F|D|Unknown|1000|Good|0|0|0|
+137|M|W|Unknown|1000|Good|0|0|0|
+138|F|W|Unknown|1000|Good|0|0|0|
+139|M|U|Unknown|1000|Good|0|0|0|
+140|F|U|Unknown|1000|Good|0|0|0|
+141|M|M|Primary|1500|Good|0|0|0|
+142|F|M|Primary|1500|Good|0|0|0|
+143|M|S|Primary|1500|Good|0|0|0|
+144|F|S|Primary|1500|Good|0|0|0|
+145|M|D|Primary|1500|Good|0|0|0|
+146|F|D|Primary|1500|Good|0|0|0|
+147|M|W|Primary|1500|Good|0|0|0|
+148|F|W|Primary|1500|Good|0|0|0|
+149|M|U|Primary|1500|Good|0|0|0|
+150|F|U|Primary|1500|Good|0|0|0|
+151|M|M|Secondary|1500|Good|0|0|0|
+152|F|M|Secondary|1500|Good|0|0|0|
+153|M|S|Secondary|1500|Good|0|0|0|
+154|F|S|Secondary|1500|Good|0|0|0|
+155|M|D|Secondary|1500|Good|0|0|0|
+156|F|D|Secondary|1500|Good|0|0|0|
+157|M|W|Secondary|1500|Good|0|0|0|
+158|F|W|Secondary|1500|Good|0|0|0|
+159|M|U|Secondary|1500|Good|0|0|0|
+160|F|U|Secondary|1500|Good|0|0|0|
+161|M|M|College|1500|Good|0|0|0|
+162|F|M|College|1500|Good|0|0|0|
+163|M|S|College|1500|Good|0|0|0|
+164|F|S|College|1500|Good|0|0|0|
+165|M|D|College|1500|Good|0|0|0|
+166|F|D|College|1500|Good|0|0|0|
+167|M|W|College|1500|Good|0|0|0|
+168|F|W|College|1500|Good|0|0|0|
+169|M|U|College|1500|Good|0|0|0|
+170|F|U|College|1500|Good|0|0|0|
+171|M|M|2 yr Degree|1500|Good|0|0|0|
+172|F|M|2 yr Degree|1500|Good|0|0|0|
+173|M|S|2 yr Degree|1500|Good|0|0|0|
+174|F|S|2 yr Degree|1500|Good|0|0|0|
+175|M|D|2 yr Degree|1500|Good|0|0|0|
+176|F|D|2 yr Degree|1500|Good|0|0|0|
+177|M|W|2 yr Degree|1500|Good|0|0|0|
+178|F|W|2 yr Degree|1500|Good|0|0|0|
+179|M|U|2 yr Degree|1500|Good|0|0|0|
+180|F|U|2 yr Degree|1500|Good|0|0|0|
+181|M|M|4 yr Degree|1500|Good|0|0|0|
+182|F|M|4 yr Degree|1500|Good|0|0|0|
+183|M|S|4 yr Degree|1500|Good|0|0|0|
+184|F|S|4 yr Degree|1500|Good|0|0|0|
+185|M|D|4 yr Degree|1500|Good|0|0|0|
+186|F|D|4 yr Degree|1500|Good|0|0|0|
+187|M|W|4 yr Degree|1500|Good|0|0|0|
+188|F|W|4 yr Degree|1500|Good|0|0|0|
+189|M|U|4 yr Degree|1500|Good|0|0|0|
+190|F|U|4 yr Degree|1500|Good|0|0|0|
+191|M|M|Advanced Degree|1500|Good|0|0|0|
+192|F|M|Advanced Degree|1500|Good|0|0|0|
+193|M|S|Advanced Degree|1500|Good|0|0|0|
+194|F|S|Advanced Degree|1500|Good|0|0|0| +195|M|D|Advanced Degree|1500|Good|0|0|0| +196|F|D|Advanced Degree|1500|Good|0|0|0| +197|M|W|Advanced Degree|1500|Good|0|0|0| +198|F|W|Advanced Degree|1500|Good|0|0|0| +199|M|U|Advanced Degree|1500|Good|0|0|0| +200|F|U|Advanced Degree|1500|Good|0|0|0| diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index 46350a3..6cd13a7 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -300,6 +300,7 @@ minitez.query.files.shared=acid_globallimit.q,\ vector_groupby_mapjoin.q,\ vector_groupby_reduce.q,\ vector_grouping_sets.q,\ + vector_include_no_sel.q,\ vector_if_expr.q,\ vector_inner_join.q,\ vector_interval_1.q,\ @@ -515,6 +516,7 @@ minillap.shared.query.files=bucket_map_join_tez1.q,\ tez_union_group_by.q,\ tez_smb_main.q,\ tez_smb_1.q,\ + vector_include_no_sel.q,\ vector_join_part_col_char.q,\ vectorized_dynamic_partition_pruning.q,\ tez_multi_union.q,\ diff --git llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java index 298f788..9798170 100644 --- llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java +++ llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java @@ -131,7 +131,6 @@ private final SearchArgument sarg; private final String[] columnNames; private final VectorizedRowBatchCtx rbCtx; - private final boolean[] columnsToIncludeTruncated; private final Object[] partitionValues; private final LinkedList pendingData = new LinkedList(); @@ -173,8 +172,6 @@ public LlapRecordReader( MapWork mapWork = Utilities.getMapWork(job); rbCtx = mapWork.getVectorizedRowBatchCtx(); - columnsToIncludeTruncated = rbCtx.getColumnsToIncludeTruncated(job); - int partitionColumnCount = rbCtx.getPartitionColumnCount(); if (partitionColumnCount > 0) { partitionValues = new Object[partitionColumnCount]; @@ -294,7 +291,7 @@ public NullWritable createKey() { @Override public VectorizedRowBatch createValue() { - return rbCtx.createVectorizedRowBatch(columnsToIncludeTruncated); + return rbCtx.createVectorizedRowBatch(); } @Override diff --git orc/src/java/org/apache/orc/impl/TreeReaderFactory.java orc/src/java/org/apache/orc/impl/TreeReaderFactory.java index 5901c8c..ba175a3 100644 --- orc/src/java/org/apache/orc/impl/TreeReaderFactory.java +++ orc/src/java/org/apache/orc/impl/TreeReaderFactory.java @@ -1732,9 +1732,12 @@ public void nextBatch(VectorizedRowBatch batch, int batchSize) throws IOException { for(int i=0; i < fields.length && (vectorColumnCount == -1 || i < vectorColumnCount); ++i) { - batch.cols[i].reset(); - batch.cols[i].ensureSize((int) batchSize, false); - fields[i].nextVector(batch.cols[i], null, batchSize); + ColumnVector colVector = batch.cols[i]; + if (colVector != null) { + batch.cols[i].reset(); + batch.cols[i].ensureSize((int) batchSize, false); + fields[i].nextVector(batch.cols[i], null, batchSize); + } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java index 894ef59..9c84937 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java @@ -335,11 +335,6 @@ public int initConversion(TypeInfo[] sourceTypeInfos, TypeInfo[] targetTypeInfos */ public void 
assignRowColumn(VectorizedRowBatch batch, int batchIndex, int logicalColumnIndex, Object object) { - final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; - if (object == null) { - VectorizedBatchUtil.setNullColIsNullValue(batch.cols[projectionColumnNum], batchIndex); - return; - } Category targetCategory = targetCategories[logicalColumnIndex]; if (targetCategory == null) { /* @@ -347,6 +342,11 @@ public void assignRowColumn(VectorizedRowBatch batch, int batchIndex, int logica */ return; } + final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(batch.cols[projectionColumnNum], batchIndex); + return; + } switch (targetCategory) { case PRIMITIVE: { @@ -493,19 +493,19 @@ public void assignRowColumn(VectorizedRowBatch batch, int batchIndex, int logica public void assignConvertRowColumn(VectorizedRowBatch batch, int batchIndex, int logicalColumnIndex, Object object) { Preconditions.checkState(isConvert[logicalColumnIndex]); + Category targetCategory = targetCategories[logicalColumnIndex]; + if (targetCategory == null) { + /* + * This is a column that we don't want (i.e. not included) -- we are done. + */ + return; + } final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; if (object == null) { VectorizedBatchUtil.setNullColIsNullValue(batch.cols[projectionColumnNum], batchIndex); return; } try { - Category targetCategory = targetCategories[logicalColumnIndex]; - if (targetCategory == null) { - /* - * This is a column that we don't want (i.e. not included) -- we are done. - */ - return; - } switch (targetCategory) { case PRIMITIVE: PrimitiveCategory targetPrimitiveCategory = targetPrimitiveCategories[logicalColumnIndex]; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java index b7b5ae8..e6dc9ec 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java @@ -183,8 +183,8 @@ public Object extractRowColumn(VectorizedRowBatch batch, int batchIndex, int log final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; ColumnVector colVector = batch.cols[projectionColumnNum]; if (colVector == null) { - // In rare cases, the planner will not include columns for reading but other parts of - // execution will ask for but not use them.. + // The planner will not include unneeded columns for reading but other parts of execution + // may ask for them.. return null; } int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java index 6979956..56a0986 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java @@ -100,6 +100,10 @@ private transient TypeInfo[] tableRowTypeInfos; + private transient int[] initialProjection; + + private transient StandardStructObjectInspector neededStandardStructObjectInspector; + private transient VectorizedRowBatchCtx batchContext; // The context for creating the VectorizedRowBatch for this Map node that // the Vectorizer class determined. 
@@ -122,7 +126,7 @@ private transient int partitionColumnCount; private transient Object[] partitionValues; - private transient boolean[] columnsToIncludeTruncated; + private transient boolean[] dataColumnsToIncludeTruncated; /* * The following members have context information for the current partition file being read. @@ -264,7 +268,7 @@ public void init(Configuration hconf) // Initialize with data row type conversion parameters. readerColumnCount = - vectorDeserializeRow.initConversion(tableRowTypeInfos, columnsToIncludeTruncated); + vectorDeserializeRow.initConversion(tableRowTypeInfos, dataColumnsToIncludeTruncated); deserializeRead = lazySimpleDeserializeRead; } @@ -280,7 +284,7 @@ public void init(Configuration hconf) // Initialize with data row type conversion parameters. readerColumnCount = - vectorDeserializeRow.initConversion(tableRowTypeInfos, columnsToIncludeTruncated); + vectorDeserializeRow.initConversion(tableRowTypeInfos, dataColumnsToIncludeTruncated); deserializeRead = lazyBinaryDeserializeRead; } @@ -351,7 +355,7 @@ public void init(Configuration hconf) // Initialize with data type conversion parameters. readerColumnCount = - vectorAssign.initConversion(dataTypeInfos, tableRowTypeInfos, columnsToIncludeTruncated); + vectorAssign.initConversion(dataTypeInfos, tableRowTypeInfos, dataColumnsToIncludeTruncated); } } @@ -396,40 +400,29 @@ public VectorPartitionContext createAndInitPartitionContext(PartitionDesc partDe return vectorPartitionContext; } - private void determineColumnsToInclude(Configuration hconf) { - - columnsToIncludeTruncated = null; + private void determineDataColumnsToIncludeTruncated() { - List columnsToIncludeTruncatedList = ColumnProjectionUtils.getReadColumnIDs(hconf); - if (columnsToIncludeTruncatedList != null && - columnsToIncludeTruncatedList.size() > 0 && columnsToIncludeTruncatedList.size() < dataColumnCount ) { + Preconditions.checkState(batchContext != null); - // Partitioned columns will not be in the include list. + boolean[] columnsToInclude = new boolean[dataColumnCount];; - boolean[] columnsToInclude = new boolean[dataColumnCount]; - Arrays.fill(columnsToInclude, false); - for (int columnNum : columnsToIncludeTruncatedList) { - columnsToInclude[columnNum] = true; + int[] initialProjection = batchContext.getInitialProjection(); + Preconditions.checkState(initialProjection != null); + final int count = initialProjection.length; + int lastColumnNum = -1; + for (int i = 0; i < count; i++) { + int columnNum = initialProjection[i]; + if (columnNum >= dataColumnCount) { + break; } + columnsToInclude[columnNum] = true; + lastColumnNum = columnNum; + } - // Work backwards to find the highest wanted column. - - int highestWantedColumnNum = -1; - for (int i = dataColumnCount - 1; i >= 0; i--) { - if (columnsToInclude[i]) { - highestWantedColumnNum = i; - break; - } - } - if (highestWantedColumnNum == -1) { - throw new RuntimeException("No columns to include?"); - } - int newColumnCount = highestWantedColumnNum + 1; - if (newColumnCount == dataColumnCount) { - columnsToIncludeTruncated = columnsToInclude; - } else { - columnsToIncludeTruncated = Arrays.copyOf(columnsToInclude, newColumnCount); - } + if (lastColumnNum == -1) { + dataColumnsToIncludeTruncated = new boolean[0]; + } else { + dataColumnsToIncludeTruncated = Arrays.copyOf(columnsToInclude, lastColumnNum + 1); } } @@ -479,23 +472,21 @@ private void internalSetChildren(Configuration hconf) throws Exception { // so set it here to none. 
currentReadType = VectorMapOperatorReadType.NONE; - determineColumnsToInclude(hconf); batchContext = conf.getVectorizedRowBatchCtx(); - /* * Use a different batch for vectorized Input File Format readers so they can do their work * overlapped with work of the row collection that vector/row deserialization does. This allows * the partitions to mix modes (e.g. for us to flush the previously batched rows on file change). */ vectorizedInputFileFormatBatch = - batchContext.createVectorizedRowBatch(columnsToIncludeTruncated); + batchContext.createVectorizedRowBatch(); conf.setVectorizedRowBatch(vectorizedInputFileFormatBatch); /* * This batch is used by vector/row deserializer readers. */ - deserializerBatch = batchContext.createVectorizedRowBatch(columnsToIncludeTruncated); + deserializerBatch = batchContext.createVectorizedRowBatch(); batchCounter = 0; @@ -503,13 +494,17 @@ private void internalSetChildren(Configuration hconf) throws Exception { partitionColumnCount = batchContext.getPartitionColumnCount(); partitionValues = new Object[partitionColumnCount]; + determineDataColumnsToIncludeTruncated(); + /* * Create table related objects */ + final String[] rowColumnNames = batchContext.getRowColumnNames(); + final TypeInfo[] rowColumnTypeInfos = batchContext.getRowColumnTypeInfos(); tableStructTypeInfo = TypeInfoFactory.getStructTypeInfo( - Arrays.asList(batchContext.getRowColumnNames()), - Arrays.asList(batchContext.getRowColumnTypeInfos())); + Arrays.asList(rowColumnNames), + Arrays.asList(rowColumnTypeInfos)); tableStandardStructObjectInspector = (StandardStructObjectInspector) TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(tableStructTypeInfo); @@ -554,13 +549,50 @@ private void internalSetChildren(Configuration hconf) throws Exception { children.add(oneRootOperator); setChildOperators(children); + + // For VectorizedRowBatch columns that are not needed (i.e. not included), we project them + // away here. We need to create a struct object inspector with needed data columns and all + // partition columns. + // + initialProjection = batchContext.getInitialProjection(); + Preconditions.checkState(initialProjection != null); + + final int neededColumnCount = initialProjection.length + partitionColumnCount; + List neededColumnNames = new ArrayList(neededColumnCount); + List neededTypeInfos = new ArrayList(neededColumnCount); + + final int count = initialProjection.length; + for (int i = 0; i < count; i++) { + int columnNum = initialProjection[i]; + neededColumnNames.add(rowColumnNames[columnNum]); + neededTypeInfos.add(rowColumnTypeInfos[columnNum]); + } + + // All partition columns. + final int endPartitionColumn = dataColumnCount + partitionColumnCount; + for (int i = dataColumnCount; i < endPartitionColumn; i++) { + neededColumnNames.add(rowColumnNames[i]); + neededTypeInfos.add(rowColumnTypeInfos[i]); + } + + TypeInfo neededStructTypeInfo = + TypeInfoFactory.getStructTypeInfo(neededColumnNames, neededTypeInfos); + neededStandardStructObjectInspector = + (StandardStructObjectInspector) + TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(neededStructTypeInfo); + + // Finally, project away unneeded columns. 
+ vectorizedInputFileFormatBatch.projectedColumns = Arrays.copyOf(initialProjection, initialProjection.length); + vectorizedInputFileFormatBatch.projectionSize = initialProjection.length; + deserializerBatch.projectedColumns = Arrays.copyOf(initialProjection, initialProjection.length); + deserializerBatch.projectionSize = initialProjection.length; } @Override public void initializeMapOperator(Configuration hconf) throws HiveException { super.initializeMapOperator(hconf); - oneRootOperator.initialize(hconf, new ObjectInspector[] {tableStandardStructObjectInspector}); + oneRootOperator.initialize(hconf, new ObjectInspector[] {neededStandardStructObjectInspector}); } public void initializeContexts() throws HiveException { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index c4f47e1..959fa0a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -186,6 +186,29 @@ public VectorizationContext(String contextName, List initialColumnNames) vMap = new VectorExpressionDescriptor(); } + public VectorizationContext(String contextName, List tableColumnNames, + List initialProjection) { + this.contextName = contextName; + level = 0; + initialColumnNames = tableColumnNames; + + final int count = initialProjection.size(); + projectionColumnNames = new ArrayList(count); + projectedColumns = initialProjection; + projectionColumnMap = new HashMap(count); + for (int i = 0; i < count; i++) { + int columnNum = initialProjection.get(i); + String columnName = tableColumnNames.get(columnNum); + projectionColumnNames.add(columnName); + projectionColumnMap.put(columnName, columnNum); + } + + int firstOutputColumnIndex = tableColumnNames.size(); + this.ocm = new OutputColumnManager(firstOutputColumnIndex); + this.firstOutputColumnIndex = firstOutputColumnIndex; + vMap = new VectorExpressionDescriptor(); + } + // Constructor to with the individual addInitialColumn method // followed by a call to finishedAddingInitialColumns. public VectorizationContext(String contextName) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java index 82a97e0..a563715 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java @@ -50,6 +50,8 @@ import org.apache.hadoop.mapred.FileSplit; import org.apache.hive.common.util.DateUtils; +import com.google.common.base.Preconditions; + /** * Context for Vectorized row batch. this class does eager deserialization of row data using serde * in the RecordReader layer. @@ -68,6 +70,7 @@ // It will be stored in MapWork and ReduceWork. 
private String[] rowColumnNames; private TypeInfo[] rowColumnTypeInfos; + private int[] initialProjection; private int dataColumnCount; private int partitionColumnCount; @@ -80,9 +83,10 @@ public VectorizedRowBatchCtx() { } public VectorizedRowBatchCtx(String[] rowColumnNames, TypeInfo[] rowColumnTypeInfos, - int partitionColumnCount, String[] scratchColumnTypeNames) { + int[] initialProjection, int partitionColumnCount, String[] scratchColumnTypeNames) { this.rowColumnNames = rowColumnNames; this.rowColumnTypeInfos = rowColumnTypeInfos; + this.initialProjection = initialProjection; this.partitionColumnCount = partitionColumnCount; this.scratchColumnTypeNames = scratchColumnTypeNames; @@ -97,6 +101,10 @@ public VectorizedRowBatchCtx(String[] rowColumnNames, TypeInfo[] rowColumnTypeIn return rowColumnTypeInfos; } + public int[] getInitialProjection() { + return initialProjection; + } + public int getDataColumnCount() { return dataColumnCount; } @@ -123,6 +131,7 @@ public void init(StructObjectInspector structObjectInspector, String[] scratchCo // Row column information. rowColumnNames = VectorizedBatchUtil.columnNamesFromStructObjectInspector(structObjectInspector); rowColumnTypeInfos = VectorizedBatchUtil.typeInfosFromStructObjectInspector(structObjectInspector); + initialProjection = null; partitionColumnCount = 0; dataColumnCount = rowColumnTypeInfos.length; @@ -185,44 +194,24 @@ public static void getPartitionValues(VectorizedRowBatchCtx vrbCtx, PartitionDes */ public VectorizedRowBatch createVectorizedRowBatch() { - int totalColumnCount = rowColumnTypeInfos.length + scratchColumnTypeNames.length; - VectorizedRowBatch result = new VectorizedRowBatch(totalColumnCount); - - LOG.info("createVectorizedRowBatch columnsToIncludeTruncated NONE"); - for (int i = 0; i < rowColumnTypeInfos.length; i++) { - TypeInfo typeInfo = rowColumnTypeInfos[i]; - result.cols[i] = VectorizedBatchUtil.createColumnVector(typeInfo); - } - - for (int i = 0; i < scratchColumnTypeNames.length; i++) { - String typeName = scratchColumnTypeNames[i]; - result.cols[rowColumnTypeInfos.length + i] = - VectorizedBatchUtil.createColumnVector(typeName); - } - - result.setPartitionInfo(dataColumnCount, partitionColumnCount); - - result.reset(); - return result; - } - - public VectorizedRowBatch createVectorizedRowBatch(boolean[] columnsToIncludeTruncated) - { - if (columnsToIncludeTruncated == null) { - return createVectorizedRowBatch(); - } - - LOG.info("createVectorizedRowBatch columnsToIncludeTruncated " + Arrays.toString(columnsToIncludeTruncated)); - int totalColumnCount = rowColumnTypeInfos.length + scratchColumnTypeNames.length; + final int dataAndPartColumnCount = rowColumnTypeInfos.length; + final int totalColumnCount = dataAndPartColumnCount + scratchColumnTypeNames.length; VectorizedRowBatch result = new VectorizedRowBatch(totalColumnCount); - for (int i = 0; i < dataColumnCount; i++) { - TypeInfo typeInfo = rowColumnTypeInfos[i]; - result.cols[i] = VectorizedBatchUtil.createColumnVector(typeInfo); - } - for (int i = dataColumnCount; i < dataColumnCount + partitionColumnCount; i++) { - TypeInfo typeInfo = rowColumnTypeInfos[i]; - result.cols[i] = VectorizedBatchUtil.createColumnVector(typeInfo); + if (initialProjection == null) { + // All data and partition columns. 
+ for (int i = 0; i < dataAndPartColumnCount; i++) { + TypeInfo typeInfo = rowColumnTypeInfos[i]; + result.cols[i] = VectorizedBatchUtil.createColumnVector(typeInfo); + } + } else { + // Only columns for initial projection (data and partition) will be present. + for (int i = 0; i < initialProjection.length; i++) { + int columnNum = initialProjection[i]; + Preconditions.checkState(columnNum < dataAndPartColumnCount); + TypeInfo typeInfo = rowColumnTypeInfos[columnNum]; + result.cols[columnNum] = VectorizedBatchUtil.createColumnVector(typeInfo); + } } for (int i = 0; i < scratchColumnTypeNames.length; i++) { @@ -237,45 +226,6 @@ public VectorizedRowBatch createVectorizedRowBatch(boolean[] columnsToIncludeTru return result; } - public boolean[] getColumnsToIncludeTruncated(Configuration conf) { - boolean[] columnsToIncludeTruncated = null; - - List columnsToIncludeTruncatedList = ColumnProjectionUtils.getReadColumnIDs(conf); - if (columnsToIncludeTruncatedList != null && columnsToIncludeTruncatedList.size() > 0 ) { - - // Partitioned columns will not be in the include list. - - boolean[] columnsToInclude = new boolean[dataColumnCount]; - Arrays.fill(columnsToInclude, false); - for (int columnNum : columnsToIncludeTruncatedList) { - if (columnNum < dataColumnCount) { - columnsToInclude[columnNum] = true; - } - } - - // Work backwards to find the highest wanted column. - - int highestWantedColumnNum = -1; - for (int i = dataColumnCount - 1; i >= 0; i--) { - if (columnsToInclude[i]) { - highestWantedColumnNum = i; - break; - } - } - if (highestWantedColumnNum == -1) { - throw new RuntimeException("No columns to include?"); - } - int newColumnCount = highestWantedColumnNum + 1; - if (newColumnCount == dataColumnCount) { - // Didn't trim any columns off the end. Use the original. - columnsToIncludeTruncated = columnsToInclude; - } else { - columnsToIncludeTruncated = Arrays.copyOf(columnsToInclude, newColumnCount); - } - } - return columnsToIncludeTruncated; - } - /** * Add the partition values to the batch * @@ -299,72 +249,72 @@ public void addPartitionColsToBatch(VectorizedRowBatch batch, Object[] partition lcv.noNulls = false; lcv.isNull[0] = true; lcv.isRepeating = true; - } else { + } else { lcv.fill((Boolean) value == true ? 
1 : 0); lcv.isNull[0] = false; } } - break; - + break; + case BYTE: { LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex]; if (value == null) { lcv.noNulls = false; lcv.isNull[0] = true; lcv.isRepeating = true; - } else { + } else { lcv.fill((Byte) value); lcv.isNull[0] = false; } } - break; - + break; + case SHORT: { LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex]; if (value == null) { lcv.noNulls = false; lcv.isNull[0] = true; lcv.isRepeating = true; - } else { + } else { lcv.fill((Short) value); lcv.isNull[0] = false; } } break; - + case INT: { LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex]; if (value == null) { lcv.noNulls = false; lcv.isNull[0] = true; lcv.isRepeating = true; - } else { + } else { lcv.fill((Integer) value); lcv.isNull[0] = false; - } + } } break; - + case LONG: { LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex]; if (value == null) { lcv.noNulls = false; lcv.isNull[0] = true; lcv.isRepeating = true; - } else { + } else { lcv.fill((Long) value); lcv.isNull[0] = false; - } + } } break; - + case DATE: { LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex]; if (value == null) { lcv.noNulls = false; lcv.isNull[0] = true; lcv.isRepeating = true; - } else { + } else { lcv.fill(DateWritable.dateToDays((Date) value)); lcv.isNull[0] = false; } @@ -417,10 +367,10 @@ public void addPartitionColsToBatch(VectorizedRowBatch batch, Object[] partition } else { dcv.fill((Float) value); dcv.isNull[0] = false; - } + } } break; - + case DOUBLE: { DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[colIndex]; if (value == null) { @@ -433,7 +383,7 @@ public void addPartitionColsToBatch(VectorizedRowBatch batch, Object[] partition } } break; - + case DECIMAL: { DecimalColumnVector dv = (DecimalColumnVector) batch.cols[colIndex]; if (value == null) { diff --git ql/src/java/org/apache/hadoop/hive/ql/io/NullRowsInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/NullRowsInputFormat.java index 80858a9..bca5096 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/NullRowsInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/NullRowsInputFormat.java @@ -67,7 +67,6 @@ public DummyInputSplit(String path) { private int counter; protected final VectorizedRowBatchCtx rbCtx; - private final boolean[] columnsToIncludeTruncated; private final Object[] partitionValues; private boolean addPartitionCols = true; @@ -78,7 +77,6 @@ public NullRowsRecordReader(Configuration conf, InputSplit split) throws IOExcep } if (isVectorMode) { rbCtx = Utilities.getVectorizedRowBatchCtx(conf); - columnsToIncludeTruncated = rbCtx.getColumnsToIncludeTruncated(conf); int partitionColumnCount = rbCtx.getPartitionColumnCount(); if (partitionColumnCount > 0) { partitionValues = new Object[partitionColumnCount]; @@ -88,7 +86,6 @@ public NullRowsRecordReader(Configuration conf, InputSplit split) throws IOExcep } } else { rbCtx = null; - columnsToIncludeTruncated = null; partitionValues = null; } } @@ -105,7 +102,7 @@ public NullWritable createKey() { @Override public Object createValue() { return rbCtx == null ? 
NullWritable.get() : - rbCtx.createVectorizedRowBatch(columnsToIncludeTruncated); + rbCtx.createVectorizedRowBatch(); } @Override @@ -143,11 +140,10 @@ protected void makeNullVrb(Object value, int size) { vrb.size = size; vrb.selectedInUse = false; for (int i = 0; i < rbCtx.getDataColumnCount(); i++) { - if (columnsToIncludeTruncated != null - && (columnsToIncludeTruncated.length <= i || !columnsToIncludeTruncated[i])) { + ColumnVector cv = vrb.cols[i]; + if (cv == null) { continue; } - ColumnVector cv = vrb.cols[i]; cv.noNulls = false; cv.isRepeating = true; cv.isNull[0] = true; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java index e4d2e6e..214b84f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java @@ -59,7 +59,6 @@ private final long length; private float progress = 0.0f; private VectorizedRowBatchCtx rbCtx; - private final boolean[] columnsToIncludeTruncated; private final Object[] partitionValues; private boolean addPartitionCols = true; @@ -102,8 +101,6 @@ this.reader = file.rowsOptions(options); - columnsToIncludeTruncated = rbCtx.getColumnsToIncludeTruncated(conf); - int partitionColumnCount = rbCtx.getPartitionColumnCount(); if (partitionColumnCount > 0) { partitionValues = new Object[partitionColumnCount]; @@ -145,7 +142,7 @@ public NullWritable createKey() { @Override public VectorizedRowBatch createValue() { - return rbCtx.createVectorizedRowBatch(columnsToIncludeTruncated); + return rbCtx.createVectorizedRowBatch(); } @Override diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 00203ae..cd2ff2d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -34,6 +34,7 @@ import java.util.regex.Pattern; import org.apache.calcite.util.Pair; +import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -92,6 +93,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.GroupByDesc; import org.apache.hadoop.hive.ql.plan.JoinDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; @@ -160,6 +162,7 @@ import org.apache.hadoop.hive.ql.udf.UDFYear; import org.apache.hadoop.hive.ql.udf.generic.*; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.hive.serde2.Deserializer; import org.apache.hadoop.hive.serde2.NullStructSerDe; import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; @@ -354,6 +357,8 @@ public Vectorizer() { private class VectorTaskColumnInfo { List columnNames; List typeInfos; + List initialProjection; + int partitionColumnCount; boolean useVectorizedInputFileFormat; @@ -361,6 +366,8 @@ public Vectorizer() { Set> nonVectorizedOps; + TableScanOperator tableScanOperator; + VectorTaskColumnInfo() { partitionColumnCount = 0; } @@ -371,6 +378,9 @@ public void setColumnNames(List columnNames) { public void setTypeInfos(List typeInfos) { this.typeInfos = 
typeInfos; } + public void setInitialProjection(List initialProjection) { + this.initialProjection = initialProjection; + } public void setPartitionColumnCount(int partitionColumnCount) { this.partitionColumnCount = partitionColumnCount; } @@ -383,6 +393,9 @@ public void setUseVectorizedInputFileFormat(boolean useVectorizedInputFileFormat public void setNonVectorizedOps(Set> nonVectorizedOps) { this.nonVectorizedOps = nonVectorizedOps; } + public void setTableScanOperator(TableScanOperator tableScanOperator) { + this.tableScanOperator = tableScanOperator; + } public Set> getNonVectorizedOps() { return nonVectorizedOps; @@ -392,11 +405,18 @@ public void transferToBaseWork(BaseWork baseWork) { String[] columnNameArray = columnNames.toArray(new String[0]); TypeInfo[] typeInfoArray = typeInfos.toArray(new TypeInfo[0]); + int[] initialProjectionArray; + if (initialProjection != null) { + initialProjectionArray = ArrayUtils.toPrimitive(initialProjection.toArray(new Integer[0])); + } else { + initialProjectionArray = null; + } VectorizedRowBatchCtx vectorizedRowBatchCtx = new VectorizedRowBatchCtx( columnNameArray, typeInfoArray, + initialProjectionArray, partitionColumnCount, scratchTypeNameArray); baseWork.setVectorizedRowBatchCtx(vectorizedRowBatchCtx); @@ -515,6 +535,29 @@ private void getTableScanOperatorSchemaInfo(TableScanOperator tableScanOperator, } } + private void determineInitialProjection(TableScanOperator tableScanOperator, + List allColumnNameList, int dataColumnCount, int partitionColumnCount, + List initialProjection) { + + /* + * The TableScanOperator's needed columns are just data columns. + */ + Set neededColumns = new HashSet(tableScanOperator.getNeededColumns()); + + for (int dataColumnNum = 0; dataColumnNum < dataColumnCount; dataColumnNum++) { + String columnName = allColumnNameList.get(dataColumnNum); + if (neededColumns.contains(columnName)) { + initialProjection.add(dataColumnNum); + } + } + + // Add all partition columns. 
+ final int endPartitionColumn = dataColumnCount + partitionColumnCount; + for (int partColumnNum = dataColumnCount; partColumnNum < endPartitionColumn; partColumnNum++) { + initialProjection.add(partColumnNum); + } + } + private String getHiveOptionsString() { StringBuilder sb = new StringBuilder(); sb.append(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname); @@ -656,6 +699,9 @@ private boolean validateInputFormatAndSchemaEvolution(MapWork mapWork, String al final List allTypeInfoList = new ArrayList(); getTableScanOperatorSchemaInfo(tableScanOperator, allColumnNameList, allTypeInfoList); + + final List initialProjection = new ArrayList(); + final int allColumnCount = allColumnNameList.size(); /* @@ -705,6 +751,9 @@ private boolean validateInputFormatAndSchemaEvolution(MapWork mapWork, String al dataColumnCount = allColumnCount; } + determineInitialProjection(tableScanOperator, allColumnNameList, dataColumnCount, + partitionColumnCount, initialProjection); + tableDataColumnList = allColumnNameList.subList(0, dataColumnCount); tableDataTypeInfoList = allTypeInfoList.subList(0, dataColumnCount); @@ -780,9 +829,13 @@ private boolean validateInputFormatAndSchemaEvolution(MapWork mapWork, String al vectorTaskColumnInfo.setColumnNames(allColumnNameList); vectorTaskColumnInfo.setTypeInfos(allTypeInfoList); + vectorTaskColumnInfo.setInitialProjection(initialProjection); vectorTaskColumnInfo.setPartitionColumnCount(partitionColumnCount); vectorTaskColumnInfo.setUseVectorizedInputFileFormat(useVectorizedInputFileFormat); + // Helps to keep this for debugging. + vectorTaskColumnInfo.setTableScanOperator(tableScanOperator); + return true; } @@ -1894,7 +1947,9 @@ private boolean validateDataType(String type, VectorExpressionDescriptor.Mode mo private VectorizationContext getVectorizationContext(String contextName, VectorTaskColumnInfo vectorTaskColumnInfo) { - VectorizationContext vContext = new VectorizationContext(contextName, vectorTaskColumnInfo.columnNames); + VectorizationContext vContext = + new VectorizationContext(contextName, vectorTaskColumnInfo.columnNames, + vectorTaskColumnInfo.initialProjection); return vContext; } diff --git ql/src/test/queries/clientpositive/vector_include_no_sel.q ql/src/test/queries/clientpositive/vector_include_no_sel.q new file mode 100644 index 0000000..b78b149 --- /dev/null +++ ql/src/test/queries/clientpositive/vector_include_no_sel.q @@ -0,0 +1,78 @@ +set hive.explain.user=false; +SET hive.vectorized.execution.enabled=true; +SET hive.vectorized.execution.reducesink.new.enabled=false; +set hive.cbo.enable=false; +SET hive.auto.convert.join=true; +SET hive.auto.convert.join.noconditionaltask=true; +SET hive.auto.convert.join.noconditionaltask.size=1000000000; + +-- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. +-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. 
+ +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile; + +LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt; + +create table store_sales stored as orc as select * from store_sales_txt; + + +create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile; + +LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt; + +create table customer_demographics stored as orc as select * from customer_demographics_txt; + +explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')); + +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')); \ No newline at end of file diff --git ql/src/test/results/clientpositive/llap/vector_include_no_sel.q.out ql/src/test/results/clientpositive/llap/vector_include_no_sel.q.out new file mode 100644 index 0000000..e939c67 --- /dev/null +++ ql/src/test/results/clientpositive/llap/vector_include_no_sel.q.out @@ -0,0 +1,286 @@ +PREHOOK: query: -- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. +-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. 
+ +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales_txt +POSTHOOK: query: -- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. +-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. + +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales_txt +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@store_sales_txt +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@store_sales_txt +PREHOOK: query: create table store_sales stored as orc as select * from store_sales_txt +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@store_sales_txt +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales +POSTHOOK: query: create table store_sales stored as orc as select * from store_sales_txt +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@store_sales_txt +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales +POSTHOOK: Lineage: store_sales.ss_addr_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_addr_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_cdemo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_cdemo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_coupon_amt SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_coupon_amt, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_customer_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_customer_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_discount_amt SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_discount_amt, type:float, comment:null), ] 
+POSTHOOK: Lineage: store_sales.ss_ext_list_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_list_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_sales_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_sales_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_tax SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_tax, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_wholesale_cost SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_wholesale_cost, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_hdemo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_hdemo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_item_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_item_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_list_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_list_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_paid SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_paid, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_paid_inc_tax SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_paid_inc_tax, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_profit SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_profit, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_promo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_promo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_quantity SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_quantity, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sales_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sales_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sold_date_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sold_date_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sold_time_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sold_time_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_store_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_store_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ticket_number SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ticket_number, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_wholesale_cost SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_wholesale_cost, type:float, comment:null), ] +PREHOOK: query: create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@customer_demographics_txt +POSTHOOK: query: create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: 
Output: database:default +POSTHOOK: Output: default@customer_demographics_txt +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@customer_demographics_txt +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@customer_demographics_txt +PREHOOK: query: create table customer_demographics stored as orc as select * from customer_demographics_txt +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@customer_demographics_txt +PREHOOK: Output: database:default +PREHOOK: Output: default@customer_demographics +POSTHOOK: query: create table customer_demographics stored as orc as select * from customer_demographics_txt +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@customer_demographics_txt +POSTHOOK: Output: database:default +POSTHOOK: Output: default@customer_demographics +POSTHOOK: Lineage: customer_demographics.cd_credit_rating SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_credit_rating, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_demo_sk SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_demo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_college_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_college_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_employed_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_employed_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_education_status SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_education_status, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_gender SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_gender, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_marital_status SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_marital_status, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_purchase_estimate SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_purchase_estimate, type:int, comment:null), ] +Warning: Map Join MAPJOIN[13][bigTable=store_sales] in task 'Map 2' is a cross product +PREHOOK: query: explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +POSTHOOK: type: QUERY 
+STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 2 <- Map 1 (BROADCAST_EDGE) + Reducer 3 <- Map 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: customer_demographics + Statistics: Num rows: 200 Data size: 74200 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 200 Data size: 74200 Basic stats: COMPLETE Column stats: NONE + value expressions: cd_demo_sk (type: int), cd_marital_status (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 2 + Map Operator Tree: + TableScan + alias: store_sales + Statistics: Num rows: 1000 Data size: 88276 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col2, _col16 + input vertices: + 0 Map 1 + Statistics: Num rows: 200000 Data size: 92055200 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (((_col0 = _col16) and (_col2 = 'M')) or ((_col0 = _col16) and (_col2 = 'U'))) (type: boolean) + Statistics: Num rows: 100000 Data size: 46027600 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 100000 Data size: 46027600 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Map Join MAPJOIN[13][bigTable=store_sales] in task 'Map 2' is a cross product +PREHOOK: query: select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +PREHOOK: type: QUERY +PREHOOK: Input: default@customer_demographics +PREHOOK: Input: default@store_sales +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@customer_demographics +POSTHOOK: Input: default@store_sales +#### A masked pattern was here #### +0 diff --git ql/src/test/results/clientpositive/tez/vector_include_no_sel.q.out 
ql/src/test/results/clientpositive/tez/vector_include_no_sel.q.out new file mode 100644 index 0000000..be991b2 --- /dev/null +++ ql/src/test/results/clientpositive/tez/vector_include_no_sel.q.out @@ -0,0 +1,284 @@ +PREHOOK: query: -- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. +-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. + +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales_txt +POSTHOOK: query: -- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. +-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. 
+ +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales_txt +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@store_sales_txt +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@store_sales_txt +PREHOOK: query: create table store_sales stored as orc as select * from store_sales_txt +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@store_sales_txt +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales +POSTHOOK: query: create table store_sales stored as orc as select * from store_sales_txt +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@store_sales_txt +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales +POSTHOOK: Lineage: store_sales.ss_addr_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_addr_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_cdemo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_cdemo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_coupon_amt SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_coupon_amt, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_customer_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_customer_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_discount_amt SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_discount_amt, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_list_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_list_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_sales_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_sales_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_tax SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_tax, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_wholesale_cost SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_wholesale_cost, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_hdemo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_hdemo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_item_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_item_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_list_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_list_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_paid SIMPLE 
[(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_paid, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_paid_inc_tax SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_paid_inc_tax, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_profit SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_profit, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_promo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_promo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_quantity SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_quantity, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sales_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sales_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sold_date_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sold_date_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sold_time_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sold_time_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_store_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_store_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ticket_number SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ticket_number, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_wholesale_cost SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_wholesale_cost, type:float, comment:null), ] +PREHOOK: query: create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@customer_demographics_txt +POSTHOOK: query: create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@customer_demographics_txt +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@customer_demographics_txt +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@customer_demographics_txt +PREHOOK: query: create table customer_demographics stored as orc as select * from customer_demographics_txt +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@customer_demographics_txt +PREHOOK: Output: database:default +PREHOOK: Output: default@customer_demographics +POSTHOOK: query: create table customer_demographics stored as orc as select * from customer_demographics_txt +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@customer_demographics_txt +POSTHOOK: Output: database:default +POSTHOOK: Output: 
default@customer_demographics +POSTHOOK: Lineage: customer_demographics.cd_credit_rating SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_credit_rating, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_demo_sk SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_demo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_college_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_college_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_employed_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_employed_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_education_status SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_education_status, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_gender SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_gender, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_marital_status SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_marital_status, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_purchase_estimate SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_purchase_estimate, type:int, comment:null), ] +Warning: Map Join MAPJOIN[13][bigTable=store_sales] in task 'Map 2' is a cross product +PREHOOK: query: explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 2 <- Map 1 (BROADCAST_EDGE) + Reducer 3 <- Map 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: customer_demographics + Statistics: Num rows: 200 Data size: 74200 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 200 Data size: 74200 Basic stats: COMPLETE Column stats: NONE + value expressions: cd_demo_sk (type: int), cd_marital_status (type: string) + Execution mode: vectorized + Map 2 + Map Operator Tree: + TableScan + alias: store_sales + Statistics: Num rows: 1000 Data size: 88276 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col2, _col16 + input vertices: + 0 Map 1 + Statistics: Num rows: 200000 Data size: 92055200 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (((_col0 = 
_col16) and (_col2 = 'M')) or ((_col0 = _col16) and (_col2 = 'U'))) (type: boolean) + Statistics: Num rows: 100000 Data size: 46027600 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 100000 Data size: 46027600 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Reducer 3 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Map Join MAPJOIN[13][bigTable=store_sales] in task 'Map 2' is a cross product +PREHOOK: query: select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +PREHOOK: type: QUERY +PREHOOK: Input: default@customer_demographics +PREHOOK: Input: default@store_sales +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@customer_demographics +POSTHOOK: Input: default@store_sales +#### A masked pattern was here #### +0 diff --git ql/src/test/results/clientpositive/vector_include_no_sel.q.out ql/src/test/results/clientpositive/vector_include_no_sel.q.out new file mode 100644 index 0000000..697d422 --- /dev/null +++ ql/src/test/results/clientpositive/vector_include_no_sel.q.out @@ -0,0 +1,282 @@ +PREHOOK: query: -- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. +-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. 
+ +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales_txt +POSTHOOK: query: -- HIVE-13872 +-- Looking for TableScan immediately followed by ReduceSink (no intervening SEL operator). +-- This caused problems for Vectorizer not eliminating columns which are not included. +-- The input file format didn't fill in those vectorized columns and thus caused NPE in +-- ReduceSink. +-- Only a problem when NOT CBO because of CBO rule-based transforms. +-- +-- Using a cross-product. + +create table store_sales_txt +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_store_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost float, + ss_list_price float, + ss_sales_price float, + ss_ext_discount_amt float, + ss_ext_sales_price float, + ss_ext_wholesale_cost float, + ss_ext_list_price float, + ss_ext_tax float, + ss_coupon_amt float, + ss_net_paid float, + ss_net_paid_inc_tax float, + ss_net_profit float +) +row format delimited fields terminated by '|' +stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales_txt +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@store_sales_txt +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@store_sales_txt +PREHOOK: query: create table store_sales stored as orc as select * from store_sales_txt +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@store_sales_txt +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales +POSTHOOK: query: create table store_sales stored as orc as select * from store_sales_txt +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@store_sales_txt +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales +POSTHOOK: Lineage: store_sales.ss_addr_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_addr_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_cdemo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_cdemo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_coupon_amt SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_coupon_amt, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_customer_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_customer_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_discount_amt SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_discount_amt, type:float, comment:null), ] 
+POSTHOOK: Lineage: store_sales.ss_ext_list_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_list_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_sales_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_sales_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_tax SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_tax, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ext_wholesale_cost SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ext_wholesale_cost, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_hdemo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_hdemo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_item_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_item_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_list_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_list_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_paid SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_paid, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_paid_inc_tax SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_paid_inc_tax, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_net_profit SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_net_profit, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_promo_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_promo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_quantity SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_quantity, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sales_price SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sales_price, type:float, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sold_date_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sold_date_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_sold_time_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_sold_time_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_store_sk SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_store_sk, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_ticket_number SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_ticket_number, type:int, comment:null), ] +POSTHOOK: Lineage: store_sales.ss_wholesale_cost SIMPLE [(store_sales_txt)store_sales_txt.FieldSchema(name:ss_wholesale_cost, type:float, comment:null), ] +PREHOOK: query: create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@customer_demographics_txt +POSTHOOK: query: create table customer_demographics_txt +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_education_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +row format delimited fields terminated by '|' +stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: 
Output: database:default +POSTHOOK: Output: default@customer_demographics_txt +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@customer_demographics_txt +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@customer_demographics_txt +PREHOOK: query: create table customer_demographics stored as orc as select * from customer_demographics_txt +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@customer_demographics_txt +PREHOOK: Output: database:default +PREHOOK: Output: default@customer_demographics +POSTHOOK: query: create table customer_demographics stored as orc as select * from customer_demographics_txt +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@customer_demographics_txt +POSTHOOK: Output: database:default +POSTHOOK: Output: default@customer_demographics +POSTHOOK: Lineage: customer_demographics.cd_credit_rating SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_credit_rating, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_demo_sk SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_demo_sk, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_college_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_college_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_dep_employed_count SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_dep_employed_count, type:int, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_education_status SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_education_status, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_gender SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_gender, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_marital_status SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_marital_status, type:string, comment:null), ] +POSTHOOK: Lineage: customer_demographics.cd_purchase_estimate SIMPLE [(customer_demographics_txt)customer_demographics_txt.FieldSchema(name:cd_purchase_estimate, type:int, comment:null), ] +Warning: Map Join MAPJOIN[15][bigTable=store_sales] in task 'Stage-2:MAPRED' is a cross product +PREHOOK: query: explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +POSTHOOK: type: 
QUERY +STAGE DEPENDENCIES: + Stage-5 is a root stage + Stage-2 depends on stages: Stage-5 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-5 + Map Reduce Local Work + Alias -> Map Local Tables: + customer_demographics + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + customer_demographics + TableScan + alias: customer_demographics + Statistics: Num rows: 200 Data size: 74200 Basic stats: COMPLETE Column stats: NONE + HashTable Sink Operator + keys: + 0 + 1 + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: store_sales + Statistics: Num rows: 1000 Data size: 88276 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col2, _col16 + Statistics: Num rows: 200000 Data size: 92055200 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (((_col0 = _col16) and (_col2 = 'M')) or ((_col0 = _col16) and (_col2 = 'U'))) (type: boolean) + Statistics: Num rows: 100000 Data size: 46027600 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 100000 Data size: 46027600 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Map Join MAPJOIN[15][bigTable=store_sales] in task 'Stage-2:MAPRED' is a cross product +PREHOOK: query: select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +PREHOOK: type: QUERY +PREHOOK: Input: default@customer_demographics +PREHOOK: Input: default@store_sales +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from customer_demographics,store_sales +where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or + (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U')) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@customer_demographics +POSTHOOK: Input: default@store_sales +#### A masked pattern was here #### +0
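
Note (for orientation, not part of the generated output above): the .q.out files in this patch are golden outputs produced by the q-file test driver from a vector_include_no_sel.q script that is not shown in these hunks. The sketch below is a plausible reconstruction of such a script, not the checked-in file. The SET statements are assumptions inferred from the comments embedded in the output (vectorization enabled; CBO disabled, since the comments say the problem only occurs when not using CBO; map-join conversion on, so the cross product compiles to the MAPJOIN the warnings refer to; nonstrict mode, so the cartesian product is allowed). The DDL, LOAD, and query statements are copied from the PREHOOK/POSTHOOK echoes above.

-- assumed test settings; inferred from the output, not confirmed by this patch
set hive.cbo.enable=false;
set hive.vectorized.execution.enabled=true;
set hive.auto.convert.join=true;
set hive.mapred.mode=nonstrict;

create table store_sales_txt
(
    ss_sold_date_sk int,
    ss_sold_time_sk int,
    ss_item_sk int,
    ss_customer_sk int,
    ss_cdemo_sk int,
    ss_hdemo_sk int,
    ss_addr_sk int,
    ss_store_sk int,
    ss_promo_sk int,
    ss_ticket_number int,
    ss_quantity int,
    ss_wholesale_cost float,
    ss_list_price float,
    ss_sales_price float,
    ss_ext_discount_amt float,
    ss_ext_sales_price float,
    ss_ext_wholesale_cost float,
    ss_ext_list_price float,
    ss_ext_tax float,
    ss_coupon_amt float,
    ss_net_paid float,
    ss_net_paid_inc_tax float,
    ss_net_profit float
)
row format delimited fields terminated by '|'
stored as textfile;

LOAD DATA LOCAL INPATH '../../data/files/store_sales.txt' OVERWRITE INTO TABLE store_sales_txt;

create table store_sales stored as orc as select * from store_sales_txt;

create table customer_demographics_txt
(
    cd_demo_sk int,
    cd_gender string,
    cd_marital_status string,
    cd_education_status string,
    cd_purchase_estimate int,
    cd_credit_rating string,
    cd_dep_count int,
    cd_dep_employed_count int,
    cd_dep_college_count int
)
row format delimited fields terminated by '|'
stored as textfile;

LOAD DATA LOCAL INPATH '../../data/files/customer_demographics.txt' OVERWRITE INTO TABLE customer_demographics_txt;

create table customer_demographics stored as orc as select * from customer_demographics_txt;

-- HIVE-13872 regression case: the customer_demographics TableScan feeds a
-- ReduceSink directly (no SELECT operator in between), so the Vectorizer must
-- still prune the columns that are not included in the reduce-sink value expressions.
explain
select count(1) from customer_demographics,store_sales
where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or
       (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U'));

select count(1) from customer_demographics,store_sales
where ((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'M') or
       (customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk and customer_demographics.cd_marital_status = 'U'));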