diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 615fd8a..7c7650e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -23,9 +23,11 @@ import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Properties; import java.util.Set; import java.util.Stack; @@ -87,6 +89,7 @@ import org.apache.hadoop.hive.ql.plan.ReduceWork; import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; import org.apache.hadoop.hive.ql.plan.SparkWork; +import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.plan.TezWork; import org.apache.hadoop.hive.ql.plan.VectorGroupByDesc; @@ -362,12 +365,62 @@ private void addMapWorkRules(Map opRules, NodeProcessor np) + ReduceSinkOperator.getOperatorName()), np); } + /** + * Compare row schema's column names and types only. 
+ */ + private boolean sameRowSchemaColumns(RowSchema rowSchema1, RowSchema rowSchema2) { + + ArrayList<ColumnInfo> signature1 = rowSchema1.getSignature(); + ArrayList<ColumnInfo> signature2 = rowSchema2.getSignature(); + + if(signature1 == null && signature2 == null) { + return true; + } + if((signature1 == null && signature2 != null) || + (signature1 != null && signature2 == null) ) { + return false; + } + + if(signature1.size() != signature2.size()) { + return false; + } + + Iterator<ColumnInfo> iterator1 = signature1.iterator(); + Iterator<ColumnInfo> iterator2 = signature2.iterator(); + while(iterator1.hasNext()) { + ColumnInfo column1 = iterator1.next(); + ColumnInfo column2 = iterator2.next(); + + if (column1 == null && column2 == null) { + continue; + } + + if ((column1 == null && column2 != null) || + (column1 != null && column2 == null) ) { + return false; + } + + if (!column1.getInternalName().equals(column2.getInternalName())) { + return false; + } + + if (!column1.getTypeName().equals(column2.getTypeName())) { + return false; + } + } + + return true; + } + private boolean validateMapWork(MapWork mapWork, boolean isTez) throws SemanticException { LOG.info("Validating MapWork..."); // Validate the input format for (String path : mapWork.getPathToPartitionInfo().keySet()) { PartitionDesc pd = mapWork.getPathToPartitionInfo().get(path); + if (LOG.isDebugEnabled()) { + LOG.debug("validateMapWork path " + path); + } List<Class<?>> interfaceList = Arrays.asList(pd.getInputFileFormatClass().getInterfaces()); if (!interfaceList.contains(VectorizedInputFormatInterface.class)) { @@ -384,11 +437,37 @@ private boolean validateMapWork(MapWork mapWork, boolean isTez) throws SemanticE if ((mapWork.getAliasToWork() == null) || (mapWork.getAliasToWork().size() == 0)) { return false; } else { + RowSchema taskRowSchema = null; + String firstAlias = ""; for (Operator op : mapWork.getAliasToWork().values()) { if (op == null) { LOG.warn("Map work has invalid aliases to work with. 
Fail validation!"); return false; } + if (!(op instanceof TableScanOperator)) { + LOG.warn("Map work root operator is not a TableScanOperator. Fail validation!"); + return false; + } + TableScanOperator tableScanOperator = (TableScanOperator) op; + RowSchema rowSchema = tableScanOperator.getSchema(); + TableScanDesc tableScanDesc = tableScanOperator.getConf(); + String alias = tableScanDesc.getAlias(); + + if (LOG.isDebugEnabled()) { + LOG.debug("validateMapWork alias " + alias + " row schema " + rowSchema.toString()); + } + if (taskRowSchema == null) { + taskRowSchema = rowSchema; + firstAlias = alias; + } else { + + // Cannot use RowSchema equals method because it compares alias. + + if (!sameRowSchemaColumns(taskRowSchema, rowSchema)) { + LOG.warn("Map work alias " + alias + " has different columns than alias " + firstAlias + ". Fail validation!"); + return false; + } + } } } @@ -706,7 +785,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, if (op instanceof TableScanOperator) { if (taskVectorizationContext == null) { - taskVectorizationContext = getVectorizationContext(op.getSchema(), op.getName(), + taskVectorizationContext = getMapVectorizationContext(op.getSchema(), op.getName(), taskColumnTypeNameMap); } vContext = taskVectorizationContext; @@ -1288,7 +1367,7 @@ private boolean validateDataType(String type) { return supportedDataTypesPattern.matcher(type.toLowerCase()).matches(); } - private VectorizationContext getVectorizationContext(RowSchema rowSchema, String contextName, + private VectorizationContext getMapVectorizationContext(RowSchema rowSchema, String contextName, Map typeNameMap) { VectorizationContext vContext = new VectorizationContext(contextName); diff --git ql/src/test/queries/clientpositive/vector_mr_diff_schema_alias.q ql/src/test/queries/clientpositive/vector_mr_diff_schema_alias.q new file mode 100644 index 0000000..9db6340 --- /dev/null +++ ql/src/test/queries/clientpositive/vector_mr_diff_schema_alias.q @@ -0,0 
+1,118 @@ +SET hive.vectorized.execution.enabled=true; +set hive.optimize.correlation=true; +set hive.join.emit.interval=1; +set hive.cbo.enable=true; + +create table date_dim +( + d_date_sk int, + d_date_id string, + d_date string, + d_month_seq int, + d_week_seq int, + d_quarter_seq int, + d_year int, + d_dow int, + d_moy int, + d_dom int, + d_qoy int, + d_fy_year int, + d_fy_quarter_seq int, + d_fy_week_seq int, + d_day_name string, + d_quarter_name string, + d_holiday string, + d_weekend string, + d_following_holiday string, + d_first_dom int, + d_last_dom int, + d_same_day_ly int, + d_same_day_lq int, + d_current_day string, + d_current_week string, + d_current_month string, + d_current_quarter string, + d_current_year string +) +stored as orc; + +create table store_sales +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost decimal(7,2), + ss_list_price decimal(7,2), + ss_sales_price decimal(7,2), + ss_ext_discount_amt decimal(7,2), + ss_ext_sales_price decimal(7,2), + ss_ext_wholesale_cost decimal(7,2), + ss_ext_list_price decimal(7,2), + ss_ext_tax decimal(7,2), + ss_coupon_amt decimal(7,2), + ss_net_paid decimal(7,2), + ss_net_paid_inc_tax decimal(7,2), + ss_net_profit decimal(7,2) +) +partitioned by +( + ss_store_sk int +) +stored as orc +tblproperties ("orc.stripe.size"="33554432", "orc.compress.size"="16384"); + +create table store +( + s_store_sk int, + s_store_id string, + s_rec_start_date string, + s_rec_end_date string, + s_closed_date_sk int, + s_store_name string, + s_number_employees int, + s_floor_space int, + s_hours string, + s_manager string, + s_market_id int, + s_geography_class string, + s_market_desc string, + s_market_manager string, + s_division_id int, + s_division_name string, + s_company_id int, + s_company_name string, + s_street_number string, + s_street_name 
string, + s_street_type string, + s_suite_number string, + s_city string, + s_county string, + s_state string, + s_zip string, + s_country string, + s_gmt_offset decimal(5,2), + s_tax_precentage decimal(5,2) +) +stored as orc; + +-- For MR, we are verifying this query DOES NOT vectorize the Map vertex with +-- the 2 TableScanOperators that have different schema. + +explain select + s_state, count(1) + from store_sales, + store, + date_dim + where store_sales.ss_sold_date_sk = date_dim.d_date_sk and + store_sales.ss_store_sk = store.s_store_sk and + store.s_state in ('KS','AL', 'MN', 'AL', 'SC', 'VT') + group by s_state + order by s_state + limit 100; diff --git ql/src/test/results/clientpositive/tez/vector_mr_diff_schema_alias.q.out ql/src/test/results/clientpositive/tez/vector_mr_diff_schema_alias.q.out new file mode 100644 index 0000000..fafccdc --- /dev/null +++ ql/src/test/results/clientpositive/tez/vector_mr_diff_schema_alias.q.out @@ -0,0 +1,381 @@ +PREHOOK: query: create table date_dim +( + d_date_sk int, + d_date_id string, + d_date string, + d_month_seq int, + d_week_seq int, + d_quarter_seq int, + d_year int, + d_dow int, + d_moy int, + d_dom int, + d_qoy int, + d_fy_year int, + d_fy_quarter_seq int, + d_fy_week_seq int, + d_day_name string, + d_quarter_name string, + d_holiday string, + d_weekend string, + d_following_holiday string, + d_first_dom int, + d_last_dom int, + d_same_day_ly int, + d_same_day_lq int, + d_current_day string, + d_current_week string, + d_current_month string, + d_current_quarter string, + d_current_year string +) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@date_dim +POSTHOOK: query: create table date_dim +( + d_date_sk int, + d_date_id string, + d_date string, + d_month_seq int, + d_week_seq int, + d_quarter_seq int, + d_year int, + d_dow int, + d_moy int, + d_dom int, + d_qoy int, + d_fy_year int, + d_fy_quarter_seq int, + d_fy_week_seq int, + d_day_name string, + 
d_quarter_name string, + d_holiday string, + d_weekend string, + d_following_holiday string, + d_first_dom int, + d_last_dom int, + d_same_day_ly int, + d_same_day_lq int, + d_current_day string, + d_current_week string, + d_current_month string, + d_current_quarter string, + d_current_year string +) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@date_dim +PREHOOK: query: create table store_sales +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost decimal(7,2), + ss_list_price decimal(7,2), + ss_sales_price decimal(7,2), + ss_ext_discount_amt decimal(7,2), + ss_ext_sales_price decimal(7,2), + ss_ext_wholesale_cost decimal(7,2), + ss_ext_list_price decimal(7,2), + ss_ext_tax decimal(7,2), + ss_coupon_amt decimal(7,2), + ss_net_paid decimal(7,2), + ss_net_paid_inc_tax decimal(7,2), + ss_net_profit decimal(7,2) +) +partitioned by +( + ss_store_sk int +) +stored as orc +tblproperties ("orc.stripe.size"="33554432", "orc.compress.size"="16384") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales +POSTHOOK: query: create table store_sales +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost decimal(7,2), + ss_list_price decimal(7,2), + ss_sales_price decimal(7,2), + ss_ext_discount_amt decimal(7,2), + ss_ext_sales_price decimal(7,2), + ss_ext_wholesale_cost decimal(7,2), + ss_ext_list_price decimal(7,2), + ss_ext_tax decimal(7,2), + ss_coupon_amt decimal(7,2), + ss_net_paid decimal(7,2), + ss_net_paid_inc_tax decimal(7,2), + ss_net_profit decimal(7,2) +) +partitioned by +( + ss_store_sk int +) +stored as orc +tblproperties 
("orc.stripe.size"="33554432", "orc.compress.size"="16384") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales +PREHOOK: query: create table store +( + s_store_sk int, + s_store_id string, + s_rec_start_date string, + s_rec_end_date string, + s_closed_date_sk int, + s_store_name string, + s_number_employees int, + s_floor_space int, + s_hours string, + s_manager string, + s_market_id int, + s_geography_class string, + s_market_desc string, + s_market_manager string, + s_division_id int, + s_division_name string, + s_company_id int, + s_company_name string, + s_street_number string, + s_street_name string, + s_street_type string, + s_suite_number string, + s_city string, + s_county string, + s_state string, + s_zip string, + s_country string, + s_gmt_offset decimal(5,2), + s_tax_precentage decimal(5,2) +) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@store +POSTHOOK: query: create table store +( + s_store_sk int, + s_store_id string, + s_rec_start_date string, + s_rec_end_date string, + s_closed_date_sk int, + s_store_name string, + s_number_employees int, + s_floor_space int, + s_hours string, + s_manager string, + s_market_id int, + s_geography_class string, + s_market_desc string, + s_market_manager string, + s_division_id int, + s_division_name string, + s_company_id int, + s_company_name string, + s_street_number string, + s_street_name string, + s_street_type string, + s_suite_number string, + s_city string, + s_county string, + s_state string, + s_zip string, + s_country string, + s_gmt_offset decimal(5,2), + s_tax_precentage decimal(5,2) +) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store +PREHOOK: query: -- For MR, we are verifying this query DOES NOT vectorize the Map vertex with +-- the 2 TableScanOperators that have different schema. 
+ +explain select + s_state, count(1) + from store_sales, + store, + date_dim + where store_sales.ss_sold_date_sk = date_dim.d_date_sk and + store_sales.ss_store_sk = store.s_store_sk and + store.s_state in ('KS','AL', 'MN', 'AL', 'SC', 'VT') + group by s_state + order by s_state + limit 100 +PREHOOK: type: QUERY +POSTHOOK: query: -- For MR, we are verifying this query DOES NOT vectorize the Map vertex with +-- the 2 TableScanOperators that have different schema. + +explain select + s_state, count(1) + from store_sales, + store, + date_dim + where store_sales.ss_sold_date_sk = date_dim.d_date_sk and + store_sales.ss_store_sk = store.s_store_sk and + store.s_state in ('KS','AL', 'MN', 'AL', 'SC', 'VT') + group by s_state + order by s_state + limit 100 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE) + Reducer 3 <- Map 7 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Reducer 3 (SIMPLE_EDGE) + Reducer 5 <- Reducer 4 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: store_sales + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: (ss_store_sk is not null and ss_sold_date_sk is not null) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: ss_store_sk (type: int) + sort order: + + Map-reduce partition columns: ss_store_sk (type: int) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: ss_sold_date_sk (type: int) + Execution mode: vectorized + Map 6 + Map Operator Tree: + TableScan + alias: store + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: (s_store_sk is not null and (s_state) IN ('KS', 'AL', 'MN', 
'AL', 'SC', 'VT')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: s_store_sk (type: int) + sort order: + + Map-reduce partition columns: s_store_sk (type: int) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: s_state (type: string) + Execution mode: vectorized + Map 7 + Map Operator Tree: + TableScan + alias: date_dim + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: d_date_sk is not null (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: d_date_sk (type: int) + sort order: + + Map-reduce partition columns: d_date_sk (type: int) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Execution mode: vectorized + Reducer 2 + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 ss_store_sk (type: int) + 1 s_store_sk (type: int) + outputColumnNames: _col0, _col22, _col26, _col50 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: _col22 (type: int), _col26 (type: int), _col50 (type: string) + Reducer 3 + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 d_date_sk (type: int) + outputColumnNames: _col0, _col22, _col26, _col50, _col58 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: (((_col0 = _col58) and (_col22 = _col26)) and (_col50) IN ('KS', 'AL', 'MN', 'AL', 'SC', 'VT')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + 
Select Operator + expressions: _col50 (type: string) + outputColumnNames: _col50 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Group By Operator + aggregations: count(1) + keys: _col50 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: _col1 (type: bigint) + Execution mode: vectorized + Reducer 5 + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Limit + Number of rows: 100 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: 100 + Processor Tree: + ListSink + diff --git ql/src/test/results/clientpositive/vector_mr_diff_schema_alias.q.out 
ql/src/test/results/clientpositive/vector_mr_diff_schema_alias.q.out new file mode 100644 index 0000000..a8b7097 --- /dev/null +++ ql/src/test/results/clientpositive/vector_mr_diff_schema_alias.q.out @@ -0,0 +1,396 @@ +PREHOOK: query: create table date_dim +( + d_date_sk int, + d_date_id string, + d_date string, + d_month_seq int, + d_week_seq int, + d_quarter_seq int, + d_year int, + d_dow int, + d_moy int, + d_dom int, + d_qoy int, + d_fy_year int, + d_fy_quarter_seq int, + d_fy_week_seq int, + d_day_name string, + d_quarter_name string, + d_holiday string, + d_weekend string, + d_following_holiday string, + d_first_dom int, + d_last_dom int, + d_same_day_ly int, + d_same_day_lq int, + d_current_day string, + d_current_week string, + d_current_month string, + d_current_quarter string, + d_current_year string +) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@date_dim +POSTHOOK: query: create table date_dim +( + d_date_sk int, + d_date_id string, + d_date string, + d_month_seq int, + d_week_seq int, + d_quarter_seq int, + d_year int, + d_dow int, + d_moy int, + d_dom int, + d_qoy int, + d_fy_year int, + d_fy_quarter_seq int, + d_fy_week_seq int, + d_day_name string, + d_quarter_name string, + d_holiday string, + d_weekend string, + d_following_holiday string, + d_first_dom int, + d_last_dom int, + d_same_day_ly int, + d_same_day_lq int, + d_current_day string, + d_current_week string, + d_current_month string, + d_current_quarter string, + d_current_year string +) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@date_dim +PREHOOK: query: create table store_sales +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost decimal(7,2), + ss_list_price decimal(7,2), + ss_sales_price 
decimal(7,2), + ss_ext_discount_amt decimal(7,2), + ss_ext_sales_price decimal(7,2), + ss_ext_wholesale_cost decimal(7,2), + ss_ext_list_price decimal(7,2), + ss_ext_tax decimal(7,2), + ss_coupon_amt decimal(7,2), + ss_net_paid decimal(7,2), + ss_net_paid_inc_tax decimal(7,2), + ss_net_profit decimal(7,2) +) +partitioned by +( + ss_store_sk int +) +stored as orc +tblproperties ("orc.stripe.size"="33554432", "orc.compress.size"="16384") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales +POSTHOOK: query: create table store_sales +( + ss_sold_date_sk int, + ss_sold_time_sk int, + ss_item_sk int, + ss_customer_sk int, + ss_cdemo_sk int, + ss_hdemo_sk int, + ss_addr_sk int, + ss_promo_sk int, + ss_ticket_number int, + ss_quantity int, + ss_wholesale_cost decimal(7,2), + ss_list_price decimal(7,2), + ss_sales_price decimal(7,2), + ss_ext_discount_amt decimal(7,2), + ss_ext_sales_price decimal(7,2), + ss_ext_wholesale_cost decimal(7,2), + ss_ext_list_price decimal(7,2), + ss_ext_tax decimal(7,2), + ss_coupon_amt decimal(7,2), + ss_net_paid decimal(7,2), + ss_net_paid_inc_tax decimal(7,2), + ss_net_profit decimal(7,2) +) +partitioned by +( + ss_store_sk int +) +stored as orc +tblproperties ("orc.stripe.size"="33554432", "orc.compress.size"="16384") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales +PREHOOK: query: create table store +( + s_store_sk int, + s_store_id string, + s_rec_start_date string, + s_rec_end_date string, + s_closed_date_sk int, + s_store_name string, + s_number_employees int, + s_floor_space int, + s_hours string, + s_manager string, + s_market_id int, + s_geography_class string, + s_market_desc string, + s_market_manager string, + s_division_id int, + s_division_name string, + s_company_id int, + s_company_name string, + s_street_number string, + s_street_name string, + s_street_type string, + s_suite_number string, + s_city string, + s_county 
string, + s_state string, + s_zip string, + s_country string, + s_gmt_offset decimal(5,2), + s_tax_precentage decimal(5,2) +) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@store +POSTHOOK: query: create table store +( + s_store_sk int, + s_store_id string, + s_rec_start_date string, + s_rec_end_date string, + s_closed_date_sk int, + s_store_name string, + s_number_employees int, + s_floor_space int, + s_hours string, + s_manager string, + s_market_id int, + s_geography_class string, + s_market_desc string, + s_market_manager string, + s_division_id int, + s_division_name string, + s_company_id int, + s_company_name string, + s_street_number string, + s_street_name string, + s_street_type string, + s_suite_number string, + s_city string, + s_county string, + s_state string, + s_zip string, + s_country string, + s_gmt_offset decimal(5,2), + s_tax_precentage decimal(5,2) +) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store +PREHOOK: query: -- For MR, we are verifying this query DOES NOT vectorize the Map vertex with +-- the 2 TableScanOperators that have different schema. + +explain select + s_state, count(1) + from store_sales, + store, + date_dim + where store_sales.ss_sold_date_sk = date_dim.d_date_sk and + store_sales.ss_store_sk = store.s_store_sk and + store.s_state in ('KS','AL', 'MN', 'AL', 'SC', 'VT') + group by s_state + order by s_state + limit 100 +PREHOOK: type: QUERY +POSTHOOK: query: -- For MR, we are verifying this query DOES NOT vectorize the Map vertex with +-- the 2 TableScanOperators that have different schema. 
+ +explain select + s_state, count(1) + from store_sales, + store, + date_dim + where store_sales.ss_sold_date_sk = date_dim.d_date_sk and + store_sales.ss_store_sk = store.s_store_sk and + store.s_state in ('KS','AL', 'MN', 'AL', 'SC', 'VT') + group by s_state + order by s_state + limit 100 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-3 depends on stages: Stage-2 + Stage-4 depends on stages: Stage-3 + Stage-0 depends on stages: Stage-4 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: store_sales + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: (ss_store_sk is not null and ss_sold_date_sk is not null) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: ss_store_sk (type: int) + sort order: + + Map-reduce partition columns: ss_store_sk (type: int) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: ss_sold_date_sk (type: int) + TableScan + alias: store + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: (s_store_sk is not null and (s_state) IN ('KS', 'AL', 'MN', 'AL', 'SC', 'VT')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: s_store_sk (type: int) + sort order: + + Map-reduce partition columns: s_store_sk (type: int) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: s_state (type: string) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 ss_store_sk (type: int) + 1 s_store_sk (type: int) + outputColumnNames: _col0, _col22, _col26, _col50 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: 
false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: _col22 (type: int), _col26 (type: int), _col50 (type: string) + TableScan + alias: date_dim + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: d_date_sk is not null (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: d_date_sk (type: int) + sort order: + + Map-reduce partition columns: d_date_sk (type: int) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 d_date_sk (type: int) + outputColumnNames: _col0, _col22, _col26, _col50, _col58 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: (((_col0 = _col58) and (_col22 = _col26)) and (_col50) IN ('KS', 'AL', 'MN', 'AL', 'SC', 'VT')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col50 (type: string) + outputColumnNames: _col50 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Group By Operator + aggregations: count(1) + keys: _col50 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + 
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-3 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-4 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Limit + Number of rows: 100 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 100 + Processor Tree: + ListSink +