From 94d85a39aae9cfd87f27560ab78866809b1995f0 Mon Sep 17 00:00:00 2001
From: Ashutosh Chauhan
Date: Fri, 16 Oct 2015 18:41:48 -0700
Subject: [PATCH] First cut

---
 .../org/apache/hadoop/hive/ql/QueryProperties.java |  10 --
 .../org/apache/hadoop/hive/ql/metadata/Hive.java   |  11 +-
 .../hadoop/hive/ql/optimizer/GenMapRedUtils.java   |   3 +-
 .../hadoop/hive/ql/optimizer/StatsOptimizer.java   |  38 ++++--
 .../apache/hadoop/hive/ql/parse/QBParseInfo.java   |   9 --
 .../hadoop/hive/ql/parse/SemanticAnalyzer.java     |   6 -
 ql/src/test/queries/clientpositive/insert_into1.q  |  11 +-
 .../test/results/clientpositive/insert_into1.q.out | 151 +++++++++++++++++++++
 8 files changed, 197 insertions(+), 42 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/QueryProperties.java b/ql/src/java/org/apache/hadoop/hive/ql/QueryProperties.java
index e8f7fba..3bc9432 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/QueryProperties.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/QueryProperties.java
@@ -39,7 +39,6 @@
   boolean noScanAnalyzeCommand;
   boolean analyzeRewrite;
   boolean ctas;
-  boolean insertToTable;
   int outerQueryLimit;
 
   boolean hasJoin = false;
@@ -115,14 +114,6 @@ public void setCTAS(boolean ctas) {
     this.ctas = ctas;
   }
 
-  public boolean isInsertToTable() {
-    return insertToTable;
-  }
-
-  public void setInsertToTable(boolean insertToTable) {
-    this.insertToTable = insertToTable;
-  }
-
   public int getOuterQueryLimit() {
     return outerQueryLimit;
   }
@@ -276,7 +267,6 @@ public void clear() {
     noScanAnalyzeCommand = false;
     analyzeRewrite = false;
     ctas = false;
-    insertToTable = false;
    outerQueryLimit = -1;
 
     hasJoin = false;
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
index 4058606..cfae4b8 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
@@ -1467,6 +1467,12 @@ public Partition loadPartition(Path loadPath, Table tbl,
       boolean forceCreate = (!holdDDLTime) ? true : false;
       newTPart = getPartition(tbl, partSpec, forceCreate, newPartPath.toString(),
           inheritTableSpecs, newFiles);
+
+      if(!this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
+        newTPart.getParameters().put(StatsSetupConst.COLUMN_STATS_ACCURATE, "false");
+      } else {
+        newTPart.getParameters().put(StatsSetupConst.STATS_GENERATED_VIA_STATS_TASK, "true");
+      }
       // recreate the partition if it existed before
       if (!holdDDLTime) {
         if (isSkewedStoreAsSubdir) {
@@ -1720,9 +1726,12 @@ public void loadTable(Path loadPath, String tableName, boolean replace,
       } catch (IOException e) {
         throw new HiveException("addFiles: filesystem error in check phase", e);
       }
+    }
+    if(!this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
+      tbl.getParameters().put(StatsSetupConst.COLUMN_STATS_ACCURATE, "false");
+    } else {
       tbl.getParameters().put(StatsSetupConst.STATS_GENERATED_VIA_STATS_TASK, "true");
     }
-
     try {
       if (isSkewedStoreAsSubdir) {
         SkewedInfo skewedInfo = tbl.getSkewedInfo();
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
index 109b938..c22c35f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
@@ -1500,8 +1500,7 @@ public static void addStatsTask(FileSinkOperator nd, MoveTask mvTask,
    * @return
    */
   public static boolean isInsertInto(ParseContext parseCtx, FileSinkOperator fsOp) {
-    return fsOp.getConf().getTableInfo().getTableName() != null &&
-        parseCtx.getQueryProperties().isInsertToTable();
+    return fsOp.getConf().getTableInfo().getTableName() != null;
   }
 
   /**
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
index 5a21e6b..aa204c7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
@@ -144,17 +144,23 @@ public MetaDataProcessor (ParseContext pctx) {
     }
 
     enum LongSubType {
-      BIGINT { Object cast(long longValue) { return longValue; } },
-      INT { Object cast(long longValue) { return (int)longValue; } },
-      SMALLINT { Object cast(long longValue) { return (short)longValue; } },
-      TINYINT { Object cast(long longValue) { return (byte)longValue; } };
+      BIGINT { @Override
+      Object cast(long longValue) { return longValue; } },
+      INT { @Override
+      Object cast(long longValue) { return (int)longValue; } },
+      SMALLINT { @Override
+      Object cast(long longValue) { return (short)longValue; } },
+      TINYINT { @Override
+      Object cast(long longValue) { return (byte)longValue; } };
 
       abstract Object cast(long longValue);
     }
 
     enum DoubleSubType {
-      DOUBLE { Object cast(double doubleValue) { return doubleValue; } },
-      FLOAT { Object cast(double doubleValue) { return (float) doubleValue; } };
+      DOUBLE { @Override
+      Object cast(double doubleValue) { return doubleValue; } },
+      FLOAT { @Override
+      Object cast(double doubleValue) { return (float) doubleValue; } };
 
       abstract Object cast(double doubleValue);
     }
@@ -221,7 +227,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
         // Since we have done an exact match on TS-SEL-GBY-RS-GBY-(SEL)-FS
         // we need not to do any instanceof checks for following.
         GroupByOperator pgbyOp = (GroupByOperator)stack.get(2);
-        if (pgbyOp.getConf().getOutputColumnNames().size() != 
+        if (pgbyOp.getConf().getOutputColumnNames().size() !=
             pgbyOp.getConf().getAggregators().size()) {
           return null;
         }
@@ -260,7 +266,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
         FileSinkOperator fsOp = (FileSinkOperator)last;
         if (fsOp.getNumChild() > 0) {
           // looks like a subq plan.
-          return null; // todo we can collapse this part of tree into single TS 
+          return null; // todo we can collapse this part of tree into single TS
         }
 
         Table tbl = tsOp.getConf().getTableMetadata();
@@ -296,7 +302,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
               return null;
             }
             switch (category) {
-              case LONG: 
+              case LONG:
                 oneRow.add(Long.valueOf(constant) * rowCnt);
                 break;
               case DOUBLE:
@@ -436,7 +442,7 @@ else if (udaf instanceof GenericUDAFCount) {
             switch (type) {
               case Integeral: {
                 LongSubType subType = LongSubType.valueOf(name);
-                
+
                 Long maxVal = null;
                 Collection<List<ColumnStatisticsObj>> result =
                     verifyAndGetPartStats(hive, tbl, colName, parts);
@@ -462,7 +468,7 @@ else if (udaf instanceof GenericUDAFCount) {
             }
             case Double: {
               DoubleSubType subType = DoubleSubType.valueOf(name);
-              
+
               Double maxVal = null;
               Collection<List<ColumnStatisticsObj>> result =
                   verifyAndGetPartStats(hive, tbl, colName, parts);
@@ -537,7 +543,7 @@ else if (udaf instanceof GenericUDAFCount) {
             switch(type) {
               case Integeral: {
                 LongSubType subType = LongSubType.valueOf(name);
-                
+
                 Long minVal = null;
                 Collection<List<ColumnStatisticsObj>> result =
                     verifyAndGetPartStats(hive, tbl, colName, parts);
@@ -563,7 +569,7 @@ else if (udaf instanceof GenericUDAFCount) {
             }
             case Double: {
               DoubleSubType subType = DoubleSubType.valueOf(name);
-              
+
               Double minVal = null;
               Collection<List<ColumnStatisticsObj>> result =
                   verifyAndGetPartStats(hive, tbl, colName, parts);
@@ -680,6 +686,9 @@ private Long getRowCnt(
       if (tbl.isPartitioned()) {
         for (Partition part : pctx.getPrunedPartitions(
             tsOp.getConf().getAlias(), tsOp).getPartitions()) {
+          if (!StatsSetupConst.areStatsUptoDate(part.getParameters())) {
+            return null;
+          }
           long partRowCnt = Long.parseLong(part.getParameters().get(StatsSetupConst.ROW_COUNT));
           if (partRowCnt < 1) {
             Log.debug("Partition doesn't have upto date stats " + part.getSpec());
@@ -688,6 +697,9 @@ private Long getRowCnt(
           rowCnt += partRowCnt;
         }
       } else { // unpartitioned table
+        if (!StatsSetupConst.areStatsUptoDate(tbl.getParameters())) {
+          return null;
+        }
         rowCnt = Long.parseLong(tbl.getProperty(StatsSetupConst.ROW_COUNT));
         if (rowCnt < 1) {
           // if rowCnt < 1 than its either empty table or table on which stats are not
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java
index 14a7e9c..9072d7f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java
@@ -65,7 +65,6 @@ private final HashSet<String> insertIntoTables;
 
   private boolean isAnalyzeCommand; // used for the analyze command (statistics)
-  private boolean isInsertToTable; // used for insert overwrite command (statistics)
   private boolean isNoScanAnalyzeCommand; // used for the analyze command (statistics) (noscan)
   private boolean isPartialScanAnalyzeCommand; // used for the analyze command (statistics)
                                                // (partialscan)
 
@@ -550,14 +549,6 @@ public boolean isAnalyzeCommand() {
     return isAnalyzeCommand;
   }
 
-  public void setIsInsertToTable(boolean isInsertToTable) {
-    this.isInsertToTable = isInsertToTable;
-  }
-
-  public boolean isInsertToTable() {
-    return isInsertToTable;
-  }
-
   public void addTableSpec(String tName, TableSpec tSpec) {
     tableSpecs.put(tName, tSpec);
   }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index 3262887..f9cba84 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -1735,8 +1735,6 @@ public void getMetaData(QB qb, ReadEntity parentInput) throws SemanticException
           qb.getMetaData().setDestForAlias(name, ts.partHandle);
         }
         if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
-          // Set that variable to automatically collect stats during the MapReduce job
-          qb.getParseInfo().setIsInsertToTable(true);
           // Add the table spec for the destination table.
           qb.getParseInfo().addTableSpec(ts.tableName.toLowerCase(), ts);
         }
@@ -1773,8 +1771,6 @@ public void getMetaData(QB qb, ReadEntity parentInput) throws SemanticException
         }
         if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
           TableSpec ts = new TableSpec(db, conf, this.ast);
-          // Set that variable to automatically collect stats during the MapReduce job
-          qb.getParseInfo().setIsInsertToTable(true);
           // Add the table spec for the destination table.
           qb.getParseInfo().addTableSpec(ts.tableName.toLowerCase(), ts);
         }
@@ -6360,7 +6356,6 @@ protected Operator genFileSinkPlan(String dest, QB qb, Operator input)
       // verify that our destination is empty before proceeding
       if (dest_tab.isImmutable() &&
           qb.getParseInfo().isInsertIntoTable(dest_tab.getDbName(),dest_tab.getTableName())){
-        qb.getParseInfo().isInsertToTable();
         try {
           FileSystem fs = partPath.getFileSystem(conf);
           if (! MetaStoreUtils.isDirEmpty(fs,partPath)){
@@ -12254,7 +12249,6 @@ private void copyInfoToQueryProperties(QueryProperties queryProperties) {
       queryProperties.setNoScanAnalyzeCommand(qb.getParseInfo().isNoScanAnalyzeCommand());
       queryProperties.setAnalyzeRewrite(qb.isAnalyzeRewrite());
       queryProperties.setCTAS(qb.getTableDesc() != null);
-      queryProperties.setInsertToTable(qb.getParseInfo().isInsertToTable());
       queryProperties.setHasOuterOrderBy(!qb.getParseInfo().getIsSubQ() &&
           !qb.getParseInfo().getDestToOrderBy().isEmpty());
       queryProperties.setOuterQueryLimit(qb.getParseInfo().getOuterQueryLimit());
diff --git a/ql/src/test/queries/clientpositive/insert_into1.q b/ql/src/test/queries/clientpositive/insert_into1.q
index 1b7db5c..08cb3c9 100644
--- a/ql/src/test/queries/clientpositive/insert_into1.q
+++ b/ql/src/test/queries/clientpositive/insert_into1.q
@@ -35,6 +35,15 @@ explain
 SELECT COUNT(*) FROM insert_into1;
 select count(*) from insert_into1;
 
-DROP TABLE insert_into1;
+set hive.stats.autogather=false;
+explain
+insert into table insert_into1 values(1, 'abc');
+insert into table insert_into1 values(1, 'abc');
+explain
+SELECT COUNT(*) FROM insert_into1;
+select count(*) from insert_into1;
+
+DROP TABLE insert_into1;
+set hive.stats.autogather=true;
 
 set hive.compute.query.using.stats=false;
diff --git a/ql/src/test/results/clientpositive/insert_into1.q.out b/ql/src/test/results/clientpositive/insert_into1.q.out
index 2b5cbca..9667f4b 100644
--- a/ql/src/test/results/clientpositive/insert_into1.q.out
+++ b/ql/src/test/results/clientpositive/insert_into1.q.out
@@ -335,6 +335,157 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@insert_into1
 #### A masked pattern was here ####
 10
+PREHOOK: query: explain
+insert into table insert_into1 values(1, 'abc')
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+insert into table insert_into1 values(1, 'abc')
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-6 depends on stages: Stage-1 , consists of Stage-3, Stage-2, Stage-4
+  Stage-3
+  Stage-0 depends on stages: Stage-3, Stage-2, Stage-5
+  Stage-2
+  Stage-4
+  Stage-5 depends on stages: Stage-4
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: values__tmp__table__1
+            Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: UDFToInteger(tmp_values_col1) (type: int), tmp_values_col2 (type: string)
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE
+              File Output Operator
+                compressed: false
+                Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE
+                table:
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.insert_into1
+
+  Stage: Stage-6
+    Conditional Operator
+
+  Stage: Stage-3
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: false
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.insert_into1
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            File Output Operator
+              compressed: false
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  name: default.insert_into1
+
+  Stage: Stage-4
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            File Output Operator
+              compressed: false
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  name: default.insert_into1
+
+  Stage: Stage-5
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+PREHOOK: query: insert into table insert_into1 values(1, 'abc')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__2
+PREHOOK: Output: default@insert_into1
+POSTHOOK: query: insert into table insert_into1 values(1, 'abc')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__2
+POSTHOOK: Output: default@insert_into1
+POSTHOOK: Lineage: insert_into1.key EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: insert_into1.value SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+PREHOOK: query: explain
+SELECT COUNT(*) FROM insert_into1
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+SELECT COUNT(*) FROM insert_into1
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: insert_into1
+            Statistics: Num rows: -1 Data size: 112 Basic stats: PARTIAL Column stats: COMPLETE
+            Select Operator
+              Statistics: Num rows: -1 Data size: 112 Basic stats: PARTIAL Column stats: COMPLETE
+              Group By Operator
+                aggregations: count()
+                mode: hash
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order:
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(*) from insert_into1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@insert_into1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from insert_into1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@insert_into1
+#### A masked pattern was here ####
+11
 PREHOOK: query: DROP TABLE insert_into1
 PREHOOK: type: DROPTABLE
 PREHOOK: Input: default@insert_into1
-- 
1.7.12.4 (Apple Git-37)
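
Note on the load-path change in Hive.java: after this patch, every loadTable()/loadPartition() leaves the table or partition in one of two states. With hive.stats.autogather on, a StatsTask follows the load and the parameters record STATS_GENERATED_VIA_STATS_TASK; with it off, COLUMN_STATS_ACCURATE is forced to "false" so any previously gathered stats are visibly stale. A minimal self-contained sketch of that rule, for illustration only: the markStats helper, its Map-based signature, and the literal key strings are assumptions, only the StatsSetupConst constant names come from the patch.

    import java.util.Map;

    // Sketch of the invariant the patch enforces on table/partition
    // parameters after files are moved into place. Key literals are
    // assumed here; Hive reads them through StatsSetupConst constants.
    final class StatsMarkingSketch {
      static final String COLUMN_STATS_ACCURATE = "COLUMN_STATS_ACCURATE";
      static final String STATS_GENERATED_VIA_STATS_TASK = "STATS_GENERATED_VIA_STATS_TASK";

      static void markStats(Map<String, String> params, boolean statsAutogather) {
        if (!statsAutogather) {
          // No StatsTask will run after this load, so whatever stats are
          // already stored can no longer be trusted.
          params.put(COLUMN_STATS_ACCURATE, "false");
        } else {
          // A StatsTask runs after the load and publishes fresh stats.
          params.put(STATS_GENERATED_VIA_STATS_TASK, "true");
        }
      }
    }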
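
Note on the StatsOptimizer change: getRowCnt() now consults StatsSetupConst.areStatsUptoDate() in both the partitioned and unpartitioned branches and returns null on stale stats, so the query falls back to an actual scan instead of being answered from metadata. That is exactly what the new insert_into1.q.out section shows: with autogather off, the count(*) plan carries "Num rows: -1 ... Basic stats: PARTIAL" and runs a full MapReduce job, returning 11 computed from the data. A sketch of the guard for the unpartitioned case follows; areStatsUptoDate() here is a stand-in whose body is an assumption, as is the "numRows" key literal.

    import java.util.Map;

    // Sketch of the guard added to StatsOptimizer.getRowCnt().
    final class RowCountGuardSketch {
      // Returns a trusted row count, or null to force a real scan.
      static Long getRowCnt(Map<String, String> params) {
        if (!areStatsUptoDate(params)) {
          return null; // stale or missing stats: never answer from metadata
        }
        String raw = params.get("numRows"); // assumed value of StatsSetupConst.ROW_COUNT
        if (raw == null) {
          return null;
        }
        long rowCnt = Long.parseLong(raw);
        return (rowCnt < 1) ? null : rowCnt; // < 1 means empty or never analyzed
      }

      // Stand-in for StatsSetupConst.areStatsUptoDate(); illustrative only.
      static boolean areStatsUptoDate(Map<String, String> params) {
        return Boolean.parseBoolean(
            params.getOrDefault("COLUMN_STATS_ACCURATE", "false"));
      }
    }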