diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java index 74a1a82..d2e1b13 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java @@ -23,9 +23,11 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.IOConstants; +import org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader; import org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher; import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; +import org.apache.hadoop.hive.serde2.SerDeStats; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Writable; @@ -54,7 +56,8 @@ import com.google.common.base.Strings; -public class ParquetRecordReaderWrapper implements RecordReader { +public class ParquetRecordReaderWrapper implements RecordReader, + StatsProvidingRecordReader { public static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReaderWrapper.class); private final long splitLen; // for getPos() @@ -70,6 +73,7 @@ private JobConf jobConf; private final ProjectionPusher projectionPusher; private List filtedBlocks; + private final SerDeStats serDeStats; public ParquetRecordReaderWrapper( final ParquetInputFormat newInputFormat, @@ -89,6 +93,7 @@ public ParquetRecordReaderWrapper( throws IOException, InterruptedException { this.splitLen = oldSplit.getLength(); this.projectionPusher = pusher; + this.serDeStats = new SerDeStats(); jobConf = oldJobConf; final ParquetInputSplit split = getSplit(oldSplit, jobConf); @@ -247,6 +252,13 @@ protected ParquetInputSplit getSplit( final ReadContext readContext = new DataWritableReadSupport().init(new InitContext(jobConf, null, fileMetaData.getSchema())); + + // Compute stats + for (BlockMetaData bmd : blocks) { + serDeStats.setRowCount(serDeStats.getRowCount() + bmd.getRowCount()); + serDeStats.setRawDataSize(serDeStats.getRawDataSize() + bmd.getTotalByteSize()); + } + schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata() .get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount(); final List splitGroup = new ArrayList(); @@ -300,4 +312,9 @@ protected ParquetInputSplit getSplit( public List getFiltedBlocks() { return filtedBlocks; } + + @Override + public SerDeStats getStats() { + return serDeStats; + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java index 669d56f..d6f1b7a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java @@ -34,6 +34,7 @@ import org.apache.hadoop.hive.ql.exec.TaskFactory; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanWork; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; @@ -90,8 +91,9 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx opProcCtx, if (parseCtx.getQueryProperties().isAnalyzeCommand()) { boolean partialScan = parseCtx.getQueryProperties().isPartialScanAnalyzeCommand(); boolean noScan = parseCtx.getQueryProperties().isNoScanAnalyzeCommand(); - if (inputFormat.equals(OrcInputFormat.class)) { - // For ORC, all the following statements are the same + if (inputFormat.equals(OrcInputFormat.class) || + inputFormat.equals(MapredParquetInputFormat.class)) { + // For ORC and Parquet, all the following statements are the same // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan; // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;