diff --git data/files/parquet_create_people.txt data/files/parquet_create_people.txt new file mode 100644 index 0000000..ab93c14 --- /dev/null +++ data/files/parquet_create_people.txt @@ -0,0 +1,100 @@ +1CelesteBrowning959-3763 Nec, Av.100.002011-03-12 15:20:00Ca +2RisaYangP.O. Box 292, 8229 Porttitor Road200.002011-03-12 15:20:00Or +3VenusSuttonAp #962-8021 Egestas Rd.300.002011-03-12 15:20:00Ca +4GretchenHarrisonP.O. Box 636, 8734 Magna Avenue400.002011-03-12 15:20:00Or +5LaniIrwinAp #441-5911 Iaculis, Ave500.002011-03-12 15:20:00Ca +6VeraGeorge409-1555 Vel, Ave600.002011-03-12 15:20:00Or +7JessicaMalone286-9779 Aliquam Road700.002011-03-12 15:20:00Ca +8AnnChapmanAp #504-3915 Placerat Road800.002011-03-12 15:20:00Or +9NigelBartlettAp #185-385 Diam Street900.002011-03-12 15:20:00Ca +10AzaliaJennings5772 Diam St.100.002011-03-12 15:20:00Or +11PrestonCannonAp #527-8769 Nunc Avenue100.002011-03-12 15:20:00Ca +12AllistairVasquez2562 Odio. St.100.002011-03-12 15:20:00Or +13ReedHayes5190 Elit Street100.002011-03-12 15:20:00Ca +14ElaineBarronP.O. Box 840, 8860 Sodales Av.100.002011-03-12 15:20:00Or +15LydiaHoodP.O. Box 698, 5666 Semper Road100.002011-03-12 15:20:00Ca +16VanceMaxwell298-3313 Malesuada Road100.002011-03-12 15:20:00Or +17KeikoDeleonP.O. Box 732, 5921 Massa. Av.100.002011-03-12 15:20:00Ca +18DolanKaneAp #906-3606 Ut Rd.100.002011-03-12 15:20:00Or +19MerrittPerkinsP.O. Box 228, 7090 Egestas Street100.002011-03-12 15:20:00Ca +20CaseySalazar506-5065 Ut St.200.002011-03-12 15:20:00Or +21SamsonNoel1370 Ultrices, Road200.002012-03-12 15:20:00Ca +22ByronWalkerP.O. Box 386, 8324 Tellus Ave200.002012-03-12 15:20:00Or +23PiperSingletonAp #500-3561 Primis St.200.002012-03-12 15:20:00Ca +24RiaMckinney3080 Dui Rd.200.002012-03-12 15:20:00Or +25RahimStanley559-9016 Nascetur Street200.002012-03-12 15:20:00Ca +26ChloeSteeleP.O. Box 766, 1628 Elit Street200.002012-03-12 15:20:00Or +27PalomaWardAp #390-3042 Ipsum Rd.200.002012-03-12 15:20:00Ca +28RoaryShermanAp #409-6549 Metus St.200.002012-03-12 15:20:00Or +29CalvinBuckner6378 Diam Avenue200.002012-03-12 15:20:00Ca +30CamilleGoodAp #113-8659 Suspendisse St.300.002012-03-12 15:20:00Or +31SteelAyala5518 Justo St.300.002012-03-12 15:20:00Ca +32JosiahGilbertAp #149-6651 At, Av.300.002012-03-12 15:20:00Or +33HamiltonCruz4620 Tellus. Ave300.002012-03-12 15:20:00Ca +34ScarletSantos586-1785 Velit. Av.300.002012-03-12 15:20:00Or +35LewisMcintyre629-6419 Ac Rd.300.002012-03-12 15:20:00Ca +36ArsenioMejiaP.O. Box 767, 8625 Justo Rd.300.002012-03-12 15:20:00Or +37VelmaHaley1377 At Rd.300.002012-03-12 15:20:00Ca +38TatumJennings829-7432 Posuere, Road300.002012-03-12 15:20:00Or +39BritanniEaton8811 Morbi Street300.002012-03-12 15:20:00Ca +40AileenJacobsonP.O. Box 469, 2266 Dui, Rd.400.002012-03-12 15:20:00Or +41KareemAyala2706 Ridiculus Street400.002013-03-12 15:20:00Ca +42MaiteRush7592 Neque Road400.002013-03-12 15:20:00Or +43SigneVelasquezAp #868-3039 Eget St.400.002013-03-12 15:20:00Ca +44ZoritaCamachoP.O. Box 651, 3340 Quis Av.400.002013-03-12 15:20:00Or +45GlennaCurtis953-7965 Enim Ave400.002013-03-12 15:20:00Ca +46QuinCortez4898 Ridiculus St.400.002013-03-12 15:20:00Or +47TalonDaltonP.O. Box 408, 7597 Integer Rd.400.002013-03-12 15:20:00Ca +48DarrylBlankenshipP.O. Box 771, 1471 Non Rd.400.002013-03-12 15:20:00Or +49VernonReyesP.O. Box 971, 7009 Vulputate Street400.002013-03-12 15:20:00Ca +50TallulahHeathP.O. Box 865, 3697 Dis Ave500.002013-03-12 15:20:00Or +51CiaranOlson2721 Et St.500.002013-03-12 15:20:00Ca +52OrlandoWittP.O. Box 717, 1102 Nulla. Rd.500.002013-03-12 15:20:00Or +53QuinnRiceAp #647-6627 Tristique Avenue500.002013-03-12 15:20:00Ca +54WyattPickettAp #128-3130 Vel, Rd.500.002013-03-12 15:20:00Or +55EmeraldCopeland857-5119 Turpis Rd.500.002013-03-12 15:20:00Ca +56JonasQuinnAp #441-7183 Ligula. Street500.002013-03-12 15:20:00Or +57WillaBerg6672 Velit Ave500.002013-03-12 15:20:00Ca +58MalikLee998-9208 In Street500.002013-03-12 15:20:00Or +59CallieMedina1620 Dui. Rd.500.002013-03-12 15:20:00Ca +60LukeMasonP.O. Box 143, 2070 Augue Rd.600.002013-03-12 15:20:00Or +61ShafiraEstrada8824 Ante Street600.002014-03-12 15:20:00Ca +62ElizabethRutledge315-6510 Sit St.600.002014-03-12 15:20:00Or +63PandoraLevine357-3596 Nibh. Ave600.002014-03-12 15:20:00Ca +64HilelPrince845-1229 Sociosqu Rd.600.002014-03-12 15:20:00Or +65RinahTorresAp #492-9328 At St.600.002014-03-12 15:20:00Ca +66YaelHobbsP.O. Box 477, 3896 In Street600.002014-03-12 15:20:00Or +67NevadaNashP.O. Box 251, 1914 Tincidunt Road600.002014-03-12 15:20:00Ca +68MarnyHuffP.O. Box 818, 6086 Ultricies St.600.002014-03-12 15:20:00Or +69KimberleyMilesAp #893-3685 In Road600.002014-03-12 15:20:00Ca +70DuncanFullerAp #197-5216 Iaculis Street700.002014-03-12 15:20:00Or +71YardleyLeblancP.O. Box 938, 1278 Sit Ave700.002014-03-12 15:20:00Ca +72HamishBrewerAp #854-781 Quisque St.700.002014-03-12 15:20:00Or +73PetraMoon453-6609 Curabitur Street700.002014-03-12 15:20:00Ca +74ReeseEstradaAp #382-3313 Malesuada St.700.002014-03-12 15:20:00Or +75GageHiggins7443 Eu Street700.002014-03-12 15:20:00Ca +76ZacheryCamachoAp #795-4143 Quam. St.700.002014-03-12 15:20:00Or +77KellyGarnerP.O. Box 895, 2843 Cras Rd.700.002014-03-12 15:20:00Ca +78HanaeCarr9440 Amet St.700.002014-03-12 15:20:00Or +79AnnAlston884-7948 Dictum Road700.002014-03-12 15:20:00Ca +80ChancellorCobbP.O. Box 889, 5978 Ac Avenue800.002014-03-12 15:20:00Or +81DorothyHarrell6974 Tristique Ave800.002010-03-12 15:20:00Ca +82VaughanLeon1610 Luctus Av.800.002010-03-12 15:20:00Or +83WynneJimenez321-9171 Felis. Avenue800.002010-03-12 15:20:00Ca +84WillaMendoza489-182 Sed Av.800.002010-03-12 15:20:00Or +85CamdenGoodwin4579 Ante St.800.002010-03-12 15:20:00Ca +86IfeomaFrenchP.O. Box 160, 8769 Integer Road800.002010-03-12 15:20:00Or +87RamonaStrong1666 Ridiculus Avenue800.002010-03-12 15:20:00Ca +88BrettRamosAp #579-9879 Et, Road800.002010-03-12 15:20:00Or +89UllaGray595-7066 Malesuada Road800.002010-03-12 15:20:00Ca +90KevynMccallP.O. Box 968, 1420 Aenean Avenue900.002010-03-12 15:20:00Or +91GenevieveWilkins908 Turpis. Street900.002010-03-12 15:20:00Ca +92ThaneOneil6766 Lectus St.900.002010-03-12 15:20:00Or +93MarikoClineP.O. Box 329, 5375 Ac St.900.002010-03-12 15:20:00Ca +94LaelMclean500-7010 Sit St.900.002010-03-12 15:20:00Or +95WinifredHopperAp #140-8982 Velit Avenue900.002010-03-12 15:20:00Ca +96RafaelEnglandP.O. Box 405, 7857 Eget Av.900.002010-03-12 15:20:00Or +97DanaCarter814-601 Purus. Av.900.002010-03-12 15:20:00Ca +98JulietBattleAp #535-1965 Cursus St.900.002010-03-12 15:20:00Or +99WynterVincent626-8492 Mollis Avenue900.002010-03-12 15:20:00Ca +100WangMitchell4023 Lacinia. Ave100.002010-03-12 15:20:00Or diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java index 74a1a82..d2e1b13 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java @@ -23,9 +23,11 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.IOConstants; +import org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader; import org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher; import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; +import org.apache.hadoop.hive.serde2.SerDeStats; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Writable; @@ -54,7 +56,8 @@ import com.google.common.base.Strings; -public class ParquetRecordReaderWrapper implements RecordReader { +public class ParquetRecordReaderWrapper implements RecordReader, + StatsProvidingRecordReader { public static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReaderWrapper.class); private final long splitLen; // for getPos() @@ -70,6 +73,7 @@ private JobConf jobConf; private final ProjectionPusher projectionPusher; private List filtedBlocks; + private final SerDeStats serDeStats; public ParquetRecordReaderWrapper( final ParquetInputFormat newInputFormat, @@ -89,6 +93,7 @@ public ParquetRecordReaderWrapper( throws IOException, InterruptedException { this.splitLen = oldSplit.getLength(); this.projectionPusher = pusher; + this.serDeStats = new SerDeStats(); jobConf = oldJobConf; final ParquetInputSplit split = getSplit(oldSplit, jobConf); @@ -247,6 +252,13 @@ protected ParquetInputSplit getSplit( final ReadContext readContext = new DataWritableReadSupport().init(new InitContext(jobConf, null, fileMetaData.getSchema())); + + // Compute stats + for (BlockMetaData bmd : blocks) { + serDeStats.setRowCount(serDeStats.getRowCount() + bmd.getRowCount()); + serDeStats.setRawDataSize(serDeStats.getRawDataSize() + bmd.getTotalByteSize()); + } + schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata() .get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount(); final List splitGroup = new ArrayList(); @@ -300,4 +312,9 @@ protected ParquetInputSplit getSplit( public List getFiltedBlocks() { return filtedBlocks; } + + @Override + public SerDeStats getStats() { + return serDeStats; + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java index 669d56f..d6f1b7a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java @@ -34,6 +34,7 @@ import org.apache.hadoop.hive.ql.exec.TaskFactory; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanWork; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; @@ -90,8 +91,9 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx opProcCtx, if (parseCtx.getQueryProperties().isAnalyzeCommand()) { boolean partialScan = parseCtx.getQueryProperties().isPartialScanAnalyzeCommand(); boolean noScan = parseCtx.getQueryProperties().isNoScanAnalyzeCommand(); - if (inputFormat.equals(OrcInputFormat.class)) { - // For ORC, all the following statements are the same + if (inputFormat.equals(OrcInputFormat.class) || + inputFormat.equals(MapredParquetInputFormat.class)) { + // For ORC and Parquet, all the following statements are the same // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan; // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan; diff --git ql/src/test/queries/clientpositive/parquet_analyze.q ql/src/test/queries/clientpositive/parquet_analyze.q new file mode 100644 index 0000000..0d7543e --- /dev/null +++ ql/src/test/queries/clientpositive/parquet_analyze.q @@ -0,0 +1,33 @@ +CREATE TABLE parquet_create_people_staging ( + id int, + first_name string, + last_name string, + address string, + salary decimal, + start_date timestamp, + state string); + +LOAD DATA LOCAL INPATH '../../data/files/parquet_create_people.txt' OVERWRITE INTO TABLE parquet_create_people_staging; + +CREATE TABLE parquet_create_people ( + id int, + first_name string, + last_name string, + address string, + salary decimal, + start_date timestamp, + state string) +STORED AS parquet; + +INSERT OVERWRITE TABLE parquet_create_people SELECT * FROM parquet_create_people_staging ORDER BY id; + +-- describe the table first. This should contain un-updated stats. +DESC FORMATTED parquet_create_people; + +-- now run noscan and re-check the stats, and they should be updated. +ANALYZE TABLE parquet_create_people COMPUTE STATISTICS noscan; +DESC FORMATTED parquet_create_people; + +-- clean up +DROP TABLE parquet_create_people_staging; +DROP TABLE parquet_create_people; diff --git ql/src/test/results/clientpositive/parquet_analyze.q.out ql/src/test/results/clientpositive/parquet_analyze.q.out new file mode 100644 index 0000000..4429c0a --- /dev/null +++ ql/src/test/results/clientpositive/parquet_analyze.q.out @@ -0,0 +1,179 @@ +PREHOOK: query: CREATE TABLE parquet_create_people_staging ( + id int, + first_name string, + last_name string, + address string, + salary decimal, + start_date timestamp, + state string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_create_people_staging +POSTHOOK: query: CREATE TABLE parquet_create_people_staging ( + id int, + first_name string, + last_name string, + address string, + salary decimal, + start_date timestamp, + state string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_create_people_staging +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_create_people.txt' OVERWRITE INTO TABLE parquet_create_people_staging +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@parquet_create_people_staging +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_create_people.txt' OVERWRITE INTO TABLE parquet_create_people_staging +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@parquet_create_people_staging +PREHOOK: query: CREATE TABLE parquet_create_people ( + id int, + first_name string, + last_name string, + address string, + salary decimal, + start_date timestamp, + state string) +STORED AS parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_create_people +POSTHOOK: query: CREATE TABLE parquet_create_people ( + id int, + first_name string, + last_name string, + address string, + salary decimal, + start_date timestamp, + state string) +STORED AS parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_create_people +PREHOOK: query: INSERT OVERWRITE TABLE parquet_create_people SELECT * FROM parquet_create_people_staging ORDER BY id +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_create_people_staging +PREHOOK: Output: default@parquet_create_people +POSTHOOK: query: INSERT OVERWRITE TABLE parquet_create_people SELECT * FROM parquet_create_people_staging ORDER BY id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_create_people_staging +POSTHOOK: Output: default@parquet_create_people +POSTHOOK: Lineage: parquet_create_people.address SIMPLE [(parquet_create_people_staging)parquet_create_people_staging.FieldSchema(name:address, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_create_people.first_name SIMPLE [(parquet_create_people_staging)parquet_create_people_staging.FieldSchema(name:first_name, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_create_people.id SIMPLE [(parquet_create_people_staging)parquet_create_people_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_create_people.last_name SIMPLE [(parquet_create_people_staging)parquet_create_people_staging.FieldSchema(name:last_name, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_create_people.salary SIMPLE [(parquet_create_people_staging)parquet_create_people_staging.FieldSchema(name:salary, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: parquet_create_people.start_date SIMPLE [(parquet_create_people_staging)parquet_create_people_staging.FieldSchema(name:start_date, type:timestamp, comment:null), ] +POSTHOOK: Lineage: parquet_create_people.state SIMPLE [(parquet_create_people_staging)parquet_create_people_staging.FieldSchema(name:state, type:string, comment:null), ] +PREHOOK: query: -- describe the table first. This should contain un-updated stats. +DESC FORMATTED parquet_create_people +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@parquet_create_people +POSTHOOK: query: -- describe the table first. This should contain un-updated stats. +DESC FORMATTED parquet_create_people +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@parquet_create_people +# col_name data_type comment + +id int +first_name string +last_name string +address string +salary decimal(10,0) +start_date timestamp +state string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} + numFiles 1 + numRows 100 + rawDataSize 700 + totalSize 6623 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe +InputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: -- now run noscan and re-check the stats, and they should be updated. +ANALYZE TABLE parquet_create_people COMPUTE STATISTICS noscan +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_create_people +PREHOOK: Output: default@parquet_create_people +POSTHOOK: query: -- now run noscan and re-check the stats, and they should be updated. +ANALYZE TABLE parquet_create_people COMPUTE STATISTICS noscan +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_create_people +POSTHOOK: Output: default@parquet_create_people +PREHOOK: query: DESC FORMATTED parquet_create_people +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@parquet_create_people +POSTHOOK: query: DESC FORMATTED parquet_create_people +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@parquet_create_people +# col_name data_type comment + +id int +first_name string +last_name string +address string +salary decimal(10,0) +start_date timestamp +state string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} + numFiles 1 + numRows 100 + rawDataSize 5952 + totalSize 6623 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe +InputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: -- clean up +DROP TABLE parquet_create_people_staging +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@parquet_create_people_staging +PREHOOK: Output: default@parquet_create_people_staging +POSTHOOK: query: -- clean up +DROP TABLE parquet_create_people_staging +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@parquet_create_people_staging +POSTHOOK: Output: default@parquet_create_people_staging +PREHOOK: query: DROP TABLE parquet_create_people +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@parquet_create_people +PREHOOK: Output: default@parquet_create_people +POSTHOOK: query: DROP TABLE parquet_create_people +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@parquet_create_people +POSTHOOK: Output: default@parquet_create_people