diff --git data/files/UserVisits.dat data/files/UserVisits.dat new file mode 100644 index 0000000..f56c5a1 --- /dev/null +++ data/files/UserVisits.dat @@ -0,0 +1,55 @@ +170.131.22.2|13rdgckzlcblruc.html|1984-8-7|336.869186722|NuSearch Spider|HUN|HUN-NL|remnants|3 +162.114.4.2|6xpirzjeytxdjsmwtmyeugkesratmpvamliekrijlgmvyyrslqwgw.html|1978-1-9|331.791153595|Superdownloads Spiderma|AUT|AUT-ZR|MHD|8 +177.110.45.18|11zvmoamsyaameokoeylbkivgquksibqbalnpmailbiyfxitbhfdroyxesixbjndkyqzl.html|1986-9-25|411.968497603|Mozilla/4.0|FLK|FLK-GB|apj@as.arizona.edu.|7 +157.111.12.37|44mvdnls.html|2002-7-3|486.660926201|PHP/4.0.|FIN|FIN-CZ|diffuse|3 +161.100.45.22|14ceyigx.html|1978-10-26|399.80234522|NP/0.1|BEN|BEN-CA|region|8 +164.118.48.16|28axfinfqwdcwoorukpwqvqoxxeuivbniclnkytavwdslrj.html|1996-12-8|298.335411612|MSNBOT_Mobile MSMOBOT Mozilla/2.0|USA|USA-IO|medium|1 +153.104.13.11|19aysprojntmnwymfdkaznbqxprxxaissjqkzhzivsvipuvuxfuxsvnqlfnigvby.html|1976-10-6|146.309480768|WebSearch.COM.AU/3.0.1|MCO|MCO-YD|state|5 +150.112.45.27|12hcaewxiswjeezfnlulkenwubaqsitpuarufosogoxls.html|1995-6-19|173.469334335|WinkBot/0.06|PHL|PHL-NN|important|2 +152.108.39.16|36umg.html|1974-3-28|269.969215988|GSiteCrawler/v1.xx rev. xxx|MNG|MNG-HI|...)|6 +174.119.41.16|60yxoboskwpyfin.html|2002-7-17|436.113482675|Infoseek SideWinder/2.0B|NIC|NIC-JP|data|1 +165.116.21.12|70difiadhmrvragggmoaufnuwwbakbjntnwzvxcdjtybufiarwbmcphzmizwkikewh.html|1984-2-6|13.099044572|WWWeasel Robot v1.00|THA|THA-FO|bubbles|6 +155.128.42.14|21brkepinqiwvtmfmebjckkhwevhxaesogkykzgyqpuligrul.html|1986-7-29|347.800952938|Mozilla/4.0 compatible ZyBorg/1.0|IRN|IRN-YS|conduction|1 +156.131.31.12|14nbaaoablhxrlvbfgrwcxktvshtkoqzddbdepegbmesxztdglzjjkc.html|2002-7-30|85.7691140217|Java1.1.xx.|BRA|BRA-BL|circumstellar|9 +159.122.42.18|4xfydvopxveeduudfzodxkbczvdlzou.html|1989-9-20|332.572440865|Metaeuro Web Crawler/0.2|LUX|LUX-SD|kinematics|7 +151.104.39.45|65psclahgvasawczpyicyxkuqzwpbowghmzkxzsdvtwwpzvfydiwbsqrrmhtbezjqyuo.html|2002-1-13|190.528735328|JobSpider_BA/1.|UGA|UGA-PY|pulsars:|7 +159.132.24.22|18vhcbzhhblfbayejcybyibwqsgzlkmswizyjzgrbrw.html|1978-1-2|182.368755789|Piffany_Web_Scraper_v0.|ITA|ITA-NJ|nonthermal|1 +170.101.17.16|40prmxavsjoizdzkgsncesndxebatfwvrmmejnacxol.html|1989-9-1|41.4163486896|Mozilla/4.01 [en]|ZAF|ZAF-AK|Scuti|6 +171.124.38.2|29nripzogexadckoiaoafxvtkrxksdqgveydtxsabpbfsltbmibrfwlqojagmr.html|1979-6-12|192.085693167|IconSurf/2.0 favicon monitor|SVN|SVN-DY|systems|5 +178.128.29.41|24tmrndfialwvkwybuspjyexlkiamebwtvilimqqncnimkgofzepximj.html|2000-7-8|276.89796127|obidos-bot|SLB|SLB-RL|(...|4 +175.101.24.43|70dcfbcotdzhfhuhquyosbcviglrkrakddmifpxzswg.html|1978-3-16|131.775726872|Mozilla/4.0|BMU|BMU-BR|spiral|6 +155.102.37.30|99cyllzbnsowifxdxsdmiseiceeriaaoucmgnlhaewxmbvqynulwmpepujhckhqfjdmxpuyt.html|1975-5-4|311.052004479|WebSearch.COM.AU/3.0.1|NLD|NLD-GX|Herbig-Haro|6 +156.105.11.18|1nczmzpivhbgn.html|1992-9-19|36.9747263531|Search/1.0|GLP|GLP-DJ|observations|3 +164.115.38.23|79bvcojctkaugbcterbzfykwvesklokgilbkalntvoocqqvuixunvekqjcburlbzxckxnyrjm.html|1991-4-20|267.047961774|Journster.com RSS/Atom aggregator 0.5|HKG|HKG-PK|radio|2 +179.133.2.36|12azizhsdhdgdpidjgmdeyzmfhdwsbezbeyjegcioforvxvfehjigiulqyhizmhargkwmmeartsnrosvvbdbkynawvi.html|1999-12-9|481.463770712|LeechGet 200x|SCG|SCG-XF|instruments|8 +178.107.45.18|45mbziaowxegkhzcmbsyrextgqjbyezodmqduqrqnwxydwaqytopxmidcsfbwfparfemvwdjtaiwxjnvcclaotdrmjs.html|1983-4-13|51.6686671965|IlTrovatore/1.2|HND|HND-AN|dynamics|2 
+162.117.17.14|17tkabzxynnqswezhqmkvrlfycpmxqowlhgligihuwxmscmasylopwuozjawaotlwaxfggmack.html|2001-12-24|161.048060104|Mozilla/4.5 [en]C-CCK-MCD {TLC;RETAIL}|RWA|RWA-QE|rays|9 +178.119.40.7|48amqtmqxsjgrmjkszztfpegqzapidysnze.html|1987-4-3|492.988714137|Mozilla/4.0|AUT|AUT-ZR|cosmology:|8 +160.119.18.18|15yufqaoxpuqwb.html|1979-7-22|394.694548614|scooter-venus-3.0.vn|MCO|MCO-YD|outflows|1 +162.112.21.25|21boum.html|1991-2-6|165.368136543|LinkProver 2.|TCA|TCA-IS|spots|8 +176.112.31.17|20gblxgjcvpu.html|1991-8-5|78.2740990152|Mozilla/4.0|BMU|BMU-BR|masses|2 +166.130.12.13|9izokfebomgsiifyzrsepbbemutvj.html|2003-12-5|188.600736756|WWW-Mechanize/1.1|TGO|TGO-WB|bursts|5 +171.100.18.39|97sxfsgahjujwzlszmxkahyslcobrrlx.html|1985-11-21|143.277058506|Overture-WebCrawler/3.8/Fresh|SAU|SAU-KL|interferometric|5 +152.122.43.35|85zdszgzonsxkqbrkthtceiuzjsedwvghvkzvqzj.html|1989-12-1|315.628996565|moget/x.x|UMI|UMI-VU|Galaxy:|2 +157.133.36.37|15xnilzhtqjsxhhbzazrflznupllyhvdbsqjeqqyharfiyhhyhzdszrnpcyoktslljvqam.html|1990-3-20|426.498017786|PrivacyFinder/1.|UZB|UZB-ZJ|nebulae|7 +161.134.11.11|96kvrofepctfbesrphjiznjktygntkkubupsjvxyxrdzvwrkeasdobohauvueg.html|1984-6-6|280.039128409|Waypath development crawler - info at waypath dot co|IDN|IDN-BH|supergiants|6 +163.123.23.13|19rkrtwumqwmnnzisxyeesqacwolpypyxhipaejnvfzitzrlwqqbigblcqxrpnqmuybudkiyqhhjgzvdpleysg.html|1977-10-11|86.3390049695|Opera/5.0|LSO|LSO-PW|testing|7 +166.126.40.21|52ejufqiidwioozorbnsjxezfwaucndbihldnblvehdtwchoeuhoslnyioslbwmkdynrzymegpy.html|1990-10-20|125.582281932|Mozilla/4.0|BTN|BTN-HP|catalogs|9 +158.133.10.19|87nzdhsnzhkylakazmkvctgaaxtrafpxscxvjqijxthitrj.html|1982-10-5|481.583542862|larbin|GAB|GAB-CS|angular|8 +173.104.45.8|49sdptdphxjlbiwrbbrsebwqquadx.html|1981-5-2|41.3182727245|LECodeChecker/3.0 libgetdoc/1.|AUS|AUS-AV|bands|6 +160.101.31.43|6lrepnctlanokfhla.html|1973-9-7|133.29867101|sogou develop spide|SWE|SWE-TM|time|5 +150.127.33.8|22oeawpxhqahkvtaecwp.html|1999-3-16|398.882494477|W3C-WebCon/5.x.x libwww/5.x.|ISR|ISR-DY|history|1 +154.114.47.36|2mzzsgievabpkaoqegadbbjxwkutdisnvrmox.html|1981-7-24|332.760102125|mammoth/1.0|AUT|AUT-ZR|FUNCTION|3 +155.108.15.24|22beewtbnpw.html|1996-6-7|393.470347637|Scrubby/3.0|ABW|ABW-NB|horizontal-branch|4 +177.120.40.39|48itvyjulckeddslsuayoguojzhvqvmfgvyctiwflhj.html|1977-8-12|239.601807636|webmeasurement-bot, http://rvs.informatik.uni-leipzig.d|WSM|WSM-UF|are|3 +179.123.41.31|46eppnympstjuhivvpritvotqmivgsfmdkbtxafns.html|2001-11-26|258.55616439|Mozilla/2.0|SYR|SYR-XP|photometric|1 +175.100.9.4|32fjrnrlabonc.html|1988-10-22|344.394849153|Snapbot/1.|GUF|GUF-KP|acceleration|2 +155.126.7.17|72wufwnsdsqncftnvdcunnknzqnaiyflmcgsytkbmbpogicblew.html|1981-12-5|398.334494319|UKWizz/Nutch-0.8.1|NIC|NIC-JP|Kuiper|4 +150.118.20.31|1mbyargbxtnjtivflxzzredcfbtehxbxjcwkucmrwaaqiwvutuulzxnezhi.html|1982-8-27|168.936669894|Mozilla/4.0|IRL|IRL-NN|cataclysmic|5 +177.116.39.36|84maivbmcqggefkjtsde.html|1982-6-11|88.121669797|Mozilla/4.0|ARE|ARE-MX|instruments|1 +168.119.19.26|73vhjursdvxateuvrxsspwwfdbsoqfegeannuegyadzuitparisgfomiqfxhkcnocacxfivfmuzuopvfynmdcyl.html|1991-11-17|397.829289621|webbandit/4.xx.|NIC|NIC-JP|dust|2 +154.100.36.32|57rylepuglpfqvjwkxgrtftvqkjzjwsznjyzontuzizqdimofsfzxzuojeot.html|1999-1-5|334.714055649|RRC|GTM|GTM-VH|blue|7 +153.112.2.11|6pkwxtlgkkxoqtxpgrullqxjauquvmlkcwhzpsgzdeotymieddqpu.html|1975-8-6|348.218411093|Wotbox/alpha0.6|MNP|MNP-UD|supernovae:|6 
+150.107.15.22|53gohsgrvrjgfptttlpfipgsnijsrhxsyeggwnysfhykxrdqdsvlicdwkmpcumut.html|1978-8-2|355.771603423|Mozilla/3.0|DEU|DEU-PU|stars|4 +150.126.27.44|0rgxbnwiqebsmszpkvfpxvhkleebngzxxgvzt.html|1989-5-18|467.800755054|Mozilla/3.01|ZWE|ZWE-TS|system|3 +151.101.32.3|34btbqii.html|1998-8-1|131.055972797|Orca Browser|THA|THA-FO|late-type|5 diff --git data/files/binary.txt data/files/binary.txt new file mode 100644 index 0000000..c6fbdee --- /dev/null +++ data/files/binary.txt @@ -0,0 +1,10 @@ +the quick brown fox jumped over the lazy little dog +today is nice outside +the quick brown fox jumped over the lazy little dog + +wikipedia is a great source of information +the quick brown fox jumped over the lazy little dog + +estimating the number of distinct values is a hard problem + +the quick brown fox jumped over the lazy little dog diff --git data/files/bool.txt data/files/bool.txt new file mode 100644 index 0000000..065359b --- /dev/null +++ data/files/bool.txt @@ -0,0 +1,33 @@ +true +false +true +true +true +false +false +false +false +true +true +true +true +false + +false +true +true +false +false +false +false +false +false +false +false +true +false +false +false +true +true +false diff --git data/files/double.txt data/files/double.txt new file mode 100644 index 0000000..66c030b --- /dev/null +++ data/files/double.txt @@ -0,0 +1,16 @@ +55.33 +44.2 +435.33 +324.33 +324.33 +44.2 +55.3 +55.3 +0.0 + +66.4 +23.22 +-87.2 + +33.44 +55.3 diff --git data/files/employee.dat data/files/employee.dat new file mode 100644 index 0000000..1c1d342 --- /dev/null +++ data/files/employee.dat @@ -0,0 +1,12 @@ +16|john +17|robert +18|andrew +19|katty +21|tom +22|tim +23|james +24|paul +27|edward +29|alan +31|kerry +34|terri diff --git data/files/employee2.dat data/files/employee2.dat new file mode 100644 index 0000000..b70b2dd --- /dev/null +++ data/files/employee2.dat @@ -0,0 +1,8 @@ +16|john +17|robert +18|andrew +19|katty +27|edward +29|alan +31|kerry + diff --git data/files/int.txt data/files/int.txt new file mode 100644 index 0000000..9553ed2 --- /dev/null +++ data/files/int.txt @@ -0,0 +1,12 @@ +4 +252 +233 + +343 +43 +45 +344 +22 +54 +8 +13 diff --git ivy/libraries.properties ivy/libraries.properties index 7ac6778..2794049 100644 --- ivy/libraries.properties +++ ivy/libraries.properties @@ -64,3 +64,4 @@ slf4j-api.version=1.6.1 slf4j-log4j12.version=1.6.1 velocity.version=1.5 zookeeper.version=3.4.3 +javolution.version=5.5.1 diff --git metastore/if/hive_metastore.thrift metastore/if/hive_metastore.thrift index d4fad72..7230107 100755 --- metastore/if/hive_metastore.thrift +++ metastore/if/hive_metastore.thrift @@ -194,6 +194,67 @@ struct Index { 10: bool deferredRebuild } +// column statistics +struct BooleanColumnStatsData { +1: required i64 numTrues, +2: required i64 numFalses, +3: required i64 numNulls +} + +struct DoubleColumnStatsData { +1: required double lowValue, +2: required double highValue, +3: required i64 numNulls, +4: required i64 numDVs +} + +struct LongColumnStatsData { +1: required i64 lowValue, +2: required i64 highValue, +3: required i64 numNulls, +4: required i64 numDVs +} + +struct StringColumnStatsData { +1: required i64 maxColLen, +2: required double avgColLen, +3: required i64 numNulls, +4: required i64 numDVs +} + +struct BinaryColumnStatsData { +1: required i64 maxColLen, +2: required double avgColLen, +3: required i64 numNulls +} + +union ColumnStatisticsData { +1: BooleanColumnStatsData booleanStats, +2: LongColumnStatsData longStats, +3: DoubleColumnStatsData doubleStats, 
+4: StringColumnStatsData stringStats, +5: BinaryColumnStatsData binaryStats +} + +struct ColumnStatisticsObj { +1: required string colName, +2: required string colType, +3: required ColumnStatisticsData statsData +} + +struct ColumnStatisticsDesc { +1: required bool isTblLevel, +2: required string dbName, +3: required string tableName, +4: optional string partName, +5: optional i64 lastAnalyzed +} + +struct ColumnStatistics { +1: required ColumnStatisticsDesc statsDesc, +2: required list statsObj; +} + // schema of the table/query results etc. struct Schema { // column names, types, comments @@ -253,6 +314,10 @@ exception ConfigValSecurityException { 1: string message } +exception InvalidInputException { + 1: string message +} + /** * This interface is live. */ @@ -472,6 +537,37 @@ service ThriftHiveMetastore extends fb303.FacebookService list get_index_names(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1) throws(1:MetaException o2) + // column statistics interfaces + + // update APIs persist the column statistics object(s) that are passed in. If statistics already + // exists for one or more columns, the existing statistics will be overwritten. The update APIs + // validate that the dbName, tableName, partName, colName[] passed in as part of the ColumnStatistics + // struct are valid, throws InvalidInputException/NoSuchObjectException if found to be invalid + bool update_table_column_statistics(1:ColumnStatistics stats_obj) throws (1:NoSuchObjectException o1, + 2:InvalidObjectException o2, 3:MetaException o3, 4:InvalidInputException o4) + bool update_partition_column_statistics(1:ColumnStatistics stats_obj) throws (1:NoSuchObjectException o1, + 2:InvalidObjectException o2, 3:MetaException o3, 4:InvalidInputException o4) + + // get APIs return the column statistics corresponding to db_name, tbl_name, [part_name], col_name if + // such statistics exists. If the required statistics doesn't exist, get APIs throw NoSuchObjectException + // For instance, if get_table_column_statistics is called on a partitioned table for which only + // partition level column stats exist, get_table_column_statistics will throw NoSuchObjectException + ColumnStatistics get_table_column_statistics(1:string db_name, 2:string tbl_name, 3:string col_name) throws + (1:NoSuchObjectException o1, 2:MetaException o2, 3:InvalidInputException o3, 4:InvalidObjectException o4) + ColumnStatistics get_partition_column_statistics(1:string db_name, 2:string tbl_name, 3:string part_name, + 4:string col_name) throws (1:NoSuchObjectException o1, 2:MetaException o2, + 3:InvalidInputException o3, 4:InvalidObjectException o4) + + // delete APIs attempt to delete column statistics, if found, associated with a given db_name, tbl_name, [part_name] + // and col_name. If the delete API doesn't find the statistics record in the metastore, throws NoSuchObjectException + // Delete API validates the input and if the input is invalid throws InvalidInputException/InvalidObjectException. 
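// For orientation, a rough, hypothetical sketch (not part of this patch) of how the structs and
// service calls above compose on the Java side, using the Thrift-generated beans and the
// HiveMetaStoreClient wrappers added later in this patch. The db/table/column names and the
// stat values (modeled loosely on the employee.dat test file above) are invented; the
// HiveMetaStoreClient(HiveConf) constructor is assumed to point at a configured metastore.

import java.util.Arrays;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;

public class ColumnStatsExample {
  public static void main(String[] args) throws Exception {
    // Describe where the stats belong: table level, so no partName is set.
    // lastAnalyzed is filled in server-side by update_table_column_statistics.
    ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc();
    statsDesc.setIsTblLevel(true);
    statsDesc.setDbName("default");          // hypothetical db
    statsDesc.setTableName("employee");      // hypothetical table

    // Per-column statistics for an integral column go into LongColumnStatsData.
    LongColumnStatsData longStats = new LongColumnStatsData();
    longStats.setLowValue(16);
    longStats.setHighValue(34);
    longStats.setNumNulls(0);
    longStats.setNumDVs(12);

    // ColumnStatisticsData is a union; setting one branch selects it.
    ColumnStatisticsData statsData = new ColumnStatisticsData();
    statsData.setLongStats(longStats);

    ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
    statsObj.setColName("empid");            // hypothetical column
    statsObj.setColType("long");             // matches the "long" branch in ObjectStore's conversion
    statsObj.setStatsData(statsData);

    ColumnStatistics colStats = new ColumnStatistics();
    colStats.setStatsDesc(statsDesc);
    colStats.setStatsObj(Arrays.asList(statsObj));

    // The client wrappers added later in this patch delegate to the thrift calls declared above.
    HiveMetaStoreClient client = new HiveMetaStoreClient(new HiveConf());
    client.updateTableColumnStatistics(colStats);      // inserts, or overwrites existing stats
    ColumnStatistics fetched =
        client.getTableColumnStatistics("default", "employee", "empid"); // NoSuchObjectException if absent
    client.deleteTableColumnStatistics("default", "employee", "empid");
  }
}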
+ bool delete_partition_column_statistics(1:string db_name, 2:string tbl_name, 3:string part_name, 4:string col_name) throws + (1:NoSuchObjectException o1, 2:MetaException o2, 3:InvalidObjectException o3, + 4:InvalidInputException o4) + bool delete_table_column_statistics(1:string db_name, 2:string tbl_name, 3:string col_name) throws + (1:NoSuchObjectException o1, 2:MetaException o2, 3:InvalidObjectException o3, + 4:InvalidInputException o4) + //authorization privileges bool create_role(1:Role role) throws(1:MetaException o1) diff --git metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java index 8fec13d..9e38d25 100644 --- metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java +++ metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java @@ -55,6 +55,9 @@ import org.apache.hadoop.hive.common.metrics.Metrics; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.ConfigValSecurityException; import org.apache.hadoop.hive.metastore.api.Constants; import org.apache.hadoop.hive.metastore.api.Database; @@ -65,6 +68,7 @@ import org.apache.hadoop.hive.metastore.api.HiveObjectRef; import org.apache.hadoop.hive.metastore.api.HiveObjectType; import org.apache.hadoop.hive.metastore.api.Index; import org.apache.hadoop.hive.metastore.api.IndexAlreadyExistsException; +import org.apache.hadoop.hive.metastore.api.InvalidInputException; import org.apache.hadoop.hive.metastore.api.InvalidObjectException; import org.apache.hadoop.hive.metastore.api.InvalidOperationException; import org.apache.hadoop.hive.metastore.api.InvalidPartitionException; @@ -620,7 +624,7 @@ public class HiveMetaStore extends ThriftHiveMetastore { private void drop_database_core(RawStore ms, final String name, final boolean deleteData, final boolean cascade) throws NoSuchObjectException, InvalidOperationException, MetaException, - IOException { + IOException, InvalidObjectException, InvalidInputException { boolean success = false; Database db = null; List tablePaths = new ArrayList(); @@ -753,6 +757,10 @@ public class HiveMetaStore extends ThriftHiveMetastore { success = true; } catch (IOException e) { throw new MetaException(e.getMessage()); + } catch (InvalidInputException e) { + throw new MetaException(e.getMessage()); + } catch (InvalidObjectException e) { + throw new MetaException(e.getMessage()); } finally { endFunction("drop_database", success); } @@ -1006,7 +1014,7 @@ public class HiveMetaStore extends ThriftHiveMetastore { private void drop_table_core(final RawStore ms, final String dbname, final String name, final boolean deleteData) - throws NoSuchObjectException, MetaException, IOException { + throws NoSuchObjectException, MetaException, IOException, InvalidObjectException, InvalidInputException { boolean success = false; boolean isExternal = false; @@ -1140,10 +1148,13 @@ public class HiveMetaStore extends ThriftHiveMetastore { * @return * @throws MetaException * @throws IOException + * @throws InvalidInputException + * @throws InvalidObjectException + * @throws NoSuchObjectException */ private List dropPartitionsAndGetLocations(RawStore ms, String dbName, String 
tableName, Path tablePath, List partitionKeys, boolean checkLocation) - throws MetaException, IOException { + throws MetaException, IOException, NoSuchObjectException, InvalidObjectException, InvalidInputException { int partitionBatchSize = HiveConf.getIntVar(hiveConf, ConfVars.METASTORE_BATCH_RETRIEVE_MAX); @@ -1193,6 +1204,10 @@ public class HiveMetaStore extends ThriftHiveMetastore { success = true; } catch (IOException e) { throw new MetaException(e.getMessage()); + } catch (InvalidInputException e) { + throw new MetaException(e.getMessage()); + } catch (InvalidObjectException e) { + throw new MetaException(e.getMessage()); } finally { endFunction("drop_table", success); } @@ -1605,7 +1620,7 @@ public class HiveMetaStore extends ThriftHiveMetastore { private boolean drop_partition_common(RawStore ms, String db_name, String tbl_name, List part_vals, final boolean deleteData) - throws MetaException, NoSuchObjectException, IOException { + throws MetaException, NoSuchObjectException, IOException, InvalidObjectException, InvalidInputException { boolean success = false; Path partPath = null; @@ -1688,6 +1703,10 @@ public class HiveMetaStore extends ThriftHiveMetastore { ret = drop_partition_common(getMS(), db_name, tbl_name, part_vals, deleteData); } catch (IOException e) { throw new MetaException(e.getMessage()); + } catch (InvalidInputException e) { + throw new MetaException(e.getMessage()); + } catch (InvalidObjectException e) { + throw new MetaException(e.getMessage()); } finally { endFunction("drop_partition", ret); } @@ -2210,7 +2229,7 @@ public class HiveMetaStore extends ThriftHiveMetastore { private boolean drop_partition_by_name_core(final RawStore ms, final String db_name, final String tbl_name, final String part_name, final boolean deleteData) throws NoSuchObjectException, - MetaException, TException, IOException { + MetaException, TException, IOException, InvalidObjectException, InvalidInputException { List partVals = null; try { @@ -2235,6 +2254,10 @@ public class HiveMetaStore extends ThriftHiveMetastore { part_name, deleteData); } catch (IOException e) { throw new MetaException(e.getMessage()); + } catch (InvalidInputException e) { + throw new MetaException(e.getMessage()); + } catch (InvalidObjectException e) { + throw new MetaException(e.getMessage()); } finally { endFunction("drop_partition_by_name", ret); } @@ -2396,6 +2419,10 @@ public class HiveMetaStore extends ThriftHiveMetastore { indexName, deleteData); } catch (IOException e) { throw new MetaException(e.getMessage()); + } catch (InvalidInputException e) { + throw new MetaException(e.getMessage()); + } catch (InvalidObjectException e) { + throw new MetaException(e.getMessage()); } finally { endFunction("drop_index_by_name", ret); } @@ -2406,7 +2433,7 @@ public class HiveMetaStore extends ThriftHiveMetastore { private boolean drop_index_by_name_core(final RawStore ms, final String dbName, final String tblName, final String indexName, final boolean deleteData) throws NoSuchObjectException, - MetaException, TException, IOException { + MetaException, TException, IOException, InvalidObjectException, InvalidInputException { boolean success = false; Path tblPath = null; @@ -2520,6 +2547,195 @@ public class HiveMetaStore extends ThriftHiveMetastore { return ret; } + private String lowerCaseConvertPartName(String partName) throws MetaException { + boolean isFirst = true; + Map partSpec = Warehouse.makeEscSpecFromName(partName); + String convertedPartName = new String(); + + for (Map.Entry entry : partSpec.entrySet()) { + 
String partColName = entry.getKey(); + String partColVal = entry.getValue(); + + if (!isFirst) { + convertedPartName += "/"; + } else { + isFirst = false; + } + convertedPartName += partColName.toLowerCase() + "=" + partColVal; + } + return convertedPartName; + } + + public ColumnStatistics get_table_column_statistics(String dbName, String tableName, + String colName) throws NoSuchObjectException, MetaException, TException, InvalidInputException, + InvalidObjectException + { + dbName = dbName.toLowerCase(); + tableName = tableName.toLowerCase(); + colName = colName.toLowerCase(); + startFunction("get_column_statistics_by_table: db=" + dbName + " table=" + tableName + + " column=" + colName); + ColumnStatistics statsObj = null; + try { + statsObj = getMS().getTableColumnStatistics(dbName, tableName, colName); + } finally { + endFunction("get_column_statistics_by_table: ", statsObj != null); + } + return statsObj; + } + + public ColumnStatistics get_partition_column_statistics(String dbName, String tableName, + String partName, String colName) throws NoSuchObjectException, MetaException, + InvalidInputException, TException,InvalidObjectException + { + dbName = dbName.toLowerCase(); + tableName = tableName.toLowerCase(); + colName = colName.toLowerCase(); + String convertedPartName = lowerCaseConvertPartName(partName); + startFunction("get_column_statistics_by_partition: db=" + dbName + " table=" + tableName + + " partition=" + convertedPartName + " column=" + colName); + ColumnStatistics statsObj = null; + + try { + List partVals = getPartValsFromName(getMS(), dbName, tableName, partName); + statsObj = getMS().getPartitionColumnStatistics(dbName, tableName, convertedPartName, + partVals, colName); + } finally { + endFunction("get_column_statistics_by_partition: ", statsObj != null); + } + return statsObj; + } + + public boolean update_table_column_statistics(ColumnStatistics colStats) + throws NoSuchObjectException,InvalidObjectException,MetaException,TException, + InvalidInputException + { + String dbName = null; + String tableName = null; + String colName = null; + ColumnStatisticsDesc statsDesc = colStats.getStatsDesc(); + dbName = statsDesc.getDbName().toLowerCase(); + tableName = statsDesc.getTableName().toLowerCase(); + + statsDesc.setDbName(dbName); + statsDesc.setTableName(tableName); + long time = System.currentTimeMillis() / 1000; + statsDesc.setLastAnalyzed(time); + + List statsObjs = colStats.getStatsObj(); + + for (ColumnStatisticsObj statsObj:statsObjs) { + colName = statsObj.getColName().toLowerCase(); + statsObj.setColName(colName); + startFunction("write_column_statistics: db=" + dbName + " table=" + tableName + + " column=" + colName); + } + + colStats.setStatsDesc(statsDesc); + colStats.setStatsObj(statsObjs); + + boolean ret = false; + + try { + ret = getMS().updateTableColumnStatistics(colStats); + return ret; + } finally { + endFunction("write_column_statistics: ", ret != false); + } + } + + public boolean update_partition_column_statistics(ColumnStatistics colStats) + throws NoSuchObjectException,InvalidObjectException,MetaException,TException, + InvalidInputException + { + + String dbName = null; + String tableName = null; + String partName = null; + String colName = null; + + ColumnStatisticsDesc statsDesc = colStats.getStatsDesc(); + dbName = statsDesc.getDbName().toLowerCase(); + tableName = statsDesc.getTableName().toLowerCase(); + partName = lowerCaseConvertPartName(statsDesc.getPartName()); + + statsDesc.setDbName(dbName); + statsDesc.setTableName(tableName); + 
statsDesc.setPartName(partName); + + long time = System.currentTimeMillis() / 1000; + statsDesc.setLastAnalyzed(time); + + List statsObjs = colStats.getStatsObj(); + + for (ColumnStatisticsObj statsObj:statsObjs) { + colName = statsObj.getColName().toLowerCase(); + statsObj.setColName(colName); + startFunction("write_partition_column_statistics: db=" + dbName + " table=" + tableName + + " part=" + partName + "column=" + colName); + } + + colStats.setStatsDesc(statsDesc); + colStats.setStatsObj(statsObjs); + + boolean ret = false; + + try { + List partVals = getPartValsFromName(getMS(), dbName, + tableName, partName); + ret = getMS().updatePartitionColumnStatistics(colStats, partVals); + return ret; + } finally { + endFunction("write_partition_column_statistics: ", ret != false); + } + } + + public boolean delete_partition_column_statistics(String dbName, String tableName, + String partName, String colName) throws NoSuchObjectException, MetaException, + InvalidObjectException, TException, InvalidInputException + { + dbName = dbName.toLowerCase(); + tableName = tableName.toLowerCase(); + if (colName != null) { + colName = colName.toLowerCase(); + } + String convertedPartName = lowerCaseConvertPartName(partName); + startFunction("delete_column_statistics_by_partition: db=" + dbName + " table=" + tableName + + " partition=" + convertedPartName + " column=" + colName); + boolean ret = false; + + try { + List partVals = getPartValsFromName(getMS(), dbName, tableName, convertedPartName); + ret = getMS().deletePartitionColumnStatistics(dbName, tableName, + convertedPartName, partVals, colName); + } finally { + endFunction("delete_column_statistics_by_partition: ", ret != false); + } + return ret; + } + + public boolean delete_table_column_statistics(String dbName, String tableName, String colName) + throws NoSuchObjectException, MetaException, InvalidObjectException, TException, + InvalidInputException + { + dbName = dbName.toLowerCase(); + tableName = tableName.toLowerCase(); + + if (colName != null) { + colName = colName.toLowerCase(); + } + startFunction("delete_column_statistics_by_table: db=" + dbName + " table=" + tableName + + " column=" + colName); + + boolean ret = false; + try { + ret = getMS().deleteTableColumnStatistics(dbName, tableName, colName); + } finally { + endFunction("delete_column_statistics_by_table: ", ret != false); + } + return ret; + } + @Override public List get_partitions_by_filter(final String dbName, final String tblName, final String filter, final short maxParts) @@ -3197,6 +3413,7 @@ public class HiveMetaStore extends ThriftHiveMetastore { } + /** * Discard a current delegation token. 
* diff --git metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreClient.java metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreClient.java index 17b986c..0ae8a4c 100644 --- metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreClient.java +++ metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreClient.java @@ -40,12 +40,14 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; import org.apache.hadoop.hive.metastore.api.ConfigValSecurityException; import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.HiveObjectPrivilege; import org.apache.hadoop.hive.metastore.api.HiveObjectRef; import org.apache.hadoop.hive.metastore.api.Index; +import org.apache.hadoop.hive.metastore.api.InvalidInputException; import org.apache.hadoop.hive.metastore.api.InvalidObjectException; import org.apache.hadoop.hive.metastore.api.InvalidOperationException; import org.apache.hadoop.hive.metastore.api.InvalidPartitionException; @@ -935,6 +937,50 @@ public class HiveMetaStoreClient implements IMetaStoreClient { return client.get_indexes(dbName, tblName, max); } + /** {@inheritDoc} */ + public boolean updateTableColumnStatistics(ColumnStatistics statsObj) + throws NoSuchObjectException, InvalidObjectException, MetaException, TException, + InvalidInputException{ + return client.update_table_column_statistics(statsObj); + } + + /** {@inheritDoc} */ + public boolean updatePartitionColumnStatistics(ColumnStatistics statsObj) + throws NoSuchObjectException, InvalidObjectException, MetaException, TException, + InvalidInputException{ + return client.update_partition_column_statistics(statsObj); + } + + /** {@inheritDoc} */ + public ColumnStatistics getTableColumnStatistics(String dbName, String tableName,String colName) + throws NoSuchObjectException, MetaException, TException, InvalidInputException, + InvalidObjectException { + return client.get_table_column_statistics(dbName, tableName, colName); + } + + /** {@inheritDoc} */ + public ColumnStatistics getPartitionColumnStatistics(String dbName, String tableName, + String partName, String colName) throws NoSuchObjectException, MetaException, TException, + InvalidInputException, InvalidObjectException { + return client.get_partition_column_statistics(dbName, tableName, partName, colName); + } + + /** {@inheritDoc} */ + public boolean deletePartitionColumnStatistics(String dbName, String tableName, String partName, + String colName) throws NoSuchObjectException, InvalidObjectException, MetaException, + TException, InvalidInputException + { + return client.delete_partition_column_statistics(dbName, tableName, partName, colName); + } + + /** {@inheritDoc} */ + public boolean deleteTableColumnStatistics(String dbName, String tableName, String colName) + throws NoSuchObjectException, InvalidObjectException, MetaException, TException, + InvalidInputException + { + return client.delete_table_column_statistics(dbName, tableName, colName); + } + /** * @param db * @param tableName diff --git metastore/src/java/org/apache/hadoop/hive/metastore/IMetaStoreClient.java metastore/src/java/org/apache/hadoop/hive/metastore/IMetaStoreClient.java index 3883b5b..a7c68db 100644 --- 
metastore/src/java/org/apache/hadoop/hive/metastore/IMetaStoreClient.java +++ metastore/src/java/org/apache/hadoop/hive/metastore/IMetaStoreClient.java @@ -22,12 +22,14 @@ import java.util.List; import java.util.Map; import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; import org.apache.hadoop.hive.metastore.api.ConfigValSecurityException; import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.HiveObjectPrivilege; import org.apache.hadoop.hive.metastore.api.HiveObjectRef; import org.apache.hadoop.hive.metastore.api.Index; +import org.apache.hadoop.hive.metastore.api.InvalidInputException; import org.apache.hadoop.hive.metastore.api.InvalidObjectException; import org.apache.hadoop.hive.metastore.api.InvalidOperationException; import org.apache.hadoop.hive.metastore.api.InvalidPartitionException; @@ -726,6 +728,103 @@ public interface IMetaStoreClient { MetaException, TException; /** + * Write table level column statistics to persistent store + * @param statsObj + * @return boolean indicating the status of the operation + * @throws NoSuchObjectException + * @throws InvalidObjectException + * @throws MetaException + * @throws TException + * @throws InvalidInputException + */ + + public boolean updateTableColumnStatistics(ColumnStatistics statsObj) + throws NoSuchObjectException, InvalidObjectException, MetaException, TException, InvalidInputException; + + /** + * Write partition level column statistics to persistent store + * @param statsObj + * @return boolean indicating the status of the operation + * @throws NoSuchObjectException + * @throws InvalidObjectException + * @throws MetaException + * @throws TException + * @throws InvalidInputException + */ + + public boolean updatePartitionColumnStatistics(ColumnStatistics statsObj) + throws NoSuchObjectException, InvalidObjectException, MetaException, TException, InvalidInputException; + + /** + * Get table level column statistics given dbName, tableName and colName + * @param dbName + * @param tableName + * @param colName + * @return ColumnStatistics struct for a given db, table and col + * @throws NoSuchObjectException + * @throws MetaException + * @throws TException + * @throws InvalidInputException + * @throws InvalidObjectException + */ + + public ColumnStatistics getTableColumnStatistics(String dbName, String tableName, String colName) + throws NoSuchObjectException, MetaException, TException, + InvalidInputException, InvalidObjectException; + + /** + * Get partition level column statistics given dbName, tableName, partitionName and colName + * @param dbName + * @param tableName + * @param partitionName + * @param colName + * @return ColumnStatistics struct for a given db, table, partition and col + * @throws NoSuchObjectException + * @throws MetaException + * @throws TException + * @throws InvalidInputException + * @throws InvalidObjectException + */ + + public ColumnStatistics getPartitionColumnStatistics(String dbName, String tableName, + String partitionName, String colName) throws NoSuchObjectException, MetaException, TException, + InvalidInputException, InvalidObjectException; + + /** + * Delete partition level column statistics given dbName, tableName, partName and colName + * @param dbName + * @param tableName + * @param partName + * @param colName + * @return boolean indicating outcome of the operation + * @throws NoSuchObjectException + * @throws 
InvalidObjectException + * @throws MetaException + * @throws TException + * @throws InvalidInputException + */ + + public boolean deletePartitionColumnStatistics(String dbName, String tableName, + String partName, String colName) throws NoSuchObjectException, MetaException, + InvalidObjectException, TException, InvalidInputException; + + /** + * Delete table level column statistics given dbName, tableName and colName + * @param dbName + * @param tableName + * @param colName + * @return boolean indicating the outcome of the operation + * @throws NoSuchObjectException + * @throws MetaException + * @throws InvalidObjectException + * @throws TException + * @throws InvalidInputException + */ + + public boolean deleteTableColumnStatistics(String dbName, String tableName, String colName) throws + NoSuchObjectException, MetaException, InvalidObjectException, TException, InvalidInputException; + + /** * @param role * role object * @return true on success diff --git metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java index eff44b1..dc78b09 100644 --- metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java +++ metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java @@ -56,14 +56,23 @@ import org.apache.hadoop.hive.common.classification.InterfaceAudience; import org.apache.hadoop.hive.common.classification.InterfaceStability; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; +import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData; +import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.HiveObjectPrivilege; import org.apache.hadoop.hive.metastore.api.HiveObjectRef; import org.apache.hadoop.hive.metastore.api.HiveObjectType; import org.apache.hadoop.hive.metastore.api.Index; +import org.apache.hadoop.hive.metastore.api.InvalidInputException; import org.apache.hadoop.hive.metastore.api.InvalidObjectException; import org.apache.hadoop.hive.metastore.api.InvalidPartitionException; +import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; import org.apache.hadoop.hive.metastore.api.Order; @@ -77,6 +86,7 @@ import org.apache.hadoop.hive.metastore.api.Role; import org.apache.hadoop.hive.metastore.api.SerDeInfo; import org.apache.hadoop.hive.metastore.api.SkewedInfo; import org.apache.hadoop.hive.metastore.api.StorageDescriptor; +import org.apache.hadoop.hive.metastore.api.StringColumnStatsData; import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.api.Type; import org.apache.hadoop.hive.metastore.api.UnknownDBException; @@ -91,6 +101,7 @@ import org.apache.hadoop.hive.metastore.model.MIndex; import org.apache.hadoop.hive.metastore.model.MOrder; import org.apache.hadoop.hive.metastore.model.MPartition; import org.apache.hadoop.hive.metastore.model.MPartitionColumnPrivilege; +import 
org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics; import org.apache.hadoop.hive.metastore.model.MPartitionEvent; import org.apache.hadoop.hive.metastore.model.MPartitionPrivilege; import org.apache.hadoop.hive.metastore.model.MRole; @@ -100,6 +111,7 @@ import org.apache.hadoop.hive.metastore.model.MStorageDescriptor; import org.apache.hadoop.hive.metastore.model.MStringList; import org.apache.hadoop.hive.metastore.model.MTable; import org.apache.hadoop.hive.metastore.model.MTableColumnPrivilege; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; import org.apache.hadoop.hive.metastore.model.MTablePrivilege; import org.apache.hadoop.hive.metastore.model.MType; import org.apache.hadoop.hive.metastore.parser.ExpressionTree.ANTLRNoCaseStringStream; @@ -673,7 +685,7 @@ public class ObjectStore implements RawStore, Configurable { } } - public boolean dropTable(String dbName, String tableName) throws MetaException { + public boolean dropTable(String dbName, String tableName) throws MetaException, NoSuchObjectException, InvalidObjectException, InvalidInputException { boolean success = false; try { openTransaction(); @@ -701,6 +713,13 @@ public class ObjectStore implements RawStore, Configurable { if (partColGrants != null && partColGrants.size() > 0) { pm.deletePersistentAll(partColGrants); } + // delete column statistics if present + try { + deleteTableColumnStatistics(dbName, tableName, null); + } catch (NoSuchObjectException e) { + LOG.info("Found no table level column statistics associated with db " + dbName + + " table " + tableName + " record to delete"); + } preDropStorageDescriptor(tbl.getSd()); // then remove the table @@ -1267,7 +1286,7 @@ public class ObjectStore implements RawStore, Configurable { @Override public boolean dropPartition(String dbName, String tableName, - List part_vals) throws MetaException { + List part_vals) throws MetaException, NoSuchObjectException, InvalidObjectException, InvalidInputException { boolean success = false; try { openTransaction(); @@ -1287,8 +1306,12 @@ public class ObjectStore implements RawStore, Configurable { * drop the storage descriptor cleanly, etc.) 
* @param part - the MPartition to drop * @return whether the transaction committed successfully + * @throws InvalidInputException + * @throws InvalidObjectException + * @throws MetaException + * @throws NoSuchObjectException */ - private boolean dropPartitionCommon(MPartition part) { + private boolean dropPartitionCommon(MPartition part) throws NoSuchObjectException, MetaException, InvalidObjectException, InvalidInputException { boolean success = false; try { openTransaction(); @@ -1316,6 +1339,17 @@ public class ObjectStore implements RawStore, Configurable { if (partColumnGrants != null && partColumnGrants.size() > 0) { pm.deletePersistentAll(partColumnGrants); } + + String dbName = part.getTable().getDatabase().getName(); + String tableName = part.getTable().getTableName(); + + // delete partition level column stats if it exists + try { + deletePartitionColumnStatistics(dbName, tableName, partName, part.getValues(), null); + } catch (NoSuchObjectException e) { + LOG.info("No column statistics records found to delete"); + } + preDropStorageDescriptor(part.getSd()); pm.deletePersistent(part); } @@ -4455,6 +4489,738 @@ public class ObjectStore implements RawStore, Configurable { } } + // Methods to persist, maintain and retrieve Column Statistics + private MTableColumnStatistics convertToMTableColumnStatistics(ColumnStatisticsDesc statsDesc, + ColumnStatisticsObj statsObj) throws NoSuchObjectException, + MetaException, InvalidObjectException + { + if (statsObj == null || statsDesc == null) { + throw new InvalidObjectException("Invalid column stats object"); + } + + String dbName = statsDesc.getDbName(); + String tableName = statsDesc.getTableName(); + MTable table = getMTable(dbName, tableName); + + if (table == null) { + throw new NoSuchObjectException("Table " + tableName + + " for which stats is gathered doesn't exist."); + } + + MTableColumnStatistics mColStats = new MTableColumnStatistics(); + mColStats.setTable(table); + mColStats.setDbName(statsDesc.getDbName()); + mColStats.setTableName(statsDesc.getTableName()); + mColStats.setLastAnalyzed(statsDesc.getLastAnalyzed()); + mColStats.setColName(statsObj.getColName()); + mColStats.setColType(statsObj.getColType()); + + if (statsObj.getStatsData().isSetBooleanStats()) { + BooleanColumnStatsData boolStats = statsObj.getStatsData().getBooleanStats(); + mColStats.setBooleanStats(boolStats.getNumTrues(), boolStats.getNumFalses(), + boolStats.getNumNulls()); + } else if (statsObj.getStatsData().isSetLongStats()) { + LongColumnStatsData longStats = statsObj.getStatsData().getLongStats(); + mColStats.setLongStats(longStats.getNumNulls(), longStats.getNumDVs(), + longStats.getLowValue(), longStats.getHighValue()); + } else if (statsObj.getStatsData().isSetDoubleStats()) { + DoubleColumnStatsData doubleStats = statsObj.getStatsData().getDoubleStats(); + mColStats.setDoubleStats(doubleStats.getNumNulls(), doubleStats.getNumDVs(), + doubleStats.getLowValue(), doubleStats.getHighValue()); + } else if (statsObj.getStatsData().isSetStringStats()) { + StringColumnStatsData stringStats = statsObj.getStatsData().getStringStats(); + mColStats.setStringStats(stringStats.getNumNulls(), stringStats.getNumDVs(), + stringStats.getMaxColLen(), stringStats.getAvgColLen()); + } else if (statsObj.getStatsData().isSetBinaryStats()) { + BinaryColumnStatsData binaryStats = statsObj.getStatsData().getBinaryStats(); + mColStats.setBinaryStats(binaryStats.getNumNulls(), binaryStats.getMaxColLen(), + binaryStats.getAvgColLen()); + } + return mColStats; + } + + private 
ColumnStatisticsObj getColumnStatisticsObj(MTableColumnStatistics mStatsObj) { + ColumnStatisticsObj statsObj = new ColumnStatisticsObj(); + statsObj.setColType(mStatsObj.getColType()); + statsObj.setColName(mStatsObj.getColName()); + String colType = mStatsObj.getColType(); + ColumnStatisticsData colStatsData = new ColumnStatisticsData(); + + if (colType.equalsIgnoreCase("boolean")) { + BooleanColumnStatsData boolStats = new BooleanColumnStatsData(); + boolStats.setNumFalses(mStatsObj.getNumFalses()); + boolStats.setNumTrues(mStatsObj.getNumTrues()); + boolStats.setNumNulls(mStatsObj.getNumNulls()); + colStatsData.setBooleanStats(boolStats); + } else if (colType.equalsIgnoreCase("string")) { + StringColumnStatsData stringStats = new StringColumnStatsData(); + stringStats.setNumNulls(mStatsObj.getNumNulls()); + stringStats.setAvgColLen(mStatsObj.getAvgColLen()); + stringStats.setMaxColLen(mStatsObj.getMaxColLen()); + stringStats.setNumDVs(mStatsObj.getNumDVs()); + colStatsData.setStringStats(stringStats); + } else if (colType.equalsIgnoreCase("binary")) { + BinaryColumnStatsData binaryStats = new BinaryColumnStatsData(); + binaryStats.setNumNulls(mStatsObj.getNumNulls()); + binaryStats.setAvgColLen(mStatsObj.getAvgColLen()); + binaryStats.setMaxColLen(mStatsObj.getMaxColLen()); + colStatsData.setBinaryStats(binaryStats); + } else if (colType.equalsIgnoreCase("long")) { + LongColumnStatsData longStats = new LongColumnStatsData(); + longStats.setNumNulls(mStatsObj.getNumNulls()); + longStats.setHighValue(mStatsObj.getHighValueAsLong()); + longStats.setLowValue(mStatsObj.getLowValueAsLong()); + longStats.setNumDVs(mStatsObj.getNumDVs()); + colStatsData.setLongStats(longStats); + } else if (colType.equalsIgnoreCase("double")) { + DoubleColumnStatsData doubleStats = new DoubleColumnStatsData(); + doubleStats.setNumNulls(mStatsObj.getNumNulls()); + doubleStats.setHighValue(mStatsObj.getHighValueAsDouble()); + doubleStats.setLowValue(mStatsObj.getLowValueAsDouble()); + doubleStats.setNumDVs(mStatsObj.getNumDVs()); + colStatsData.setDoubleStats(doubleStats); + } + statsObj.setStatsData(colStatsData); + return statsObj; + } + + private ColumnStatisticsDesc getTableColumnStatisticsDesc(MTableColumnStatistics mStatsObj) { + ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(); + statsDesc.setIsTblLevel(true); + statsDesc.setDbName(mStatsObj.getDbName()); + statsDesc.setTableName(mStatsObj.getTableName()); + statsDesc.setLastAnalyzed(mStatsObj.getLastAnalyzed()); + return statsDesc; + } + + private ColumnStatistics convertToTableColumnStatistics(MTableColumnStatistics mStatsObj) + throws MetaException + { + if (mStatsObj == null) { + return null; + } + + ColumnStatisticsDesc statsDesc = getTableColumnStatisticsDesc(mStatsObj); + ColumnStatisticsObj statsObj = getColumnStatisticsObj(mStatsObj); + List statsObjs = new ArrayList(); + statsObjs.add(statsObj); + + ColumnStatistics colStats = new ColumnStatistics(); + colStats.setStatsDesc(statsDesc); + colStats.setStatsObj(statsObjs); + return colStats; + } + + private MPartitionColumnStatistics convertToMPartitionColumnStatistics(ColumnStatisticsDesc statsDesc, + ColumnStatisticsObj statsObj, List partVal) + throws MetaException, NoSuchObjectException + { + if (statsDesc == null || statsObj == null || partVal == null) { + return null; + } + + MPartition partition = getMPartition(statsDesc.getDbName(), statsDesc.getTableName(), partVal); + + if (partition == null) { + throw new NoSuchObjectException("Partition for which stats is gathered doesn't 
exist."); + } + + MPartitionColumnStatistics mColStats = new MPartitionColumnStatistics(); + mColStats.setPartition(partition); + mColStats.setDbName(statsDesc.getDbName()); + mColStats.setTableName(statsDesc.getTableName()); + mColStats.setPartitionName(statsDesc.getPartName()); + mColStats.setLastAnalyzed(statsDesc.getLastAnalyzed()); + mColStats.setColName(statsObj.getColName()); + mColStats.setColType(statsObj.getColType()); + + if (statsObj.getStatsData().isSetBooleanStats()) { + BooleanColumnStatsData boolStats = statsObj.getStatsData().getBooleanStats(); + mColStats.setBooleanStats(boolStats.getNumTrues(), boolStats.getNumFalses(), + boolStats.getNumNulls()); + } else if (statsObj.getStatsData().isSetLongStats()) { + LongColumnStatsData longStats = statsObj.getStatsData().getLongStats(); + mColStats.setLongStats(longStats.getNumNulls(), longStats.getNumDVs(), + longStats.getLowValue(), longStats.getHighValue()); + } else if (statsObj.getStatsData().isSetDoubleStats()) { + DoubleColumnStatsData doubleStats = statsObj.getStatsData().getDoubleStats(); + mColStats.setDoubleStats(doubleStats.getNumNulls(), doubleStats.getNumDVs(), + doubleStats.getLowValue(), doubleStats.getHighValue()); + } else if (statsObj.getStatsData().isSetStringStats()) { + StringColumnStatsData stringStats = statsObj.getStatsData().getStringStats(); + mColStats.setStringStats(stringStats.getNumNulls(), stringStats.getNumDVs(), + stringStats.getMaxColLen(), stringStats.getAvgColLen()); + } else if (statsObj.getStatsData().isSetBinaryStats()) { + BinaryColumnStatsData binaryStats = statsObj.getStatsData().getBinaryStats(); + mColStats.setBinaryStats(binaryStats.getNumNulls(), binaryStats.getMaxColLen(), + binaryStats.getAvgColLen()); + } + return mColStats; + } + + private void writeMTableColumnStatistics(MTableColumnStatistics mStatsObj) + throws NoSuchObjectException, MetaException, InvalidObjectException, InvalidInputException + { + String dbName = mStatsObj.getDbName(); + String tableName = mStatsObj.getTableName(); + String colName = mStatsObj.getColName(); + + LOG.info("Updating table level column statistics for db=" + dbName + " tableName=" + tableName + + " colName=" + colName); + + MTable mTable = getMTable(mStatsObj.getDbName(), mStatsObj.getTableName()); + boolean foundCol = false; + + if (mTable == null) { + throw new + NoSuchObjectException("Table " + tableName + + " for which stats gathering is requested doesn't exist."); + } + + MStorageDescriptor mSDS = mTable.getSd(); + List colList = mSDS.getCD().getCols(); + + for(MFieldSchema mCol:colList) { + if (mCol.getName().equals(mStatsObj.getColName().trim())) { + foundCol = true; + break; + } + } + + if (!foundCol) { + throw new + NoSuchObjectException("Column " + colName + + " for which stats gathering is requested doesn't exist."); + } + + MTableColumnStatistics oldStatsObj = getMTableColumnStatistics(dbName, tableName, colName); + + if (oldStatsObj != null) { + oldStatsObj.setAvgColLen(mStatsObj.getAvgColLen()); + oldStatsObj.setHighValue(mStatsObj.getHighValue()); + oldStatsObj.setLastAnalyzed(mStatsObj.getLastAnalyzed()); + oldStatsObj.setLowValue(mStatsObj.getLowValue()); + oldStatsObj.setMaxColLen(mStatsObj.getMaxColLen()); + oldStatsObj.setNumDVs(mStatsObj.getNumDVs()); + oldStatsObj.setNumFalses(mStatsObj.getNumFalses()); + oldStatsObj.setNumTrues(mStatsObj.getNumTrues()); + oldStatsObj.setNumNulls(mStatsObj.getNumNulls()); + } else { + pm.makePersistent(mStatsObj); + } + } + + private ColumnStatisticsObj 
getColumnStatisticsObj(MPartitionColumnStatistics mStatsObj) { + ColumnStatisticsObj statsObj = new ColumnStatisticsObj(); + statsObj.setColType(mStatsObj.getColType()); + statsObj.setColName(mStatsObj.getColName()); + String colType = mStatsObj.getColType(); + ColumnStatisticsData colStatsData = new ColumnStatisticsData(); + + if (colType.equalsIgnoreCase("boolean")) { + BooleanColumnStatsData boolStats = new BooleanColumnStatsData(); + boolStats.setNumFalses(mStatsObj.getNumFalses()); + boolStats.setNumTrues(mStatsObj.getNumTrues()); + boolStats.setNumNulls(mStatsObj.getNumNulls()); + colStatsData.setBooleanStats(boolStats); + } else if (colType.equalsIgnoreCase("string")) { + StringColumnStatsData stringStats = new StringColumnStatsData(); + stringStats.setNumNulls(mStatsObj.getNumNulls()); + stringStats.setAvgColLen(mStatsObj.getAvgColLen()); + stringStats.setMaxColLen(mStatsObj.getMaxColLen()); + stringStats.setNumDVs(mStatsObj.getNumDVs()); + colStatsData.setStringStats(stringStats); + } else if (colType.equalsIgnoreCase("binary")) { + BinaryColumnStatsData binaryStats = new BinaryColumnStatsData(); + binaryStats.setNumNulls(mStatsObj.getNumNulls()); + binaryStats.setAvgColLen(mStatsObj.getAvgColLen()); + binaryStats.setMaxColLen(mStatsObj.getMaxColLen()); + colStatsData.setBinaryStats(binaryStats); + } else if (colType.equalsIgnoreCase("long")) { + LongColumnStatsData longStats = new LongColumnStatsData(); + longStats.setNumNulls(mStatsObj.getNumNulls()); + longStats.setHighValue(mStatsObj.getHighValueAsLong()); + longStats.setLowValue(mStatsObj.getLowValueAsLong()); + longStats.setNumDVs(mStatsObj.getNumDVs()); + colStatsData.setLongStats(longStats); + } else if (colType.equalsIgnoreCase("double")) { + DoubleColumnStatsData doubleStats = new DoubleColumnStatsData(); + doubleStats.setNumNulls(mStatsObj.getNumNulls()); + doubleStats.setHighValue(mStatsObj.getHighValueAsDouble()); + doubleStats.setLowValue(mStatsObj.getLowValueAsDouble()); + doubleStats.setNumDVs(mStatsObj.getNumDVs()); + colStatsData.setDoubleStats(doubleStats); + } + statsObj.setStatsData(colStatsData); + return statsObj; + } + + private ColumnStatisticsDesc getPartitionColumnStatisticsDesc(MPartitionColumnStatistics mStatsObj) { + ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(); + statsDesc.setIsTblLevel(false); + statsDesc.setDbName(mStatsObj.getDbName()); + statsDesc.setTableName(mStatsObj.getTableName()); + statsDesc.setPartName(mStatsObj.getPartitionName()); + statsDesc.setLastAnalyzed(mStatsObj.getLastAnalyzed()); + return statsDesc; + } + + private void writeMPartitionColumnStatistics(MPartitionColumnStatistics mStatsObj, + List partVal) throws NoSuchObjectException, MetaException, InvalidObjectException, + InvalidInputException + { + String dbName = mStatsObj.getDbName(); + String tableName = mStatsObj.getTableName(); + String partName = mStatsObj.getPartitionName(); + String colName = mStatsObj.getColName(); + + LOG.info("Updating partition level column statistics for db=" + dbName + " tableName=" + + tableName + " partName=" + partName + " colName=" + colName); + + MTable mTable = getMTable(mStatsObj.getDbName(), mStatsObj.getTableName()); + boolean foundCol = false; + + if (mTable == null) { + throw new + NoSuchObjectException("Table " + tableName + + " for which stats gathering is requested doesn't exist."); + } + + MPartition mPartition = + getMPartition(mStatsObj.getDbName(), mStatsObj.getTableName(), partVal); + + if (mPartition == null) { + throw new + NoSuchObjectException("Partition " + 
partName + + " for which stats gathering is requested doesn't exist"); + } + + MStorageDescriptor mSDS = mPartition.getSd(); + List colList = mSDS.getCD().getCols(); + + for(MFieldSchema mCol:colList) { + if (mCol.getName().equals(mStatsObj.getColName().trim())) { + foundCol = true; + break; + } + } + + if (!foundCol) { + throw new + NoSuchObjectException("Column " + colName + + " for which stats gathering is requested doesn't exist."); + } + + MPartitionColumnStatistics oldStatsObj = getMPartitionColumnStatistics(dbName, tableName, + partName, partVal, colName); + if (oldStatsObj != null) { + oldStatsObj.setAvgColLen(mStatsObj.getAvgColLen()); + oldStatsObj.setHighValue(mStatsObj.getHighValue()); + oldStatsObj.setLastAnalyzed(mStatsObj.getLastAnalyzed()); + oldStatsObj.setLowValue(mStatsObj.getLowValue()); + oldStatsObj.setMaxColLen(mStatsObj.getMaxColLen()); + oldStatsObj.setNumDVs(mStatsObj.getNumDVs()); + oldStatsObj.setNumFalses(mStatsObj.getNumFalses()); + oldStatsObj.setNumTrues(mStatsObj.getNumTrues()); + oldStatsObj.setNumNulls(mStatsObj.getNumNulls()); + } else { + pm.makePersistent(mStatsObj); + } + } + + public boolean updateTableColumnStatistics(ColumnStatistics colStats) + throws NoSuchObjectException, MetaException, InvalidObjectException, InvalidInputException + { + boolean committed = false; + + try { + openTransaction(); + List statsObjs = colStats.getStatsObj(); + ColumnStatisticsDesc statsDesc = colStats.getStatsDesc(); + + for (ColumnStatisticsObj statsObj:statsObjs) { + MTableColumnStatistics mStatsObj = convertToMTableColumnStatistics(statsDesc, statsObj); + writeMTableColumnStatistics(mStatsObj); + } + committed = commitTransaction(); + return committed; + } finally { + if (!committed) { + rollbackTransaction(); + } + } + } + + public boolean updatePartitionColumnStatistics(ColumnStatistics colStats, List partVals) + throws NoSuchObjectException, MetaException, InvalidObjectException, InvalidInputException + { + boolean committed = false; + + try { + openTransaction(); + List statsObjs = colStats.getStatsObj(); + ColumnStatisticsDesc statsDesc = colStats.getStatsDesc(); + + for (ColumnStatisticsObj statsObj:statsObjs) { + MPartitionColumnStatistics mStatsObj = + convertToMPartitionColumnStatistics(statsDesc, statsObj, partVals); + writeMPartitionColumnStatistics(mStatsObj, partVals); + } + committed = commitTransaction(); + return committed; + } finally { + if (!committed) { + rollbackTransaction(); + } + } + } + + private MTableColumnStatistics getMTableColumnStatistics(String dbName, String tableName, + String colName) throws NoSuchObjectException, InvalidInputException, MetaException + { + boolean committed = false; + + if (dbName == null) { + dbName = MetaStoreUtils.DEFAULT_DATABASE_NAME; + } + + if (tableName == null || colName == null) { + throw new InvalidInputException("TableName/ColName passed to get_table_column_statistics " + + "is null"); + } + + try { + openTransaction(); + MTableColumnStatistics mStatsObj = null; + MTable mTable = getMTable(dbName.trim(), tableName.trim()); + boolean foundCol = false; + + if (mTable == null) { + throw new NoSuchObjectException("Table " + tableName + + " for which stats is requested doesn't exist."); + } + + MStorageDescriptor mSDS = mTable.getSd(); + List colList = mSDS.getCD().getCols(); + + for(MFieldSchema mCol:colList) { + if (mCol.getName().equals(colName.trim())) { + foundCol = true; + break; + } + } + + if (!foundCol) { + throw new NoSuchObjectException("Column " + colName + + " for which stats is requested 
doesn't exist."); + } + + Query query = pm.newQuery(MTableColumnStatistics.class); + query.setFilter("table.tableName == t1 && " + + "dbName == t2 && " + "colName == t3"); + query + .declareParameters("java.lang.String t1, java.lang.String t2, java.lang.String t3"); + query.setUnique(true); + + mStatsObj = (MTableColumnStatistics) query.execute(tableName.trim(), + dbName.trim(), colName.trim()); + pm.retrieve(mStatsObj); + committed = commitTransaction(); + return mStatsObj; + } finally { + if (!committed) { + rollbackTransaction(); + return null; + } + } + } + + public ColumnStatistics getTableColumnStatistics(String dbName, String tableName, String colName) + throws MetaException, NoSuchObjectException, InvalidInputException + { + ColumnStatistics statsObj; + MTableColumnStatistics mStatsObj = getMTableColumnStatistics(dbName, tableName, colName); + + if (mStatsObj == null) { + throw new NoSuchObjectException("Statistics for dbName=" + dbName + " tableName=" + tableName + + " columnName=" + colName + " doesn't exist."); + } + + statsObj = convertToTableColumnStatistics(mStatsObj); + return statsObj; + } + + public ColumnStatistics getPartitionColumnStatistics(String dbName, String tableName, + String partName, List partVal, String colName) + throws MetaException, NoSuchObjectException, InvalidInputException + { + ColumnStatistics statsObj; + MPartitionColumnStatistics mStatsObj = + getMPartitionColumnStatistics(dbName, tableName, partName, partVal, colName); + + if (mStatsObj == null) { + throw new NoSuchObjectException("Statistics for dbName=" + dbName + " tableName=" + tableName + + " partName= " + partName + " columnName=" + colName + " doesn't exist."); + } + statsObj = convertToPartColumnStatistics(mStatsObj); + return statsObj; + } + + private ColumnStatistics convertToPartColumnStatistics(MPartitionColumnStatistics mStatsObj) + { + if (mStatsObj == null) { + return null; + } + + ColumnStatisticsDesc statsDesc = getPartitionColumnStatisticsDesc(mStatsObj); + ColumnStatisticsObj statsObj = getColumnStatisticsObj(mStatsObj); + List statsObjs = new ArrayList(); + statsObjs.add(statsObj); + + ColumnStatistics colStats = new ColumnStatistics(); + colStats.setStatsDesc(statsDesc); + colStats.setStatsObj(statsObjs); + return colStats; + } + + private MPartitionColumnStatistics getMPartitionColumnStatistics(String dbName, String tableName, + String partName, List partVal, String colName) throws NoSuchObjectException, + InvalidInputException, MetaException + { + boolean committed = false; + MPartitionColumnStatistics mStatsObj = null; + + if (dbName == null) { + dbName = MetaStoreUtils.DEFAULT_DATABASE_NAME; + } + + if (tableName == null || partVal == null || colName == null) { + throw new InvalidInputException("TableName/PartName/ColName passed to " + + " get_partition_column_statistics is null"); + } + + try { + openTransaction(); + MTable mTable = getMTable(dbName.trim(), tableName.trim()); + boolean foundCol = false; + + if (mTable == null) { + throw new NoSuchObjectException("Table " + tableName + + " for which stats is requested doesn't exist."); + } + + MPartition mPartition = + getMPartition(dbName, tableName, partVal); + + if (mPartition == null) { + throw new + NoSuchObjectException("Partition " + partName + + " for which stats is requested doesn't exist"); + } + + MStorageDescriptor mSDS = mPartition.getSd(); + List colList = mSDS.getCD().getCols(); + + for(MFieldSchema mCol:colList) { + if (mCol.getName().equals(colName.trim())) { + foundCol = true; + break; + } + } + + if 
(!foundCol) { + throw new NoSuchObjectException("Column " + colName + + " for which stats is requested doesn't exist."); + } + + Query query = pm.newQuery(MPartitionColumnStatistics.class); + query.setFilter("partition.partitionName == t1 && " + + "dbName == t2 && " + "tableName == t3 && " + "colName == t4"); + query + .declareParameters("java.lang.String t1, java.lang.String t2, " + + "java.lang.String t3, java.lang.String t4"); + query.setUnique(true); + + mStatsObj = (MPartitionColumnStatistics) query.executeWithArray(partName.trim(), + dbName.trim(), tableName.trim(), + colName.trim()); + pm.retrieve(mStatsObj); + committed = commitTransaction(); + return mStatsObj; + + } finally { + if (!committed) { + rollbackTransaction(); + } + } + } + + public boolean deletePartitionColumnStatistics(String dbName, String tableName, + String partName, List partVals,String colName) + throws NoSuchObjectException, MetaException, InvalidObjectException, InvalidInputException + { + boolean ret = false; + + if (dbName == null) { + dbName = MetaStoreUtils.DEFAULT_DATABASE_NAME; + } + + if (tableName == null) { + throw new InvalidInputException("Table name is null."); + } + + try { + openTransaction(); + MTable mTable = getMTable(dbName, tableName); + MPartitionColumnStatistics mStatsObj; + List mStatsObjColl; + + if (mTable == null) { + throw new + NoSuchObjectException("Table " + tableName + + " for which stats deletion is requested doesn't exist"); + } + + MPartition mPartition = + getMPartition(dbName, tableName, partVals); + + if (mPartition == null) { + throw new + NoSuchObjectException("Partition " + partName + + " for which stats deletion is requested doesn't exist"); + } + + Query query = pm.newQuery(MPartitionColumnStatistics.class); + String filter; + String parameters; + + if (colName != null) { + filter = "partition.partitionName == t1 && dbName == t2 && tableName == t3 && colName == t4"; + parameters = "java.lang.String t1, java.lang.String t2, " + + "java.lang.String t3, java.lang.String t4"; + } else { + filter = "partition.partitionName == t1 && dbName == t2 && tableName == t3"; + parameters = "java.lang.String t1, java.lang.String t2, java.lang.String t3"; + } + + query.setFilter(filter); + query + .declareParameters(parameters); + + if (colName != null) { + query.setUnique(true); + mStatsObj = (MPartitionColumnStatistics)query.executeWithArray(partName.trim(), + dbName.trim(), tableName.trim(), colName.trim()); + pm.retrieve(mStatsObj); + + if (mStatsObj != null) { + pm.deletePersistent(mStatsObj); + } else { + throw new NoSuchObjectException("Column stats doesn't exist for db=" +dbName + " table=" + + tableName + " partition=" + partName + " col=" + colName); + } + } else { + + mStatsObjColl= (List)query.execute(partName.trim(), + dbName.trim(), tableName.trim()); + pm.retrieveAll(mStatsObjColl); + + if (mStatsObjColl != null) { + pm.deletePersistentAll(mStatsObjColl); + } else { + throw new NoSuchObjectException("Column stats doesn't exist for db=" + dbName + + " table=" + tableName + " partition" + partName); + } + } + ret = commitTransaction(); + } catch(NoSuchObjectException e) { + rollbackTransaction(); + throw e; + } finally { + if (!ret) { + rollbackTransaction(); + } + } + return ret; + } + + public boolean deleteTableColumnStatistics(String dbName, String tableName, String colName) + throws NoSuchObjectException, MetaException, InvalidObjectException, InvalidInputException + { + boolean ret = false; + + if (dbName == null) { + dbName = MetaStoreUtils.DEFAULT_DATABASE_NAME; + } 
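For readers unfamiliar with DataNucleus/JDO, the statistics lookups and deletions above all follow the same javax.jdo pattern: build a filter string, declare the query parameters, and bind them positionally at execute time. Below is a condensed sketch of that pattern against the MTableColumnStatistics model added later in this patch; the wrapper class and helper method are hypothetical and not part of the patch.

import javax.jdo.PersistenceManager;
import javax.jdo.Query;

import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics;

public class ColumnStatsJdoSketch {

  // Looks up and removes the single table-level stats row for one column,
  // mirroring the filter/parameter-declaration pattern used in ObjectStore.
  public static boolean deleteOneColumnStats(PersistenceManager pm, String dbName,
      String tableName, String colName) {
    Query query = pm.newQuery(MTableColumnStatistics.class);
    // t1..t3 are bound positionally by execute(...)
    query.setFilter("table.tableName == t1 && dbName == t2 && colName == t3");
    query.declareParameters("java.lang.String t1, java.lang.String t2, java.lang.String t3");
    query.setUnique(true); // at most one row per (table, db, column)

    MTableColumnStatistics stats =
        (MTableColumnStatistics) query.execute(tableName.trim(), dbName.trim(), colName.trim());
    if (stats == null) {
      return false;
    }
    pm.deletePersistent(stats);
    return true;
  }
}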
+ + if (tableName == null) { + throw new InvalidInputException("Table name is null."); + } + + try { + openTransaction(); + MTable mTable = getMTable(dbName, tableName); + MTableColumnStatistics mStatsObj; + List mStatsObjColl; + + if (mTable == null) { + throw new + NoSuchObjectException("Table " + tableName + + " for which stats deletion is requested doesn't exist"); + } + + Query query = pm.newQuery(MTableColumnStatistics.class); + String filter; + String parameters; + + if (colName != null) { + filter = "table.tableName == t1 && dbName == t2 && colName == t3"; + parameters = "java.lang.String t1, java.lang.String t2, java.lang.String t3"; + } else { + filter = "table.tableName == t1 && dbName == t2"; + parameters = "java.lang.String t1, java.lang.String t2"; + } + + query.setFilter(filter); + query + .declareParameters(parameters); + + if (colName != null) { + query.setUnique(true); + mStatsObj = (MTableColumnStatistics)query.execute(tableName.trim(), + dbName.trim(), colName.trim()); + pm.retrieve(mStatsObj); + + if (mStatsObj != null) { + pm.deletePersistent(mStatsObj); + } else { + throw new NoSuchObjectException("Column stats doesn't exist for db=" +dbName + " table=" + + tableName + " col=" + colName); + } + } else { + + mStatsObjColl= (List)query.execute(tableName.trim(), dbName.trim()); + pm.retrieveAll(mStatsObjColl); + + if (mStatsObjColl != null) { + pm.deletePersistentAll(mStatsObjColl); + } else { + throw new NoSuchObjectException("Column stats doesn't exist for db=" + dbName + + " table=" + tableName); + } + } + ret = commitTransaction(); + } catch(NoSuchObjectException e) { + rollbackTransaction(); + throw e; + } finally { + if (!ret) { + rollbackTransaction(); + } + } + return ret; + } + @Override public long cleanupEvents() { boolean commited = false; @@ -4477,4 +5243,5 @@ public class ObjectStore implements RawStore, Configurable { } return delCnt; } + } diff --git metastore/src/java/org/apache/hadoop/hive/metastore/RawStore.java metastore/src/java/org/apache/hadoop/hive/metastore/RawStore.java index bf5ae3a..3dc9031 100644 --- metastore/src/java/org/apache/hadoop/hive/metastore/RawStore.java +++ metastore/src/java/org/apache/hadoop/hive/metastore/RawStore.java @@ -22,8 +22,10 @@ import java.util.List; import java.util.Map; import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.Index; +import org.apache.hadoop.hive.metastore.api.InvalidInputException; import org.apache.hadoop.hive.metastore.api.InvalidObjectException; import org.apache.hadoop.hive.metastore.api.InvalidPartitionException; import org.apache.hadoop.hive.metastore.api.MetaException; @@ -97,7 +99,7 @@ public interface RawStore extends Configurable { MetaException; public abstract boolean dropTable(String dbName, String tableName) - throws MetaException; + throws MetaException, NoSuchObjectException, InvalidObjectException, InvalidInputException; public abstract Table getTable(String dbName, String tableName) throws MetaException; @@ -109,7 +111,7 @@ public interface RawStore extends Configurable { List part_vals) throws MetaException, NoSuchObjectException; public abstract boolean dropPartition(String dbName, String tableName, - List part_vals) throws MetaException; + List part_vals) throws MetaException, NoSuchObjectException, InvalidObjectException, InvalidInputException; public abstract List getPartitions(String dbName, String tableName, int max) throws 
MetaException; @@ -306,5 +308,116 @@ public interface RawStore extends Configurable { List part_vals, short max_parts, String userName, List groupNames) throws MetaException, InvalidObjectException, NoSuchObjectException; + /** Persists the given column statistics object to the metastore + * @param partVals + * + * @param ColumnStats object to persist + * @param List of partVals + * @return Boolean indicating the outcome of the operation + * @throws NoSuchObjectException + * @throws MetaException + * @throws InvalidObjectException + * @throws InvalidInputException + */ + public abstract boolean updateTableColumnStatistics(ColumnStatistics colStats) + throws NoSuchObjectException, MetaException, InvalidObjectException, InvalidInputException; + + /** Persists the given column statistics object to the metastore + * @param partVals + * + * @param ColumnStats object to persist + * @param List of partVals + * @return Boolean indicating the outcome of the operation + * @throws NoSuchObjectException + * @throws MetaException + * @throws InvalidObjectException + * @throws InvalidInputException + */ + public abstract boolean updatePartitionColumnStatistics(ColumnStatistics statsObj, + List partVals) + throws NoSuchObjectException, MetaException, InvalidObjectException, InvalidInputException; + + /** + * Returns the relevant column statistics for a given column in a given table in a given database + * if such statistics exist. + * + * @param The name of the database, defaults to current database + * @param The name of the table + * @param The name of the column for which statistics is requested + * @return Relevant column statistics for the column for the given table + * @throws NoSuchObjectException + * @throws MetaException + * @throws InvalidInputException + * + */ + public abstract ColumnStatistics getTableColumnStatistics(String dbName, String tableName, + String colName) throws MetaException, NoSuchObjectException, InvalidInputException, + InvalidObjectException; + + /** + * Returns the relevant column statistics for a given column in a given partition in a given + * table in a given database if such statistics exist. + * @param partName + * + * @param The name of the database, defaults to current database + * @param The name of the table + * @param The name of the partition + * @param List of partVals for the partition + * @param The name of the column for which statistics is requested + * @return Relevant column statistics for the column for the given partition in a given table + * @throws NoSuchObjectException + * @throws MetaException + * @throws InvalidInputException + * @throws InvalidObjectException + * + */ + + public abstract ColumnStatistics getPartitionColumnStatistics(String dbName, String tableName, + String partName, List partVals, String colName) + throws MetaException, NoSuchObjectException, InvalidInputException, InvalidObjectException; + + /** + * Deletes column statistics if present associated with a given db, table, partition and col. If + * null is passed instead of a colName, stats when present for all columns associated + * with a given db, table and partition are deleted. 
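As a rough usage sketch of the update/get pair declared in this interface, the caller assembles the Thrift stats objects the same way ObjectStore and the test below do; the helper class, database, table, and column values here are illustrative only and not part of the patch.

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.metastore.RawStore;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;

public class TableColumnStatsUsageSketch {

  // Builds table-level stats for one double column and round-trips them
  // through the RawStore methods declared in this interface.
  public static ColumnStatistics writeAndReadBack(RawStore store, String dbName,
      String tableName) throws Exception {
    ColumnStatisticsDesc desc = new ColumnStatisticsDesc();
    desc.setDbName(dbName);
    desc.setTableName(tableName);
    desc.setIsTblLevel(true);

    DoubleColumnStatsData doubleStats = new DoubleColumnStatsData();
    doubleStats.setLowValue(50000.21);
    doubleStats.setHighValue(1200000.4525);
    doubleStats.setNumNulls(3);
    doubleStats.setNumDVs(22);

    ColumnStatisticsData data = new ColumnStatisticsData();
    data.setDoubleStats(doubleStats);

    ColumnStatisticsObj obj = new ColumnStatisticsObj();
    obj.setColName("income");
    obj.setColType("double");
    obj.setStatsData(data);

    List<ColumnStatisticsObj> objs = new ArrayList<ColumnStatisticsObj>();
    objs.add(obj);

    ColumnStatistics colStats = new ColumnStatistics();
    colStats.setStatsDesc(desc);
    colStats.setStatsObj(objs);

    store.updateTableColumnStatistics(colStats);
    return store.getTableColumnStatistics(dbName, tableName, "income");
  }
}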
+ * + * @param dbName + * @param tableName + * @param partName + * @param partVals + * @param colName + * @return Boolean indicating the outcome of the operation + * @throws NoSuchObjectException + * @throws MetaException + * @throws InvalidObjectException + * @throws InvalidInputException + */ + + public abstract boolean deletePartitionColumnStatistics(String dbName, String tableName, + String partName, List partVals, String colName) + throws NoSuchObjectException, MetaException, InvalidObjectException, InvalidInputException; + + /** + * Deletes column statistics if present associated with a given db, table and col. If + * null is passed instead of a colName, stats when present for all columns associated + * with a given db and table are deleted. + * + * @param dbName + * @param tableName + * @param colName + * @return Boolean indicating the outcome of the operation + * @throws NoSuchObjectException + * @throws MetaException + * @throws InvalidObjectException + * @throws InvalidInputException + */ + + public abstract boolean deleteTableColumnStatistics(String dbName, String tableName, + String colName) + throws NoSuchObjectException, MetaException, InvalidObjectException, InvalidInputException; + public abstract long cleanupEvents(); + + + } diff --git metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java index 77d1caa..9f3b9ff 100755 --- metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java +++ metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java @@ -376,6 +376,38 @@ public class Warehouse { } } + public static Map makeEscSpecFromName(String name) throws MetaException { + + if (name == null || name.isEmpty()) { + throw new MetaException("Partition name is invalid. " + name); + } + LinkedHashMap partSpec = new LinkedHashMap(); + + Path currPath = new Path(name); + + List kvs = new ArrayList(); + do { + String component = currPath.getName(); + Matcher m = pat.matcher(component); + if (m.matches()) { + String k = m.group(1); + String v = m.group(2); + String[] kv = new String[2]; + kv[0] = k; + kv[1] = v; + kvs.add(kv); + } + currPath = currPath.getParent(); + } while (currPath != null && !currPath.getName().isEmpty()); + + // reverse the list since we checked the part from leaf dir to table's base dir + for (int i = kvs.size(); i > 0; i--) { + partSpec.put(kvs.get(i - 1)[0], kvs.get(i - 1)[1]); + } + + return partSpec; + } + public Path getPartitionPath(Database db, String tableName, LinkedHashMap pm) throws MetaException { return new Path(getTablePath(db, tableName), makePartPath(pm)); diff --git metastore/src/model/org/apache/hadoop/hive/metastore/model/MPartitionColumnStatistics.java metastore/src/model/org/apache/hadoop/hive/metastore/model/MPartitionColumnStatistics.java new file mode 100644 index 0000000..067cb49 --- /dev/null +++ metastore/src/model/org/apache/hadoop/hive/metastore/model/MPartitionColumnStatistics.java @@ -0,0 +1,232 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * + */ +package org.apache.hadoop.hive.metastore.model; + +import java.nio.ByteBuffer; + + +/** + * + * MPartitionColumnStatistics - Represents Hive's partiton level Column Statistics Description. + * The fields in this class with the exception of partition are persisted in the metastore. + * In case of partition, part_id is persisted in its place. + * + */ +public class MPartitionColumnStatistics { + + private MPartition partition; + + private String dbName; + private String tableName; + private String partitionName; + private String colName; + private String colType; + + private byte[] lowValue; + private byte[] highValue; + private long numNulls; + private long numDVs; + private double avgColLen; + private long maxColLen; + private long numTrues; + private long numFalses; + private long lastAnalyzed; + + public MPartitionColumnStatistics() {} + + public String getTableName() { + return tableName; + } + + public void setTableName(String tableName) { + this.tableName = tableName; + } + + public String getColName() { + return colName; + } + + public void setColName(String colName) { + this.colName = colName; + } + + public byte[] getLowValue() { + return lowValue; + } + + public long getLowValueAsLong() { + ByteBuffer byteBuf = ByteBuffer.wrap(lowValue); + return byteBuf.getLong(); + } + + public double getLowValueAsDouble() { + ByteBuffer byteBuf = ByteBuffer.wrap(lowValue); + return byteBuf.getDouble(); + } + + public byte[] getHighValue() { + return highValue; + } + + public long getHighValueAsLong() { + ByteBuffer byteBuf = ByteBuffer.wrap(highValue); + return byteBuf.getLong(); + } + + public double getHighValueAsDouble() { + ByteBuffer byteBuf = ByteBuffer.wrap(highValue); + return byteBuf.getDouble(); + } + + public void setHighValue(byte[] b) { + this.highValue = b; + } + + public void setLowValue(byte[] b) { + this.lowValue = b; + } + + public long getNumNulls() { + return numNulls; + } + + public void setNumNulls(long numNulls) { + this.numNulls = numNulls; + } + + public long getNumDVs() { + return numDVs; + } + + public void setNumDVs(long numDVs) { + this.numDVs = numDVs; + } + + public double getAvgColLen() { + return avgColLen; + } + + public void setAvgColLen(double avgColLen) { + this.avgColLen = avgColLen; + } + + public long getMaxColLen() { + return maxColLen; + } + + public void setMaxColLen(long maxColLen) { + this.maxColLen = maxColLen; + } + + public long getNumTrues() { + return numTrues; + } + + public void setNumTrues(long numTrues) { + this.numTrues = numTrues; + } + + public long getNumFalses() { + return numFalses; + } + + public void setNumFalses(long numFalses) { + this.numFalses = numFalses; + } + + public long getLastAnalyzed() { + return lastAnalyzed; + } + + public void setLastAnalyzed(long lastAnalyzed) { + this.lastAnalyzed = lastAnalyzed; + } + + public String getDbName() { + return dbName; + } + + public void setDbName(String dbName) { + this.dbName = dbName; + } + + public MPartition getPartition() { + return partition; + } + + public void setPartition(MPartition partition) { + this.partition = partition; + } + + public 
String getPartitionName() { + return partitionName; + } + + public void setPartitionName(String partitionName) { + this.partitionName = partitionName; + } + + public String getColType() { + return colType; + } + + public void setColType(String colType) { + this.colType = colType; + } + + public void setBooleanStats(long numTrues, long numFalses, long numNulls) { + this.numTrues = numTrues; + this.numFalses = numFalses; + this.numNulls = numNulls; + } + + public void setLongStats(long numNulls, long numNDVs, long lowValue, long highValue) { + this.numNulls = numNulls; + this.numDVs = numNDVs; + byte[] bytes = ByteBuffer.allocate(Long.SIZE/8).putLong(lowValue).array(); + this.lowValue = bytes; + bytes = ByteBuffer.allocate(Long.SIZE/8).putLong(highValue).array(); + this.highValue = bytes; + } + + public void setDoubleStats(long numNulls, long numNDVs, double lowValue, double highValue) { + this.numNulls = numNulls; + this.numDVs = numNDVs; + byte[] bytes = ByteBuffer.allocate(Double.SIZE/8).putDouble(lowValue).array(); + this.lowValue = bytes; + bytes = ByteBuffer.allocate(Double.SIZE/8).putDouble(highValue).array(); + this.highValue = bytes; + } + + public void setStringStats(long numNulls, long numNDVs, long maxColLen, double avgColLen) { + this.numNulls = numNulls; + this.numDVs = numNDVs; + this.maxColLen = maxColLen; + this.avgColLen = avgColLen; + } + + public void setBinaryStats(long numNulls, long maxColLen, double avgColLen) { + this.numNulls = numNulls; + this.maxColLen = maxColLen; + this.avgColLen = avgColLen; + } +} \ No newline at end of file diff --git metastore/src/model/org/apache/hadoop/hive/metastore/model/MTableColumnStatistics.java metastore/src/model/org/apache/hadoop/hive/metastore/model/MTableColumnStatistics.java new file mode 100644 index 0000000..8572c5a --- /dev/null +++ metastore/src/model/org/apache/hadoop/hive/metastore/model/MTableColumnStatistics.java @@ -0,0 +1,223 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * + */ +package org.apache.hadoop.hive.metastore.model; + +import java.nio.ByteBuffer; + + +/** + * + * MTableColumnStatistics - Represents Hive's Column Statistics Description. The fields in this class + * with the exception of table are persisted in the metastore. In case of table, tbl_id is persisted + * in its place. 
+ * + */ +public class MTableColumnStatistics { + + private MTable table; + private String dbName; + private String tableName; + private String colName; + private String colType; + + private byte[] lowValue; + private byte[] highValue; + private long numNulls; + private long numDVs; + private double avgColLen; + private long maxColLen; + private long numTrues; + private long numFalses; + private long lastAnalyzed; + + public MTableColumnStatistics() {} + + public MTable getTable() { + return table; + } + + public void setTable(MTable table) { + this.table = table; + } + + public String getTableName() { + return tableName; + } + + public void setTableName(String tableName) { + this.tableName = tableName; + } + + public String getColName() { + return colName; + } + + public void setColName(String colName) { + this.colName = colName; + } + + public String getColType() { + return colType; + } + + public void setColType(String colType) { + this.colType = colType; + } + + public byte[] getLowValue() { + return lowValue; + } + + public long getLowValueAsLong() { + ByteBuffer byteBuf = ByteBuffer.wrap(lowValue); + return byteBuf.getLong(); + } + + public double getLowValueAsDouble() { + ByteBuffer byteBuf = ByteBuffer.wrap(lowValue); + return byteBuf.getDouble(); + } + + public byte[] getHighValue() { + return highValue; + } + + public long getHighValueAsLong() { + ByteBuffer byteBuf = ByteBuffer.wrap(highValue); + return byteBuf.getLong(); + } + + public double getHighValueAsDouble() { + ByteBuffer byteBuf = ByteBuffer.wrap(highValue); + return byteBuf.getDouble(); + } + + public void setHighValue(byte[] b) { + this.highValue = b; + } + + public void setLowValue(byte[] b) { + this.lowValue = b; + } + + public long getNumNulls() { + return numNulls; + } + + + public void setNumNulls(long numNulls) { + this.numNulls = numNulls; + } + + public long getNumDVs() { + return numDVs; + } + + public void setNumDVs(long numDVs) { + this.numDVs = numDVs; + } + + public double getAvgColLen() { + return avgColLen; + } + + public void setAvgColLen(double avgColLen) { + this.avgColLen = avgColLen; + } + + public long getMaxColLen() { + return maxColLen; + } + + public void setMaxColLen(long maxColLen) { + this.maxColLen = maxColLen; + } + + public long getNumTrues() { + return numTrues; + } + + public void setNumTrues(long numTrues) { + this.numTrues = numTrues; + } + + public long getNumFalses() { + return numFalses; + } + + public void setNumFalses(long numFalses) { + this.numFalses = numFalses; + } + + public long getLastAnalyzed() { + return lastAnalyzed; + } + + public void setLastAnalyzed(long lastAnalyzed) { + this.lastAnalyzed = lastAnalyzed; + } + + public String getDbName() { + return dbName; + } + + public void setDbName(String dbName) { + this.dbName = dbName; + } + + public void setBooleanStats(long numTrues, long numFalses, long numNulls) { + this.numTrues = numTrues; + this.numFalses = numFalses; + this.numNulls = numNulls; + } + + public void setLongStats(long numNulls, long numNDVs, long lowValue, long highValue) { + this.numNulls = numNulls; + this.numDVs = numNDVs; + byte[] bytes = ByteBuffer.allocate(Long.SIZE/8).putLong(lowValue).array(); + this.lowValue = bytes; + bytes = ByteBuffer.allocate(Long.SIZE/8).putLong(highValue).array(); + this.highValue = bytes; + } + + public void setDoubleStats(long numNulls, long numNDVs, double lowValue, double highValue) { + this.numNulls = numNulls; + this.numDVs = numNDVs; + byte[] bytes = ByteBuffer.allocate(Double.SIZE/8).putDouble(lowValue).array(); + 
this.lowValue = bytes; + bytes = ByteBuffer.allocate(Double.SIZE/8).putDouble(highValue).array(); + this.highValue = bytes; + } + + public void setStringStats(long numNulls, long numNDVs, long maxColLen, double avgColLen) { + this.numNulls = numNulls; + this.numDVs = numNDVs; + this.maxColLen = maxColLen; + this.avgColLen = avgColLen; + } + + public void setBinaryStats(long numNulls, long maxColLen, double avgColLen) { + this.numNulls = numNulls; + this.maxColLen = maxColLen; + this.avgColLen = avgColLen; + } +} \ No newline at end of file diff --git metastore/src/model/package.jdo metastore/src/model/package.jdo index 38ce6d5..703b110 100644 --- metastore/src/model/package.jdo +++ metastore/src/model/package.jdo @@ -751,5 +751,104 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git metastore/src/test/org/apache/hadoop/hive/metastore/DummyRawStoreForJdoConnection.java metastore/src/test/org/apache/hadoop/hive/metastore/DummyRawStoreForJdoConnection.java index 528a100..ce7bd65 100644 --- metastore/src/test/org/apache/hadoop/hive/metastore/DummyRawStoreForJdoConnection.java +++ metastore/src/test/org/apache/hadoop/hive/metastore/DummyRawStoreForJdoConnection.java @@ -25,8 +25,10 @@ import junit.framework.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.Index; +import org.apache.hadoop.hive.metastore.api.InvalidInputException; import org.apache.hadoop.hive.metastore.api.InvalidObjectException; import org.apache.hadoop.hive.metastore.api.InvalidPartitionException; import org.apache.hadoop.hive.metastore.api.MetaException; @@ -512,4 +514,45 @@ public class DummyRawStoreForJdoConnection implements RawStore { return 0; } + @Override + public ColumnStatistics getTableColumnStatistics(String dbName, String tableName, String colName) + throws MetaException, NoSuchObjectException { + return null; + } + + + @Override + public boolean deleteTableColumnStatistics(String dbName, String tableName, + String colName) + throws NoSuchObjectException, MetaException, InvalidObjectException { + return false; + } + + + public boolean deletePartitionColumnStatistics(String dbName, String tableName, + String partName, List partVals, String colName) + throws NoSuchObjectException, MetaException, InvalidObjectException, InvalidInputException { + return false; + + } + + @Override + public ColumnStatistics getPartitionColumnStatistics(String dbName, String tableName, String partName, + List partVal, String colName) throws MetaException, NoSuchObjectException, + InvalidInputException, InvalidObjectException { + return null; + } + + @Override + public boolean updateTableColumnStatistics(ColumnStatistics statsObj) + throws NoSuchObjectException, MetaException, InvalidObjectException { + return false; + } + + public boolean updatePartitionColumnStatistics(ColumnStatistics statsObj,List partVals) + throws NoSuchObjectException, MetaException, InvalidObjectException { + return false; + } } + + diff --git metastore/src/test/org/apache/hadoop/hive/metastore/TestHiveMetaStore.java metastore/src/test/org/apache/hadoop/hive/metastore/TestHiveMetaStore.java index 925938d..e857002 100644 --- metastore/src/test/org/apache/hadoop/hive/metastore/TestHiveMetaStore.java +++ 
metastore/src/test/org/apache/hadoop/hive/metastore/TestHiveMetaStore.java @@ -37,8 +37,13 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.ConfigValSecurityException; import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.InvalidObjectException; import org.apache.hadoop.hive.metastore.api.InvalidOperationException; @@ -49,6 +54,7 @@ import org.apache.hadoop.hive.metastore.api.Partition; import org.apache.hadoop.hive.metastore.api.SerDeInfo; import org.apache.hadoop.hive.metastore.api.SkewedInfo; import org.apache.hadoop.hive.metastore.api.StorageDescriptor; +import org.apache.hadoop.hive.metastore.api.StringColumnStatsData; import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.api.Type; import org.apache.hadoop.hive.metastore.api.UnknownDBException; @@ -1228,6 +1234,187 @@ public abstract class TestHiveMetaStore extends TestCase { } } + public void testColumnStatistics() throws Throwable { + + String dbName = "columnstatstestdb"; + String tblName = "tbl"; + String typeName = "Person"; + String tblOwner = "testowner"; + int lastAccessed = 6796; + + try { + cleanUp(dbName, tblName, typeName); + Database db = new Database(); + db.setName(dbName); + client.createDatabase(db); + createTableForTestFilter(dbName,tblName, tblOwner, lastAccessed, true); + + // Create a ColumnStatistics Obj + String[] colName = new String[]{"income", "name"}; + double lowValue = 50000.21; + double highValue = 1200000.4525; + long numNulls = 3; + long numDVs = 22; + double avgColLen = 50.30; + long maxColLen = 102; + String[] colType = new String[] {"double", "string"}; + boolean isTblLevel = true; + String partName = null; + List statsObjs = new ArrayList(); + + ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(); + statsDesc.setDbName(dbName); + statsDesc.setTableName(tblName); + statsDesc.setIsTblLevel(isTblLevel); + statsDesc.setPartName(partName); + + ColumnStatisticsObj statsObj = new ColumnStatisticsObj(); + statsObj.setColName(colName[0]); + statsObj.setColType(colType[0]); + + ColumnStatisticsData statsData = new ColumnStatisticsData(); + DoubleColumnStatsData numericStats = new DoubleColumnStatsData(); + statsData.setDoubleStats(numericStats); + + statsData.getDoubleStats().setHighValue(highValue); + statsData.getDoubleStats().setLowValue(lowValue); + statsData.getDoubleStats().setNumDVs(numDVs); + statsData.getDoubleStats().setNumNulls(numNulls); + + statsObj.setStatsData(statsData); + statsObjs.add(statsObj); + + statsObj = new ColumnStatisticsObj(); + statsObj.setColName(colName[1]); + statsObj.setColType(colType[1]); + + statsData = new ColumnStatisticsData(); + StringColumnStatsData stringStats = new StringColumnStatsData(); + statsData.setStringStats(stringStats); + statsData.getStringStats().setAvgColLen(avgColLen); + statsData.getStringStats().setMaxColLen(maxColLen); + statsData.getStringStats().setNumDVs(numDVs); + 
statsData.getStringStats().setNumNulls(numNulls); + + statsObj.setStatsData(statsData); + statsObjs.add(statsObj); + + ColumnStatistics colStats = new ColumnStatistics(); + colStats.setStatsDesc(statsDesc); + colStats.setStatsObj(statsObjs); + + // write stats objs persistently + client.updateTableColumnStatistics(colStats); + + // retrieve the stats obj that was just written + ColumnStatistics colStats2 = client.getTableColumnStatistics(dbName, tblName, colName[0]); + + // compare stats obj to ensure what we get is what we wrote + assertNotNull(colStats2); + assertEquals(colStats2.getStatsDesc().getDbName(), dbName); + assertEquals(colStats2.getStatsDesc().getTableName(), tblName); + assertEquals(colStats2.getStatsObj().get(0).getColName(), colName[0]); + assertEquals(colStats2.getStatsObj().get(0).getStatsData().getDoubleStats().getLowValue(), + lowValue); + assertEquals(colStats2.getStatsObj().get(0).getStatsData().getDoubleStats().getHighValue(), + highValue); + assertEquals(colStats2.getStatsObj().get(0).getStatsData().getDoubleStats().getNumNulls(), + numNulls); + assertEquals(colStats2.getStatsObj().get(0).getStatsData().getDoubleStats().getNumDVs(), + numDVs); + assertEquals(colStats2.getStatsDesc().isIsTblLevel(), isTblLevel); + + // test delete column stats; if no col name is passed all column stats associated with the + // table is deleted + boolean status = client.deleteTableColumnStatistics(dbName, tblName, null); + assertTrue(status); + // try to query stats for a column for which stats doesn't exist + try { + colStats2 = client.getTableColumnStatistics(dbName, tblName, colName[1]); + assertTrue(true); + } catch (NoSuchObjectException e) { + System.out.println("Statistics for column=" + colName[1] + " not found"); + } + + colStats.setStatsDesc(statsDesc); + colStats.setStatsObj(statsObjs); + + // update table level column stats + client.updateTableColumnStatistics(colStats); + + // query column stats for column whose stats were updated in the previous call + colStats2 = client.getTableColumnStatistics(dbName, tblName, colName[0]); + + // partition level column statistics test + // create a table with multiple partitions + cleanUp(dbName, tblName, typeName); + + List> values = new ArrayList>(); + values.add(makeVals("2008-07-01 14:13:12", "14")); + values.add(makeVals("2008-07-01 14:13:12", "15")); + values.add(makeVals("2008-07-02 14:13:12", "15")); + values.add(makeVals("2008-07-03 14:13:12", "151")); + + createMultiPartitionTableSchema(dbName, tblName, typeName, values); + + List partitions = client.listPartitionNames(dbName, tblName, (short)-1); + + partName = partitions.get(0); + isTblLevel = false; + + // create a new columnstatistics desc to represent partition level column stats + statsDesc = new ColumnStatisticsDesc(); + statsDesc.setDbName(dbName); + statsDesc.setTableName(tblName); + statsDesc.setPartName(partName); + statsDesc.setIsTblLevel(isTblLevel); + + colStats = new ColumnStatistics(); + colStats.setStatsDesc(statsDesc); + colStats.setStatsObj(statsObjs); + + client.updatePartitionColumnStatistics(colStats); + + colStats2 = client.getPartitionColumnStatistics(dbName, tblName, partName, colName[1]); + + // compare stats obj to ensure what we get is what we wrote + assertNotNull(colStats2); + assertEquals(colStats2.getStatsDesc().getDbName(), dbName); + assertEquals(colStats2.getStatsDesc().getTableName(), tblName); + assertEquals(colStats.getStatsDesc().getPartName(), partName); + assertEquals(colStats2.getStatsObj().get(0).getColName(), colName[1]); + 
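The partition-level descriptor in this test is keyed by the partition name string returned from listPartitionNames, i.e. "key=value" components joined by "/", which Warehouse.makeEscSpecFromName (added earlier in this patch) turns back into an ordered key/value spec by walking the Path components against a precompiled pattern. A simplified sketch of that conversion follows; the class, method, and example name are illustrative and it ignores any escaping of the values.

import java.util.LinkedHashMap;
import java.util.Map;

public class PartNameSpecSketch {

  // Splits a partition name like "ds=2008-07-01/hr=14" into an ordered
  // key/value map, keeping the keys in the order they appear in the name.
  public static Map<String, String> makeSpec(String partName) {
    LinkedHashMap<String, String> spec = new LinkedHashMap<String, String>();
    for (String component : partName.split("/")) {
      int eq = component.indexOf('=');
      if (eq > 0) {
        spec.put(component.substring(0, eq), component.substring(eq + 1));
      }
    }
    return spec;
  }

  public static void main(String[] args) {
    System.out.println(makeSpec("ds=2008-07-01/hr=14")); // {ds=2008-07-01, hr=14}
  }
}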
assertEquals(colStats2.getStatsObj().get(0).getStatsData().getStringStats().getMaxColLen(), + maxColLen); + assertEquals(colStats2.getStatsObj().get(0).getStatsData().getStringStats().getAvgColLen(), + avgColLen); + assertEquals(colStats2.getStatsObj().get(0).getStatsData().getStringStats().getNumNulls(), + numNulls); + assertEquals(colStats2.getStatsObj().get(0).getStatsData().getStringStats().getNumDVs(), + numDVs); + assertEquals(colStats2.getStatsDesc().isIsTblLevel(), isTblLevel); + + // test stats deletion at partition level + client.deletePartitionColumnStatistics(dbName, tblName, partName, colName[1]); + + colStats2 = client.getPartitionColumnStatistics(dbName, tblName, partName, colName[0]); + + // test get stats on a column for which stats doesn't exist + try { + colStats2 = client.getPartitionColumnStatistics(dbName, tblName, partName, colName[1]); + assertTrue(true); + } catch (NoSuchObjectException e) { + System.out.println("Statistics for column=" + colName[1] + " not found"); + } + + } catch (Exception e) { + System.err.println(StringUtils.stringifyException(e)); + System.err.println("testColumnStatistics() failed."); + throw e; + } finally { + cleanUp(dbName, tblName, typeName); + } + } + public void testAlterTable() throws Exception { String dbName = "alterdb"; String invTblName = "alter-tbl"; diff --git ql/build.xml ql/build.xml index 5de3f78..58fcf1b 100644 --- ql/build.xml +++ ql/build.xml @@ -207,6 +207,13 @@ + + + + + + + @@ -219,6 +226,7 @@ + diff --git ql/if/queryplan.thrift ql/if/queryplan.thrift index 05fbf58..4427929 100644 --- ql/if/queryplan.thrift +++ ql/if/queryplan.thrift @@ -91,6 +91,7 @@ enum StageType { MOVE, STATS, DEPENDENCY_COLLECTION, + COLUMNSTATS, } struct Stage { diff --git ql/ivy.xml ql/ivy.xml index aa3b8ce..d5a318b 100644 --- ql/ivy.xml +++ ql/ivy.xml @@ -73,6 +73,7 @@ transitive="false"/> + implements Serializable { + private static final long serialVersionUID = 1L; + + private FetchOperator ftOp; + private static transient final Log LOG = LogFactory.getLog(ColumnStatsTask.class); + + public ColumnStatsTask() { + super(); + } + + @Override + public void initialize(HiveConf conf, QueryPlan queryPlan, DriverContext ctx) { + super.initialize(conf, queryPlan, ctx); + try { + JobConf job = new JobConf(conf, ExecDriver.class); + ftOp = new FetchOperator(work.getfWork(), job); + + } catch (Exception e) { + LOG.error(StringUtils.stringifyException(e)); + throw new RuntimeException(e); + } + } + + private void unpackBooleanStats(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj statsObj) { + long v = ((LongObjectInspector) oi).get(o); + if (fName.equals("counttrues")) { + statsObj.getStatsData().getBooleanStats().setNumTrues(v); + } else if (fName.equals("countfalses")) { + statsObj.getStatsData().getBooleanStats().setNumFalses(v); + } else if (fName.equals("countnulls")) { + statsObj.getStatsData().getBooleanStats().setNumNulls(v); + } + } + + private void unpackDoubleStats(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj statsObj) { + if (fName.equals("countnulls")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getDoubleStats().setNumNulls(v); + } else if (fName.equals("numdistinctvalues")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getDoubleStats().setNumDVs(v); + } else if (fName.equals("max")) { + double d = ((DoubleObjectInspector) oi).get(o); + statsObj.getStatsData().getDoubleStats().setHighValue(d); + } else if (fName.equals("min")) { + double d = 
((DoubleObjectInspector) oi).get(o); + statsObj.getStatsData().getDoubleStats().setLowValue(d); + } + } + + private void unpackLongStats(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj statsObj) { + if (fName.equals("countnulls")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getLongStats().setNumNulls(v); + } else if (fName.equals("numdistinctvalues")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getLongStats().setNumDVs(v); + } else if (fName.equals("max")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getLongStats().setHighValue(v); + } else if (fName.equals("min")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getLongStats().setLowValue(v); + } + } + + private void unpackStringStats(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj statsObj) { + if (fName.equals("countnulls")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getStringStats().setNumNulls(v); + } else if (fName.equals("numdistinctvalues")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getStringStats().setNumDVs(v); + } else if (fName.equals("avglength")) { + double d = ((DoubleObjectInspector) oi).get(o); + statsObj.getStatsData().getStringStats().setAvgColLen(d); + } else if (fName.equals("maxlength")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getStringStats().setMaxColLen(v); + } + } + + private void unpackBinaryStats(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj statsObj) { + if (fName.equals("countnulls")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getBinaryStats().setNumNulls(v); + } else if (fName.equals("avglength")) { + double d = ((DoubleObjectInspector) oi).get(o); + statsObj.getStatsData().getBinaryStats().setAvgColLen(d); + } else if (fName.equals("maxlength")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getBinaryStats().setMaxColLen(v); + } + } + + private void unpackPrimitiveObject (ObjectInspector oi, Object o, String fieldName, + ColumnStatisticsObj statsObj) { + // First infer the type of object + if (fieldName.equals("columntype")) { + PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; + String s = ((StringObjectInspector) poi).getPrimitiveJavaObject(o); + statsObj.setColType(s.toLowerCase()); + + ColumnStatisticsData statsData = new ColumnStatisticsData(); + if (s.equalsIgnoreCase("long")) { + LongColumnStatsData longStats = new LongColumnStatsData(); + statsData.setLongStats(longStats); + statsObj.setStatsData(statsData); + } else if (s.equalsIgnoreCase("double")) { + DoubleColumnStatsData doubleStats = new DoubleColumnStatsData(); + statsData.setDoubleStats(doubleStats); + statsObj.setStatsData(statsData); + } else if (s.equalsIgnoreCase("string")) { + StringColumnStatsData stringStats = new StringColumnStatsData(); + statsData.setStringStats(stringStats); + statsObj.setStatsData(statsData); + } else if (s.equalsIgnoreCase("boolean")) { + BooleanColumnStatsData booleanStats = new BooleanColumnStatsData(); + statsData.setBooleanStats(booleanStats); + statsObj.setStatsData(statsData); + } else if (s.equalsIgnoreCase("binary")) { + BinaryColumnStatsData binaryStats = new BinaryColumnStatsData(); + statsData.setBinaryStats(binaryStats); + statsObj.setStatsData(statsData); + } + } else { + // invoke the right unpack method depending on data type of the column + if 
(statsObj.getColType().equals("boolean")) { + unpackBooleanStats(oi, o, fieldName, statsObj); + } else if (statsObj.getColType().equals("long")) { + unpackLongStats(oi, o, fieldName, statsObj); + } else if (statsObj.getColType().equals("double")) { + unpackDoubleStats(oi,o,fieldName, statsObj); + } else if (statsObj.getColType().equals("string")) { + unpackStringStats(oi, o, fieldName, statsObj); + } else if (statsObj.getColType().equals("binary")) { + unpackBinaryStats(oi, o, fieldName, statsObj); + } + } + } + + private void unpackStructObject(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj cStatsObj) { + if (oi.getCategory() != ObjectInspector.Category.STRUCT) { + throw new RuntimeException("Invalid object datatype : " + oi.getCategory().toString()); + } + + StructObjectInspector soi = (StructObjectInspector) oi; + List fields = soi.getAllStructFieldRefs(); + List list = soi.getStructFieldsDataAsList(o); + + for (int i = 0; i < fields.size(); i++) { + // Get the field objectInspector, fieldName and the field object. + ObjectInspector foi = fields.get(i).getFieldObjectInspector(); + Object f = (list == null ? null : list.get(i)); + String fieldName = fields.get(i).getFieldName(); + + if (foi.getCategory() == ObjectInspector.Category.PRIMITIVE) { + unpackPrimitiveObject(foi, f, fieldName, cStatsObj); + } else { + unpackStructObject(foi, f, fieldName, cStatsObj); + } + } + } + + private ColumnStatistics constructColumnStatsFromPackedRow(ObjectInspector oi, + Object o) throws HiveException { + if (oi.getCategory() != ObjectInspector.Category.STRUCT) { + throw new HiveException("Unexpected object type encountered while unpacking row"); + } + + String dbName = db.getCurrentDatabase(); + String tableName = work.getColStats().getTableName(); + String partName = null; + String[] colName = work.getColStats().getColName(); + boolean isTblLevel = work.getColStats().isTblLevel(); + + if (!isTblLevel) { + partName = constructPartName(); + } + + ColumnStatisticsDesc statsDesc = getColumnStatsDesc(dbName, tableName, partName, isTblLevel); + + List statsObjs = new ArrayList(); + StructObjectInspector soi = (StructObjectInspector) oi; + List fields = soi.getAllStructFieldRefs(); + List list = soi.getStructFieldsDataAsList(o); + + for (int i = 0; i < fields.size(); i++) { + + // Get the field objectInspector, fieldName and the field object. + ObjectInspector foi = fields.get(i).getFieldObjectInspector(); + Object f = (list == null ? 
null : list.get(i)); + String fieldName = fields.get(i).getFieldName(); + ColumnStatisticsObj statsObj = new ColumnStatisticsObj(); + statsObj.setColName(colName[i]); + unpackStructObject(foi, f, fieldName, statsObj); + statsObjs.add(statsObj); + } + + ColumnStatistics colStats = new ColumnStatistics(); + colStats.setStatsDesc(statsDesc); + colStats.setStatsObj(statsObjs); + return colStats; + } + + private String constructPartName() { + String[] partKeys = work.getColStats().getPartKeys(); + String partitionName = new String(); + int length = partKeys.length; + + for (int i = 0; i < partKeys.length; i++) { + partitionName += partKeys[i]; + + if (i != length - 1) { + partitionName = partitionName + "/"; + } + } + return partitionName; + } + + private ColumnStatisticsDesc getColumnStatsDesc(String dbName, String tableName, + String partName, boolean isTblLevel) + { + ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(); + statsDesc.setDbName(dbName); + statsDesc.setTableName(tableName); + statsDesc.setIsTblLevel(isTblLevel); + + if (!isTblLevel) { + partName = constructPartName(); + statsDesc.setPartName(partName); + } else { + statsDesc.setPartName(null); + } + return statsDesc; + } + + private int persistPartitionStats() throws HiveException { + InspectableObject io = null; + + try { + io = fetchColumnStats(); + } catch (IOException e) { + e.printStackTrace(); + } catch (CommandNeedRetryException e) { + e.printStackTrace(); + } + + // Manufacture a column stats object + ColumnStatistics colStats = constructColumnStatsFromPackedRow(io.oi, io.o); + + // Persist stats to metastore + try { + db.updatePartitionColumnStatistics(colStats); + } catch (Exception e) { + e.printStackTrace(); + } + return 0; + } + + private int persistTableStats() throws HiveException { + InspectableObject io = null; + + // fetch() + try { + io = fetchColumnStats(); + } catch (IOException e) { + e.printStackTrace(); + } catch (CommandNeedRetryException e) { + e.printStackTrace(); + } + + ColumnStatistics colStats = constructColumnStatsFromPackedRow(io.oi, io.o); + + // Persist stats to metastore + try { + db.updateTableColumnStatistics(colStats); + } catch (Exception e) { + e.printStackTrace(); + } + return 0; + } + + @Override + public int execute(DriverContext driverContext) { + try { + if (work.getColStats().isTblLevel()) { + return persistTableStats(); + } else { + return persistPartitionStats(); + } + } catch (Exception e) { + e.printStackTrace(); + } + return 1; + } + + private InspectableObject fetchColumnStats() throws IOException, CommandNeedRetryException { + try { + InspectableObject io = ftOp.getNextRow(); + if (io == null) { + throw new CommandNeedRetryException(); + } + return io; + } catch (CommandNeedRetryException e) { + throw e; + } catch (IOException e) { + throw e; + } catch (Exception e) { + throw new IOException(e); + } + } + + @Override + public StageType getType() { + return StageType.COLUMNSTATS; + } + + @Override + public String getName() { + return "COLUMNSTATS TASK"; + } + + @Override + protected void localizeMRTmpFilesImpl(Context ctx) { + FetchWork fWork = work.getfWork(); + String s = fWork.getTblDir(); + + if ((s != null) && ctx.isMRTmpFileURI(s)) { + fWork.setTblDir(ctx.localizeMRTmpFileURI(s)); + } + + ArrayList ls = fWork.getPartDir(); + if (ls != null) { + ctx.localizePaths(ls); + } + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index 
425900d..3c628eb 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -129,6 +129,7 @@ import org.apache.hadoop.hive.ql.udf.UDFYear; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFAverage; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFBridge; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCollectSet; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFComputeStats; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFContextNGrams; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCorrelation; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount; @@ -158,7 +159,6 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFCase; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFCoalesce; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFConcatWS; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFSortArray; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFEWAHBitmapAnd; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFEWAHBitmapEmpty; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFEWAHBitmapOr; @@ -169,8 +169,8 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFFromUtcTimestamp; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFHash; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIf; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIndex; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFInFile; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIndex; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFInstr; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLocate; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFMap; @@ -193,6 +193,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFPrintf; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFReflect; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFSentences; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFSize; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFSortArray; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFSplit; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStringToMap; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct; @@ -227,10 +228,8 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.util.ReflectionUtils; - import org.w3c.dom.Document; import org.w3c.dom.Element; -import org.w3c.dom.Node; import org.w3c.dom.NodeList; /** @@ -410,7 +409,6 @@ public final class FunctionRegistry { registerGenericUDAF("sum", new GenericUDAFSum()); registerGenericUDAF("count", new GenericUDAFCount()); registerGenericUDAF("avg", new GenericUDAFAverage()); - registerGenericUDAF("std", new GenericUDAFStd()); registerGenericUDAF("stddev", new GenericUDAFStd()); registerGenericUDAF("stddev_pop", new GenericUDAFStd()); @@ -430,6 +428,8 @@ public final class FunctionRegistry { registerGenericUDAF("ewah_bitmap", new GenericUDAFEWAHBitmap()); + registerGenericUDAF("compute_stats" , new GenericUDAFComputeStats()); + registerUDAF("percentile", UDAFPercentile.class); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java index 4446952..b380a69 100644 --- 
ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java @@ -81,6 +81,9 @@ public abstract class Task implements Serializable, Node // Descendants tasks who subscribe feeds from this task protected transient List> feedSubscribers; + protected String id; + protected T work; + public static enum FeedType { DYNAMIC_PARTITIONS, // list of dynamic partitions }; @@ -324,8 +327,7 @@ public abstract class Task implements Serializable, Node return isrunnable; } - protected String id; - protected T work; + public void setWork(T work) { this.work = work; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java index 79b87f1..a47a180 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.rcfile.merge.BlockMergeTask; import org.apache.hadoop.hive.ql.io.rcfile.merge.MergeWork; +import org.apache.hadoop.hive.ql.plan.ColumnStatsWork; import org.apache.hadoop.hive.ql.plan.ConditionalWork; import org.apache.hadoop.hive.ql.plan.CopyWork; import org.apache.hadoop.hive.ql.plan.DDLWork; @@ -77,6 +78,7 @@ public final class TaskFactory { MapredLocalTask.class)); taskvec.add(new taskTuple(StatsWork.class, StatsTask.class)); + taskvec.add(new taskTuple(ColumnStatsWork.class, ColumnStatsTask.class)); taskvec.add(new taskTuple(MergeWork.class, BlockMergeTask.class)); taskvec.add(new taskTuple(DependencyCollectionWork.class, diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java index 7440889..b87968d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java @@ -31,13 +31,13 @@ import static org.apache.hadoop.hive.serde.Constants.STRING_TYPE_NAME; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; -import java.util.HashSet; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -56,6 +56,7 @@ import org.apache.hadoop.hive.metastore.MetaStoreUtils; import org.apache.hadoop.hive.metastore.TableType; import org.apache.hadoop.hive.metastore.Warehouse; import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; import org.apache.hadoop.hive.metastore.api.Constants; import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.FieldSchema; @@ -63,7 +64,6 @@ import org.apache.hadoop.hive.metastore.api.HiveObjectPrivilege; import org.apache.hadoop.hive.metastore.api.HiveObjectRef; import org.apache.hadoop.hive.metastore.api.HiveObjectType; import org.apache.hadoop.hive.metastore.api.Index; -import org.apache.hadoop.hive.metastore.api.InvalidObjectException; import org.apache.hadoop.hive.metastore.api.InvalidOperationException; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; @@ -2176,6 +2176,24 @@ public class Hive { return indexes; } + public boolean updateTableColumnStatistics(ColumnStatistics statsObj) throws HiveException { + try { + 
return getMSC().updateTableColumnStatistics(statsObj); + } catch (Exception e) { + LOG.error(StringUtils.stringifyException(e)); + throw new HiveException(e); + } + } + + public boolean updatePartitionColumnStatistics(ColumnStatistics statsObj) throws HiveException { + try { + return getMSC().updatePartitionColumnStatistics(statsObj); + } catch (Exception e) { + LOG.error(StringUtils.stringifyException(e)); + throw new HiveException(e); + } + } + public Table newTable(String tableName) throws HiveException { String[] names = getQualifiedNames(tableName); switch (names.length) { diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteParseContextGenerator.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteParseContextGenerator.java index 0b55ac4..dee7d7e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteParseContextGenerator.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteParseContextGenerator.java @@ -35,6 +35,7 @@ import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; import org.apache.hadoop.hive.ql.parse.SemanticAnalyzerFactory; import org.apache.hadoop.hive.ql.parse.SemanticException; + /** * RewriteParseContextGenerator is a class that offers methods to generate operator tree * for input queries. It is implemented on lines of the analyzeInternal(..) method @@ -108,7 +109,7 @@ public final class RewriteParseContextGenerator { ASTNode child = ast; ParseContext subPCtx = ((SemanticAnalyzer) sem).getParseContext(); subPCtx.setContext(ctx); - ((SemanticAnalyzer) sem).init(subPCtx); + ((SemanticAnalyzer) sem).initParseCtx(subPCtx); LOG.info("Starting Sub-query Semantic Analysis"); sem.doPhase1(child, qb, sem.initPhase1Ctx()); diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java index 344dc69..0e0e32e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java @@ -240,9 +240,15 @@ public abstract class BaseSemanticAnalyzer { } public abstract void analyzeInternal(ASTNode ast) throws SemanticException; + public abstract void init(); - public void analyze(ASTNode ast, Context ctx) throws SemanticException { + public void initCtx(Context ctx) { this.ctx = ctx; + } + + public void analyze(ASTNode ast, Context ctx) throws SemanticException { + initCtx(ctx); + init(); analyzeInternal(ast); } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java index f7257cd..e7e4f24 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java @@ -19,9 +19,7 @@ package org.apache.hadoop.hive.ql.parse; import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_CASCADE; -import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_DATABASECOMMENT; import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_IFEXISTS; -import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_IFNOTEXISTS; import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_SHOWDATABASES; import java.io.Serializable; @@ -647,10 +645,10 @@ public class DDLSemanticAnalyzer extends BaseSemanticAnalyzer { for (int i = 1; i < ast.getChildCount(); i++) { ASTNode childNode = (ASTNode) ast.getChild(i); switch (childNode.getToken().getType()) { - case TOK_IFNOTEXISTS: + case 
HiveParser.TOK_IFNOTEXISTS: ifNotExists = true; break; - case TOK_DATABASECOMMENT: + case HiveParser.TOK_DATABASECOMMENT: dbComment = unescapeSQLString(childNode.getChild(0).getText()); break; case HiveParser.TOK_DATABASEPROPERTIES: @@ -2427,4 +2425,8 @@ public class DDLSemanticAnalyzer extends BaseSemanticAnalyzer { } } } + + @Override + public void init() { + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/ExplainSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/ExplainSemanticAnalyzer.java index e75a075..87366cc 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/ExplainSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/ExplainSemanticAnalyzer.java @@ -55,7 +55,7 @@ public class ExplainSemanticAnalyzer extends BaseSemanticAnalyzer { if (ast.getChildCount() == 2) { int explainOptions = ast.getChild(1).getType(); formatted = (explainOptions == HiveParser.KW_FORMATTED); - extended = (explainOptions == HiveParser.KW_EXTENDED); + extended = (explainOptions == HiveParser.KW_EXTENDED); } ctx.setResFile(new Path(ctx.getLocalTmpFileURI())); @@ -86,4 +86,8 @@ public class ExplainSemanticAnalyzer extends BaseSemanticAnalyzer { public List getResultSchema() { return fieldList; } + + @Override + public void init() { + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/ExportSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/ExportSemanticAnalyzer.java index 61bc7fd..9d54f68 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/ExportSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/ExportSemanticAnalyzer.java @@ -20,14 +20,10 @@ package org.apache.hadoop.hive.ql.parse; import java.io.FileNotFoundException; import java.io.IOException; -import java.io.OutputStream; import java.io.Serializable; import java.net.URI; import java.util.List; -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.transform.TransformerException; - import org.antlr.runtime.tree.Tree; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -38,15 +34,8 @@ import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.TaskFactory; import org.apache.hadoop.hive.ql.hooks.ReadEntity; import org.apache.hadoop.hive.ql.hooks.WriteEntity; -import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.Partition; -import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.plan.CopyWork; -import org.apache.thrift.TException; -import org.apache.thrift.TSerializer; -import org.apache.thrift.protocol.TJSONProtocol; -import org.w3c.dom.Document; -import org.w3c.dom.Element; /** * ExportSemanticAnalyzer. @@ -134,4 +123,7 @@ public class ExportSemanticAnalyzer extends BaseSemanticAnalyzer { toURI.getScheme().equals("hdfs") ? 
true : false)); } + @Override + public void init() { + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/FunctionSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/FunctionSemanticAnalyzer.java index 6024dd4..8a59cb4 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/FunctionSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/FunctionSemanticAnalyzer.java @@ -17,9 +17,6 @@ */ package org.apache.hadoop.hive.ql.parse; - -import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_IFEXISTS; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.conf.HiveConf; @@ -64,7 +61,7 @@ public class FunctionSemanticAnalyzer extends BaseSemanticAnalyzer { private void analyzeDropFunction(ASTNode ast) throws SemanticException { String functionName = ast.getChild(0).getText(); - boolean ifExists = (ast.getFirstChildWithType(TOK_IFEXISTS) != null); + boolean ifExists = (ast.getFirstChildWithType(HiveParser.TOK_IFEXISTS) != null); // we want to signal an error if the function doesn't exist and we're // configured not to ignore this boolean throwException = @@ -76,4 +73,8 @@ public class FunctionSemanticAnalyzer extends BaseSemanticAnalyzer { DropFunctionDesc desc = new DropFunctionDesc(functionName); rootTasks.add(TaskFactory.get(new FunctionWork(desc), conf)); } + + @Override + public void init() { + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g index 356779a..585602c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g +++ ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g @@ -843,7 +843,7 @@ descStatement analyzeStatement @init { msgs.push("analyze statement"); } @after { msgs.pop(); } - : KW_ANALYZE KW_TABLE (parttype=tableOrPartition) KW_COMPUTE KW_STATISTICS -> ^(TOK_ANALYZE $parttype) + : KW_ANALYZE KW_TABLE (parttype=tableOrPartition) KW_COMPUTE KW_STATISTICS (KW_FOR KW_COLUMNS statsColumnName=columnNameList)? -> ^(TOK_ANALYZE $parttype $statsColumnName?) 
; showStatement @@ -2431,6 +2431,7 @@ KW_UPDATE: 'UPDATE'; KW_RESTRICT: 'RESTRICT'; KW_CASCADE: 'CASCADE'; KW_SKEWED: 'SKEWED'; +KW_FOR: 'FOR'; // Operators diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/ImportSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/ImportSemanticAnalyzer.java index 09ef969..f428f9b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/ImportSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/ImportSemanticAnalyzer.java @@ -540,4 +540,7 @@ public class ImportSemanticAnalyzer extends BaseSemanticAnalyzer { return null; } + @Override + public void init() { + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java index 22fa20f..3d0de45 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java @@ -304,4 +304,8 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer { childTask.addDependentTask(statTask); } } + + @Override + public void init() { + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/QB.java ql/src/java/org/apache/hadoop/hive/ql/parse/QB.java index a0ccbe6..f737c4e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/QB.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/QB.java @@ -49,6 +49,7 @@ public class QB { private QBJoinTree qbjoin; private String id; private boolean isQuery; + private boolean isAnalyzeRewrite; private CreateTableDesc tblDesc = null; // table descriptor of the final // results @@ -222,5 +223,14 @@ public class QB { skewedColNames = tbl.getSkewedColNames(); } return skewedColNames; + + } + + public boolean isAnalyzeRewrite() { + return isAnalyzeRewrite; + } + + public void setAnalyzeRewrite(boolean isAnalyzeRewrite) { + this.isAnalyzeRewrite = isAnalyzeRewrite; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java index b38c002..4df3b7f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java @@ -57,6 +57,12 @@ public class QBParseInfo { private final HashMap tableSpecs; // used for statistics + private String tableName; // used for column statistics + private String[] colName; // used for column statistics + private String[] partName; // used for column statistics + private String[] partNameWoValue; // used for column statistics + private boolean isTblLvl; // used for column statistics + /** * ClusterBy is a short name for both DistributeBy and SortBy. 
*/ @@ -116,6 +122,7 @@ public class QBParseInfo { aliasToLateralViews = new HashMap>(); tableSpecs = new HashMap(); + } public void setAggregationExprsForClause(String clause, @@ -497,4 +504,44 @@ public class QBParseInfo { ORDER_BY_CLAUSE, SORT_BY_CLAUSE } -} + + public String getTableName() { + return tableName; + } + + public void setTableName(String tableName) { + this.tableName = tableName; + } + + public String[] getColName() { + return colName; + } + + public void setColName(String[] colName) { + this.colName = colName; + } + + public String[] getPartName() { + return partName; + } + + public void setPartName(String[] partName) { + this.partName = partName; + } + + public String[] getPartNameWoValue() { + return partNameWoValue; + } + + public void setPartNameWoValue(String[] partNameWoValue) { + this.partNameWoValue = partNameWoValue; + } + + public boolean isTblLvl() { + return isTblLvl; + } + + public void setTblLvl(boolean isTblLvl) { + this.isTblLvl = isTblLvl; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 5ce31f1..4b1e0fa 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -54,6 +54,7 @@ import org.apache.hadoop.hive.ql.QueryProperties; import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator; import org.apache.hadoop.hive.ql.exec.ArchiveUtils; import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.ColumnStatsTask; import org.apache.hadoop.hive.ql.exec.ConditionalTask; import org.apache.hadoop.hive.ql.exec.ExecDriver; import org.apache.hadoop.hive.ql.exec.FetchTask; @@ -116,6 +117,8 @@ import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext; import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec.SpecType; import org.apache.hadoop.hive.ql.parse.QBParseInfo.ClauseType; import org.apache.hadoop.hive.ql.plan.AggregationDesc; +import org.apache.hadoop.hive.ql.plan.ColumnStatsDesc; +import org.apache.hadoop.hive.ql.plan.ColumnStatsWork; import org.apache.hadoop.hive.ql.plan.CreateTableDesc; import org.apache.hadoop.hive.ql.plan.CreateTableLikeDesc; import org.apache.hadoop.hive.ql.plan.CreateViewDesc; @@ -269,7 +272,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { prunedPartitions.clear(); } - public void init(ParseContext pctx) { + public void initParseCtx(ParseContext pctx) { opToPartPruner = pctx.getOpToPartPruner(); opToPartList = pctx.getOpToPartList(); opToSamplePruner = pctx.getOpToSamplePruner(); @@ -816,7 +819,10 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { case HiveParser.TOK_ANALYZE: // Case of analyze command + String table_name = getUnescapedName((ASTNode)ast.getChild(0).getChild(0)); + + qb.setTabAlias(table_name, table_name); qb.addAlias(table_name); qb.getParseInfo().setIsAnalyzeCommand(true); @@ -838,11 +844,13 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { case HiveParser.TOK_INSERT: ASTNode destination = (ASTNode) ast.getChild(0); Tree tab = destination.getChild(0); + // Proceed if AST contains partition & If Not Exists if (destination.getChildCount() == 2 && tab.getChildCount() == 2 && destination.getChild(1).getType() == HiveParser.TOK_IFNOTEXISTS) { String tableName = tab.getChild(0).getChild(0).getText(); + Tree partitions = tab.getChild(1); int childCount = partitions.getChildCount(); HashMap partition = new 
HashMap(); @@ -7163,19 +7171,63 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { } } + /** + * A helper function to generate a column stats task on top of map-red task. The column stats + * task fetches from the output of the map-red task, constructs the column stats object and + * persists it to the metastore. + * + * This method generates a column stats task and sets up the appropriate metadata to be used + * during execution. + * + * @param qb + */ + + private void genColStatsTask(QB qb) { + QBParseInfo qbParseInfo = qb.getParseInfo(); + ColumnStatsTask cStatsTask = null; + ColumnStatsWork cStatsWork = null; + FetchWork fetch = null; + String tableName = qbParseInfo.getTableName(); + String[] partName = qbParseInfo.getPartName(); + String[] partNameWoValue = qbParseInfo.getPartNameWoValue(); + String[] colName = qbParseInfo.getColName(); + boolean isTblLevel = qbParseInfo.isTblLvl(); + + String cols = loadFileWork.get(0).getColumns(); + String colTypes = loadFileWork.get(0).getColumnTypes(); + + String resFileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT); + TableDesc resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat); + + fetch = new FetchWork(new Path(loadFileWork.get(0).getSourceDir()).toString(), + resultTab, qb.getParseInfo().getOuterQueryLimit()); + + ColumnStatsDesc cStatsDesc = new ColumnStatsDesc(tableName, partName, + partNameWoValue, colName, isTblLevel); + cStatsWork = new ColumnStatsWork(fetch, cStatsDesc); + cStatsTask = (ColumnStatsTask) TaskFactory.get(cStatsWork, conf); + + rootTasks.add(cStatsTask); + } + @SuppressWarnings("nls") private void genMapRedTasks(ParseContext pCtx) throws SemanticException { + boolean isCStats = qb.isAnalyzeRewrite(); + if (pCtx.getFetchTask() != null) { // replaced by single fetch task - init(pCtx); + initParseCtx(pCtx); return; } - init(pCtx); + initParseCtx(pCtx); List> mvTask = new ArrayList>(); - // In case of a select, use a fetch task instead of a move task - if (qb.getIsQuery()) { + /* In case of a select, use a fetch task instead of a move task. * If the select is from analyze table column rewrite, don't create a fetch task. Instead create + * a column stats task later. + */ + if (qb.getIsQuery() && !isCStats) { if ((!loadTableWork.isEmpty()) || (loadFileWork.size() != 1)) { throw new SemanticException(ErrorMsg.GENERIC_ERROR.getMsg()); } @@ -7200,8 +7252,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { + ". Doesn't qualify limit optimiztion."); globalLimitCtx.disableOpt(); } - - } else { + } else if (!isCStats) { for (LoadTableDesc ltd : loadTableWork) { Task tsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false), conf); @@ -7220,7 +7271,6 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { } } - boolean oneLoadFile = true; for (LoadFileDesc lfd : loadFileWork) { if (qb.isCTAS()) { @@ -7319,6 +7369,13 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { topNodes.addAll(topOps.values()); ogw.startWalking(topNodes, null); + /* If the query was the result of analyze table column compute statistics rewrite, create + * a column stats task instead of a fetch task to persist stats to the metastore. + */ + if (isCStats) { + genColStatsTask(qb); + } + // reduce sink does not have any kids - since the plan by now has been // broken up into multiple // tasks, iterate over all tasks. 
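For reference, the rewrite performed by StatsSemanticAnalyzer (introduced below) and consumed by genColStatsTask can be sketched against a hypothetical table src(key INT, value STRING) PARTITIONED BY (ds STRING, hr STRING); the table, column, and partition names and the literal '2008-04-08' are illustrative assumptions, not part of this patch, and casing/whitespace are normalized:

    -- Table-level column statistics (isTblLvl = true):
    ANALYZE TABLE src COMPUTE STATISTICS FOR COLUMNS key, value;
    -- is rewritten by StatsSemanticAnalyzer into roughly:
    SELECT compute_stats(key), compute_stats(value) FROM src;

    -- Partition-level column statistics (isTblLvl = false): partition keys given a value
    -- become a WHERE predicate, keys without a value become a GROUP BY:
    ANALYZE TABLE src PARTITION (ds='2008-04-08', hr) COMPUTE STATISTICS FOR COLUMNS key, value;
    -- is rewritten into roughly:
    SELECT compute_stats(key), compute_stats(value) FROM src WHERE ds = '2008-04-08' GROUP BY hr;

The FetchWork wired into ColumnStatsWork above reads the rows produced by this rewritten query, and the ColumnStatsTask built from it constructs the ColumnStatistics objects and persists them through the updateTableColumnStatistics / updatePartitionColumnStatistics methods added to Hive.java earlier in this patch.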
@@ -7595,15 +7652,36 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { return ctx_1; } + public QB optimize(ASTNode child) throws SemanticException { + ParseContext pCtx = new ParseContext(conf, qb, child, opToPartPruner, + opToPartList, topOps, topSelOps, opParseCtx, joinContext, topToTable, + loadTableWork, loadFileWork, ctx, idToTableNameMap, destTableId, uCtx, + listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions, + opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks, opToSkewedPruner); + + Optimizer optm = new Optimizer(); + optm.setPctx(pCtx); + optm.initialize(conf); + pCtx = optm.optimize(); + initParseCtx(pCtx); + return pCtx.getQB(); + } + @Override - @SuppressWarnings("nls") - public void analyzeInternal(ASTNode ast) throws SemanticException { + public void init() { + // clear most members reset(); + // init QB qb = new QB(null, null, false); this.qb = qb; - this.ast = ast; + } + + @Override + @SuppressWarnings("nls") + public void analyzeInternal(ASTNode ast) throws SemanticException { ASTNode child = ast; + this.ast = ast; viewsExpanded = new ArrayList(); LOG.info("Starting Semantic Analysis"); @@ -7646,6 +7724,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { // such as JDBC would prefer instead of the c0, c1 we'll end // up with later. Operator sinkOp = genPlan(qb); + resultSchema = convertRowSchemaToViewSchema(opParseCtx.get(sinkOp).getRowResolver()); @@ -8548,4 +8627,12 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { return conf.getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS); } + + public QB getQB() { + return qb; + } + + public void setQB(QB qb) { + this.qb = qb; + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzerFactory.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzerFactory.java index ad1a14c..fb6a5d7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzerFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzerFactory.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.parse; +import java.io.IOException; import java.util.HashMap; import org.apache.hadoop.hive.conf.HiveConf; @@ -90,6 +91,7 @@ public final class SemanticAnalyzerFactory { commandType.put(HiveParser.TOK_SHOW_ROLE_GRANT, HiveOperation.SHOW_ROLE_GRANT); commandType.put(HiveParser.TOK_ALTERDATABASE_PROPERTIES, HiveOperation.ALTERDATABASE); commandType.put(HiveParser.TOK_DESCDATABASE, HiveOperation.DESCDATABASE); + commandType.put(HiveParser.TOK_ANALYZE, HiveOperation.ANALYZE_TABLE); } static { @@ -196,6 +198,15 @@ public final class SemanticAnalyzerFactory { case HiveParser.TOK_CREATEFUNCTION: case HiveParser.TOK_DROPFUNCTION: return new FunctionSemanticAnalyzer(conf); + + case HiveParser.TOK_ANALYZE: + try { + return new StatsSemanticAnalyzer(conf, tree); + } catch (IOException e) { + e.printStackTrace(); + } catch (ParseException e) { + e.printStackTrace(); + } default: return new SemanticAnalyzer(conf); } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/StatsSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/StatsSemanticAnalyzer.java new file mode 100644 index 0000000..b1d1f74 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/parse/StatsSemanticAnalyzer.java @@ -0,0 +1,381 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.parse; + +import java.io.IOException; +import java.io.Serializable; +import java.util.HashSet; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.QueryProperties; +import org.apache.hadoop.hive.ql.exec.FetchTask; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.hooks.LineageInfo; +import org.apache.hadoop.hive.ql.hooks.ReadEntity; +import org.apache.hadoop.hive.ql.hooks.WriteEntity; + +/** + * StatsSemanticAnalyzer. + * + */ +public class StatsSemanticAnalyzer extends BaseSemanticAnalyzer { + + private static final Log LOG = LogFactory + .getLog(StatsSemanticAnalyzer.class); + + private ASTNode originalTree; + private ASTNode rewrittenTree; + private String rewrittenQuery; + private SemanticAnalyzer newSem; + + private Context ctx; + private boolean isRewritten; + + private String tableName; + private boolean isTblLvl; + private String[] partName; + private String[] partNameWoValue; + private String[] colNames; + + private class PartitionList { + private final String[] partKeys; + private final String[] partValues; + private int numPartitions; + private int numPartitionValues; + + PartitionList(int numPartitions) { + this.numPartitions = numPartitions; + partKeys = new String[numPartitions]; + partValues = new String[numPartitions]; + } + + public int getNumPartitions() { + return numPartitions; + } + + public void setNumPartitions(int numPartitions) { + this.numPartitions = numPartitions; + } + + public String[] getPartValues() { + return partValues; + } + + public String[] getPartKeys() { + return partKeys; + } + + public void addPartValue(String partValue, int index) { + partValues[index] = new String(partValue); + } + + public void addPartKey(String partKey, int index) { + partKeys[index] = new String(partKey); + } + + public int getNumPartValues() { + return numPartitionValues; + } + + public void setNumPartValues(int numPartValues) { + numPartitionValues = numPartValues; + } + } + + public StatsSemanticAnalyzer(HiveConf conf) throws SemanticException { + super(conf); + } + + private boolean isRewritable(ASTNode tree) { + boolean rwt = false; + + if (tree.getChildCount() > 1) { + ASTNode child0 = (ASTNode) tree.getChild(0); + ASTNode child1; + + if (child0.getToken().getType() == HiveParser.TOK_TAB) { + child0 = (ASTNode) child0.getChild(0); + + if (child0.getToken().getType() == HiveParser.TOK_TABNAME) { + + child1 = (ASTNode) tree.getChild(1); + + if (child1.getToken().getType() == HiveParser.TOK_TABCOLNAME) { + rwt = true; + } + } + } + } + return rwt; + } + + private boolean isPartitionLvlStats(ASTNode tree) { + boolean isPartitioned = false; + ASTNode child = (ASTNode) tree.getChild(0); + + if 
(child.getChildCount() > 1) { + child = (ASTNode) child.getChild(1); + + if (child.getToken().getType() == HiveParser.TOK_PARTSPEC) { + isPartitioned = true; + } + } + return isPartitioned; + } + + private String getTableName(ASTNode tree) + { + return getUnescapedName((ASTNode) tree.getChild(0).getChild(0)); + } + + private PartitionList getPartKeyValue(ASTNode tree) + { + ASTNode child = ((ASTNode) tree.getChild(0).getChild(1)); + int numParts = child.getChildCount(); + PartitionList partList = new PartitionList(numParts); + String partKey; + String partValue; + int numPartValue = 0; + + for (int i = 0; i < numParts; i++) { + partKey = new String(getUnescapedName((ASTNode) child.getChild(i).getChild(0))); + if (child.getChild(i).getChildCount() > 1) { + partValue = new String(getUnescapedName((ASTNode) child.getChild(i).getChild(1))); + numPartValue += 1; + } else { + partValue = null; + } + partList.addPartKey(partKey, i); + partList.addPartValue(partValue, i); + } + partList.setNumPartValues(numPartValue); + return partList; + } + + private String[] getColumnName(ASTNode tree) + { + int numCols = tree.getChild(1).getChildCount(); + String[] colName = new String[numCols]; + + for (int i = 0; i < numCols; i++) { + colName[i] = new String(getUnescapedName((ASTNode) tree.getChild(1).getChild(i))); + } + return colName; + } + + private String genPartitionClause(ASTNode tree) { + PartitionList partList = null; + String whereClause = new String(" where "); + boolean predPresent = false; + String groupByClause = new String(" group by "); + boolean aggPresent = false; + String retClause = null; + + partList = getPartKeyValue(tree); + String[] partKeys = partList.getPartKeys(); + String[] partValues = partList.getPartValues(); + int numPartKeys = partList.getNumPartitions(); + int numPartValues = partList.getNumPartValues(); + + partNameWoValue = new String[numPartKeys - numPartValues]; + partName = new String[numPartValues]; + + for (int i = 0, j = 0, k = 0; i < partList.getNumPartitions(); i++) + { + if (partValues[i] != null) { + if (!predPresent) { + whereClause += partKeys[i] + " = " + partValues[i]; + predPresent = true; + } else { + whereClause += " and " + partKeys[i] + " = " + partValues[i]; + } + partName[j] = new String(partKeys[i] + "=" + + partValues[i].substring(1, partValues[i].length() - 1)); + j += 1; + } else { + if (!aggPresent) { + groupByClause += partKeys[i]; + aggPresent = true; + } else { + groupByClause += "," + partKeys[i]; + } + partNameWoValue[k] = new String(partKeys[i]); + k += 1; + } + } + + // attach the predicate and group by to the return clause + if (predPresent) { + retClause = new String(whereClause); + } + if (aggPresent) { + retClause += new String(groupByClause); + } + return retClause; + } + + public StatsSemanticAnalyzer(HiveConf conf, ASTNode tree) + throws SemanticException, IOException, ParseException { + super(conf); + + /* + * Rewrite only analyze table <> column <> compute statistics; Don't rewrite analyze table + * command - table stats are collected by the table scan operator and is not rewritten to + * an aggregation. + */ + if (isRewritable(tree)) { + tableName = new String(getTableName(tree)); + colNames = getColumnName(tree); + + // Save away the original AST + originalTree = tree; + + // Rewrite the query to select compute_stats(...) 
group by + rewrittenQuery = new String("select "); + for (int i = 0; i < colNames.length; i++) { + if (i > 0) { + rewrittenQuery += new String(" , "); + } + rewrittenQuery += new String("compute_stats(") + colNames[i] + new String(")"); + } + rewrittenQuery += " from " + tableName; + isRewritten = true; + + /* + * If partition level stats is requested, add predicate and group by as needed to rewritten + * query + */ + if (isPartitionLvlStats(tree)) { + isTblLvl = false; + rewrittenQuery += genPartitionClause(tree); + } else { + isTblLvl = true; + } + rewrittenQuery = new VariableSubstitution().substitute(conf, rewrittenQuery); + + // Parse the rewritten query string + ctx = new Context(conf); + ctx.setCmd(rewrittenQuery); + + ParseDriver pd = new ParseDriver(); + rewrittenTree = pd.parse(rewrittenQuery, ctx); + rewrittenTree = ParseUtils.findRootNonNullToken(rewrittenTree); + + // Get a semantic analyzer object for the rewritten query + newSem = (SemanticAnalyzer) SemanticAnalyzerFactory.get(conf, rewrittenTree); + } else { + /* + * Not an analyze table column compute statistics stmt - don't do any rewrites + */ + originalTree = rewrittenTree = tree; + rewrittenQuery = null; + isRewritten = false; + newSem = (SemanticAnalyzer) new SemanticAnalyzer(conf); + } + } + + @Override + public void analyzeInternal(ASTNode ast) throws SemanticException { + LOG.info("Invoking analyzeInternal on rewritten query"); + newSem.analyzeInternal(rewrittenTree); + } + + @Override + public void analyze(ASTNode ast, Context origCtx) throws SemanticException { + LOG.info("Invoking analyze on rewritten query"); + + QB qb; + QBParseInfo qbp; + + // initialize QB + newSem.init(); + + // Set up the necessary metadata if originating from analyze rewrite + if (isRewritten) { + qb = newSem.getQB(); + qb.setAnalyzeRewrite(true); + qbp = qb.getParseInfo(); + qbp.setTableName(tableName); + qbp.setTblLvl(isTblLvl); + + if (!isTblLvl) { + qbp.setPartName(partName); + qbp.setPartNameWoValue(partNameWoValue); + } + qbp.setColName(colNames); + newSem.initCtx(ctx); + } else { + newSem.initCtx(origCtx); + } + + // Invoke analyze on the rewritten query + newSem.analyzeInternal(rewrittenTree); + } + + @Override + public void init() { + } + + @Override + public void validate() throws SemanticException { + LOG.info("Invoking validate on rewritten query"); + newSem.validate(); + } + + @Override + public List> getRootTasks() { + LOG.info("Invoking getRootTasks on a rewritten query"); + return newSem.getRootTasks(); + } + + @Override + public HashSet getInputs() { + return newSem.getInputs(); + } + + @Override + public HashSet getOutputs() { + return newSem.getOutputs(); + } + + @Override + public LineageInfo getLineageInfo() { + return newSem.getLineageInfo(); + } + + @Override + public QueryProperties getQueryProperties() { + return newSem.getQueryProperties(); + } + + @Override + public FetchTask getFetchTask() { + return newSem.getFetchTask(); + } + + @Override + public List getResultSchema() { + return newSem.getResultSchema(); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsDesc.java new file mode 100644 index 0000000..c33d90c --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsDesc.java @@ -0,0 +1,86 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.plan; + +import java.io.Serializable; + +/** + * Contains the information needed to persist column level statistics + */ +public class ColumnStatsDesc extends DDLDesc implements Serializable, Cloneable { + + private static final long serialVersionUID = 1L; + private boolean isTblLevel; + private String tableName; + private String[] partKeys; + private String[] partKeysWoValue; + private String[] colName; + + public ColumnStatsDesc() { + } + + public ColumnStatsDesc(String tableName, + String[] partKeys, String[] partKeysWoValue, + String[] colName, boolean isTblLevel) { + this.tableName = tableName; + this.colName = colName; + this.partKeys = partKeys; + this.partKeysWoValue = partKeysWoValue; + this.isTblLevel = isTblLevel; + } + + public String getTableName() { + return tableName; + } + + public void setTableName(String tableName) { + this.tableName = tableName; + } + + public boolean isTblLevel() { + return isTblLevel; + } + + public void setTblLevel(boolean isTblLevel) { + this.isTblLevel = isTblLevel; + } + + public String[] getPartKeys() { + return partKeys; + } + + public void setPartKeys(String[] partKeys) { + this.partKeys = partKeys; + } + + public String[] getColName() { + return colName; + } + + public void setColName(String[] colName) { + this.colName = colName; + } + + public String[] getPartKeysWoValue() { + return partKeysWoValue; + } + + public void setPartKeysWoValue(String[] partKeysWoValue) { + this.partKeysWoValue = partKeysWoValue; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsWork.java new file mode 100644 index 0000000..0a2c2a8 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsWork.java @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import java.io.Serializable; + +/** + * ColumnStats Work. 
+ * + */ +@Explain(displayName = "Column Stats Work") +public class ColumnStatsWork implements Serializable { + private static final long serialVersionUID = 1L; + private FetchWork fWork; + private ColumnStatsDesc colStats; + + public ColumnStatsWork() { + } + + public ColumnStatsWork(FetchWork work, ColumnStatsDesc colStats) { + this.fWork = work; + this.setColStats(colStats); + } + + @Override + public String toString() { + String ret; + ret = fWork.toString(); + return ret; + } + + public FetchWork getfWork() { + return fWork; + } + + public void setfWork(FetchWork fWork) { + this.fWork = fWork; + } + + public ColumnStatsDesc getColStats() { + return colStats; + } + + public void setColStats(ColumnStatsDesc colStats) { + this.colStats = colStats; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/HiveOperation.java ql/src/java/org/apache/hadoop/hive/ql/plan/HiveOperation.java index cb54753..b19e44a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/HiveOperation.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/HiveOperation.java @@ -49,6 +49,7 @@ public enum HiveOperation { ALTERTABLE_SERDEPROPERTIES("ALTERTABLE_SERDEPROPERTIES", new Privilege[]{Privilege.ALTER_METADATA}, null), ALTERPARTITION_SERDEPROPERTIES("ALTERPARTITION_SERDEPROPERTIES", new Privilege[]{Privilege.ALTER_METADATA}, null), ALTERTABLE_CLUSTER_SORT("ALTERTABLE_CLUSTER_SORT", new Privilege[]{Privilege.ALTER_METADATA}, null), + ANALYZE_TABLE("ANALYZE_TABLE", null, null), SHOWDATABASES("SHOWDATABASES", new Privilege[]{Privilege.SHOW_DATABASE}, null), SHOWTABLES("SHOWTABLES", null, null), SHOWCOLUMNS("SHOWCOLUMNS", null, null), diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DoubleNumDistinctValueEstimator.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DoubleNumDistinctValueEstimator.java new file mode 100644 index 0000000..4837c78 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DoubleNumDistinctValueEstimator.java @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.udf.generic; + +public class DoubleNumDistinctValueEstimator extends NumDistinctValueEstimator { + + public DoubleNumDistinctValueEstimator() { + super(); + } + + public DoubleNumDistinctValueEstimator(String s) { + super(s); + } + + public void addToEstimator(double d) { + int v = new Double(d).hashCode(); + super.addToEstimator(v); + } + + public void addToEstimatorPCSA(double d) { + int v = new Double(d).hashCode(); + super.addToEstimatorPCSA(v); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java new file mode 100644 index 0000000..703a921 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java @@ -0,0 +1,1377 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.udf.generic; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.util.StringUtils; + +/** + * GenericUDAFComputeStats + * + */ +@Description(name = "compute_stats", + value = "_FUNC_(x) - Returns the statistical summary of a set of primitive type values") +public class GenericUDAFComputeStats extends AbstractGenericUDAFResolver { + + static 
final Log LOG = LogFactory.getLog(GenericUDAFComputeStats.class.getName()); + + @Override + public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) + throws SemanticException { + if (parameters.length != 1) { + throw new UDFArgumentTypeException(parameters.length - 1, + "Exactly one argument is expected."); + } + + if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) { + throw new UDFArgumentTypeException(0, + "Only primitive type arguments are accepted but " + + parameters[0].getTypeName() + " is passed."); + } + switch (((PrimitiveTypeInfo) parameters[0]).getPrimitiveCategory()) { + case BOOLEAN: + return new GenericUDAFBooleanStatsEvaluator(); + case BYTE: + case SHORT: + case INT: + case LONG: + case TIMESTAMP: + return new GenericUDAFLongStatsEvaluator(); + case FLOAT: + case DOUBLE: + return new GenericUDAFDoubleStatsEvaluator(); + case STRING: + return new GenericUDAFStringStatsEvaluator(); + case BINARY: + return new GenericUDAFBinaryStatsEvaluator(); + default: + throw new UDFArgumentTypeException(0, + "Only integer/long/timestamp/float/double/string/binary/boolean type argument " + + "is accepted but " + + parameters[0].getTypeName() + " is passed."); + } + } + + /** + * GenericUDAFBooleanStatsEvaluator. + * + */ + public static class GenericUDAFBooleanStatsEvaluator extends GenericUDAFEvaluator { + + /* Object Inspector corresponding to the input parameter. + */ + PrimitiveObjectInspector inputOI; + + /* Partial aggregation result returned by TerminatePartial. Partial result is a struct + * containing a long field named "count". + */ + Object[] partialResult; + + /* Object Inspectors corresponding to the struct returned by TerminatePartial and the long + * field within the struct - "count" + */ + StructObjectInspector soi; + + StructField columnTypeField; + WritableStringObjectInspector columnTypeFieldOI; + + StructField countTruesField; + WritableLongObjectInspector countTruesFieldOI; + + StructField countFalsesField; + WritableLongObjectInspector countFalsesFieldOI; + + StructField countNullsField; + WritableLongObjectInspector countNullsFieldOI; + + /* Output of final result of the aggregation + */ + Object[] result; + + @Override + public ObjectInspector init(Mode m, ObjectInspector[] parameters) + throws HiveException { + assert (parameters.length == 1); + super.init(m, parameters); + + // initialize input + if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) { + inputOI = (PrimitiveObjectInspector) parameters[0]; + } else { + soi = (StructObjectInspector) parameters[0]; + + columnTypeField = soi.getStructFieldRef("ColumnType"); + columnTypeFieldOI = (WritableStringObjectInspector) + columnTypeField.getFieldObjectInspector(); + + countTruesField = soi.getStructFieldRef("CountTrues"); + countTruesFieldOI = (WritableLongObjectInspector) + countTruesField.getFieldObjectInspector(); + + countFalsesField = soi.getStructFieldRef("CountFalses"); + countFalsesFieldOI = (WritableLongObjectInspector) + countFalsesField.getFieldObjectInspector(); + + countNullsField = soi.getStructFieldRef("CountNulls"); + countNullsFieldOI = (WritableLongObjectInspector) countNullsField.getFieldObjectInspector(); + } + + // initialize output + List foi = new ArrayList(); + foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + + List fname = 
new ArrayList(); + fname.add("ColumnType"); + fname.add("CountTrues"); + fname.add("CountFalses"); + fname.add("CountNulls"); + + partialResult = new Object[4]; + partialResult[0] = new Text(); + partialResult[1] = new LongWritable(0); + partialResult[2] = new LongWritable(0); + partialResult[3] = new LongWritable(0); + + result = new Object[4]; + result[0] = new Text(); + result[1] = new LongWritable(0); + result[2] = new LongWritable(0); + result[3] = new LongWritable(0); + + return ObjectInspectorFactory.getStandardStructObjectInspector(fname, + foi); + } + + static class BooleanStatsAgg implements AggregationBuffer { + String columnType; /* Datatype of column */ + long countTrues; /* Count of number of true values seen so far */ + long countFalses; /* Count of number of false values seen so far */ + long countNulls; /* Count of number of null values seen so far */ + }; + + @Override + public AggregationBuffer getNewAggregationBuffer() throws HiveException { + BooleanStatsAgg result = new BooleanStatsAgg(); + reset(result); + return result; + } + + @Override + public void reset(AggregationBuffer agg) throws HiveException { + BooleanStatsAgg myagg = (BooleanStatsAgg) agg; + myagg.columnType = new String("Boolean"); + myagg.countTrues = 0; + myagg.countFalses = 0; + myagg.countNulls = 0; + } + + private void printDebugOutput(String functionName, AggregationBuffer agg) { + BooleanStatsAgg myagg = (BooleanStatsAgg) agg; + + LOG.debug(functionName); + + LOG.debug("Count of True Values:"); + LOG.debug(myagg.countTrues); + + LOG.debug("Count of False Values:"); + LOG.debug(myagg.countFalses); + + LOG.debug("Count of Null Values:"); + LOG.debug(myagg.countNulls); + } + + boolean warned = false; + + @Override + public void iterate(AggregationBuffer agg, Object[] parameters) + throws HiveException { + assert (parameters.length == 1); + Object p = parameters[0]; + BooleanStatsAgg myagg = (BooleanStatsAgg) agg; + if (p == null) { + myagg.countNulls++; + } + else { + try { + boolean v = PrimitiveObjectInspectorUtils.getBoolean(p, inputOI); + if (v == false) { + myagg.countFalses++; + } else if (v == true){ + myagg.countTrues++; + } + } catch (NumberFormatException e) { + if (!warned) { + warned = true; + LOG.warn(getClass().getSimpleName() + " " + + StringUtils.stringifyException(e)); + LOG.warn(getClass().getSimpleName() + + " ignoring similar exceptions."); + } + } + } + } + + @Override + public Object terminatePartial(AggregationBuffer agg) throws HiveException { + BooleanStatsAgg myagg = (BooleanStatsAgg) agg; + ((Text) partialResult[0]).set(myagg.columnType); + ((LongWritable) partialResult[1]).set(myagg.countTrues); + ((LongWritable) partialResult[2]).set(myagg.countFalses); + ((LongWritable) partialResult[3]).set(myagg.countNulls); + return partialResult; + } + + @Override + public void merge(AggregationBuffer agg, Object partial) + throws HiveException { + if (partial != null) { + BooleanStatsAgg myagg = (BooleanStatsAgg) agg; + + Object partialCount = soi.getStructFieldData(partial, countTruesField); + myagg.countTrues += countTruesFieldOI.get(partialCount); + + partialCount = soi.getStructFieldData(partial, countFalsesField); + myagg.countFalses += countFalsesFieldOI.get(partialCount); + + partialCount = soi.getStructFieldData(partial, countNullsField); + myagg.countNulls += countNullsFieldOI.get(partialCount); + } + } + + @Override + public Object terminate(AggregationBuffer agg) throws HiveException { + BooleanStatsAgg myagg = (BooleanStatsAgg) agg; + 
((Text)result[0]).set(myagg.columnType); + ((LongWritable)result[1]).set(myagg.countTrues); + ((LongWritable)result[2]).set(myagg.countFalses); + ((LongWritable)result[3]).set(myagg.countNulls); + return result; + } + } + + /** + * GenericUDAFLongStatsEvaluator. + * + */ + public static class GenericUDAFLongStatsEvaluator extends GenericUDAFEvaluator { + + /* Object Inspector corresponding to the input parameter. + */ + PrimitiveObjectInspector inputOI; + + /* Partial aggregation result returned by TerminatePartial. Partial result is a struct + * containing a long field named "count". + */ + Object[] partialResult; + + /* Object Inspectors corresponding to the struct returned by TerminatePartial and the long + * field within the struct - "count" + */ + StructObjectInspector soi; + + StructField columnType; + WritableStringObjectInspector columnTypeFieldOI; + + StructField minField; + WritableLongObjectInspector minFieldOI; + + StructField maxField; + WritableLongObjectInspector maxFieldOI; + + StructField countNullsField; + WritableLongObjectInspector countNullsFieldOI; + + StructField ndvField; + WritableStringObjectInspector ndvFieldOI; + + /* Output of final result of the aggregation + */ + Object[] result; + + @Override + public ObjectInspector init(Mode m, ObjectInspector[] parameters) + throws HiveException { + assert (parameters.length == 1); + super.init(m, parameters); + + // initialize input + if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) { + inputOI = (PrimitiveObjectInspector) parameters[0]; + } else { + soi = (StructObjectInspector) parameters[0]; + + columnType = soi.getStructFieldRef("ColumnType"); + columnTypeFieldOI = (WritableStringObjectInspector) columnType.getFieldObjectInspector(); + + minField = soi.getStructFieldRef("Min"); + minFieldOI = (WritableLongObjectInspector) minField.getFieldObjectInspector(); + + maxField = soi.getStructFieldRef("Max"); + maxFieldOI = (WritableLongObjectInspector) maxField.getFieldObjectInspector(); + + countNullsField = soi.getStructFieldRef("CountNulls"); + countNullsFieldOI = (WritableLongObjectInspector) countNullsField.getFieldObjectInspector(); + + ndvField = soi.getStructFieldRef("BitVector"); + ndvFieldOI = (WritableStringObjectInspector) ndvField.getFieldObjectInspector(); + } + + // initialize output + if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) { + List foi = new ArrayList(); + foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + + List fname = new ArrayList(); + fname.add("ColumnType"); + fname.add("Min"); + fname.add("Max"); + fname.add("CountNulls"); + fname.add("BitVector"); + + partialResult = new Object[5]; + partialResult[0] = new Text(); + partialResult[1] = new LongWritable(0); + partialResult[2] = new LongWritable(0); + partialResult[3] = new LongWritable(0); + partialResult[4] = new Text(); + + return ObjectInspectorFactory.getStandardStructObjectInspector(fname, + foi); + } else { + List foi = new ArrayList(); + foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + 
foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + + List fname = new ArrayList(); + fname.add("ColumnType"); + fname.add("Min"); + fname.add("Max"); + fname.add("CountNulls"); + fname.add("NumDistinctValues"); + + result = new Object[5]; + result[0] = new Text(); + result[1] = new LongWritable(0); + result[2] = new LongWritable(0); + result[3] = new LongWritable(0); + result[4] = new LongWritable(0); + + return ObjectInspectorFactory.getStandardStructObjectInspector(fname, + foi); + } + } + + static class LongStatsAgg implements AggregationBuffer { + String columnType; + long min; /* Minimum value seen so far */ + long max; /* Maximum value seen so far */ + long countNulls; /* Count of number of null values seen so far */ + LongNumDistinctValueEstimator numDV; /* Distinct value estimator */ + }; + + @Override + public AggregationBuffer getNewAggregationBuffer() throws HiveException { + LongStatsAgg result = new LongStatsAgg(); + result.numDV = new LongNumDistinctValueEstimator(); + reset(result); + return result; + } + + @Override + public void reset(AggregationBuffer agg) throws HiveException { + LongStatsAgg myagg = (LongStatsAgg) agg; + myagg.columnType = new String("Long"); + myagg.min = 0; + myagg.max = 0; + myagg.countNulls = 0; + myagg.numDV.reset(); + } + + boolean warned = false; + + private void printDebugOutput(String functionName, AggregationBuffer agg) { + LongStatsAgg myagg = (LongStatsAgg) agg; + + LOG.debug(functionName); + + LOG.debug("Max Value:"); + LOG.debug(myagg.max); + + LOG.debug("Min Value:"); + LOG.debug(myagg.min); + + LOG.debug("Count of Null Values:"); + LOG.debug(myagg.countNulls); + + myagg.numDV.printNumDistinctValueEstimator(); + } + + @Override + public void iterate(AggregationBuffer agg, Object[] parameters) + throws HiveException { + assert (parameters.length == 1); + + Object p = parameters[0]; + LongStatsAgg myagg = (LongStatsAgg) agg; + + //Update null counter if a null value is seen + if (p == null) { + myagg.countNulls++; + } + else { + try { + long v = PrimitiveObjectInspectorUtils.getLong(p, inputOI); + + //Update min counter if new value is less than min seen so far + if (v < myagg.min) { + myagg.min = v; + } + + //Update max counter if new value is greater than max seen so far + if (v > myagg.max) { + myagg.max = v; + } + + // Add value to NumDistinctValue Estimator + myagg.numDV.addToEstimator(v); + + } catch (NumberFormatException e) { + if (!warned) { + warned = true; + LOG.warn(getClass().getSimpleName() + " " + + StringUtils.stringifyException(e)); + LOG.warn(getClass().getSimpleName() + + " ignoring similar exceptions."); + } + } + } + } + + @Override + public Object terminatePartial(AggregationBuffer agg) throws HiveException { + LongStatsAgg myagg = (LongStatsAgg) agg; + + // Serialize numDistinctValue Estimator + Text t = myagg.numDV.serialize(); + + // Serialize rest of the field in the AggBuffer + ((Text) partialResult[0]).set(myagg.columnType); + ((LongWritable) partialResult[1]).set(myagg.min); + ((LongWritable) partialResult[2]).set(myagg.max); + ((LongWritable) partialResult[3]).set(myagg.countNulls); + ((Text) partialResult[4]).set(t); + + return partialResult; + } + + @Override + public void merge(AggregationBuffer agg, Object partial) + throws HiveException { + if (partial != null) { + LongStatsAgg myagg = (LongStatsAgg) agg; + + // Update min if min is lesser than the smallest value seen so far + Object partialValue = soi.getStructFieldData(partial, minField); + if (myagg.min > 
minFieldOI.get(partialValue)) { + myagg.min = minFieldOI.get(partialValue); + } + + // Update max if max is greater than the largest value seen so far + partialValue = soi.getStructFieldData(partial, maxField); + if (myagg.max < maxFieldOI.get(partialValue)) { + myagg.max = maxFieldOI.get(partialValue); + } + + // Update the null counter + partialValue = soi.getStructFieldData(partial, countNullsField); + myagg.countNulls += countNullsFieldOI.get(partialValue); + + // Merge numDistinctValue Estimators + partialValue = soi.getStructFieldData(partial, ndvField); + String v = ndvFieldOI.getPrimitiveJavaObject(partialValue); + NumDistinctValueEstimator o = new NumDistinctValueEstimator(v); + myagg.numDV.mergeEstimators(o); + } + } + + @Override + public Object terminate(AggregationBuffer agg) throws HiveException { + LongStatsAgg myagg = (LongStatsAgg) agg; + long numDV = myagg.numDV.estimateNumDistinctValues(); + + // Serialize the result struct + ((Text) result[0]).set(myagg.columnType); + ((LongWritable) result[1]).set(myagg.min); + ((LongWritable) result[2]).set(myagg.max); + ((LongWritable) result[3]).set(myagg.countNulls); + ((LongWritable) result[4]).set(numDV); + + return result; + } + } + + /** + * GenericUDAFDoubleStatsEvaluator. + * + */ + public static class GenericUDAFDoubleStatsEvaluator extends GenericUDAFEvaluator { + + /* Object Inspector corresponding to the input parameter. + */ + PrimitiveObjectInspector inputOI; + + /* Partial aggregation result returned by TerminatePartial. Partial result is a struct + * containing a long field named "count". + */ + Object[] partialResult; + + /* Object Inspectors corresponding to the struct returned by TerminatePartial and the long + * field within the struct - "count" + */ + StructObjectInspector soi; + + StructField columnTypeField; + WritableStringObjectInspector columnTypeFieldOI; + + StructField minField; + WritableDoubleObjectInspector minFieldOI; + + StructField maxField; + WritableDoubleObjectInspector maxFieldOI; + + StructField countNullsField; + WritableLongObjectInspector countNullsFieldOI; + + StructField ndvField; + WritableStringObjectInspector ndvFieldOI; + + /* Output of final result of the aggregation + */ + Object[] result; + + @Override + public ObjectInspector init(Mode m, ObjectInspector[] parameters) + throws HiveException { + assert (parameters.length == 1); + super.init(m, parameters); + + // initialize input + if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) { + inputOI = (PrimitiveObjectInspector) parameters[0]; + } else { + + soi = (StructObjectInspector) parameters[0]; + + columnTypeField = soi.getStructFieldRef("ColumnType"); + columnTypeFieldOI = (WritableStringObjectInspector) columnTypeField.getFieldObjectInspector(); + + minField = soi.getStructFieldRef("Min"); + minFieldOI = (WritableDoubleObjectInspector) minField.getFieldObjectInspector(); + + maxField = soi.getStructFieldRef("Max"); + maxFieldOI = (WritableDoubleObjectInspector) maxField.getFieldObjectInspector(); + + countNullsField = soi.getStructFieldRef("CountNulls"); + countNullsFieldOI = (WritableLongObjectInspector) countNullsField.getFieldObjectInspector(); + + ndvField = soi.getStructFieldRef("BitVector"); + ndvFieldOI = (WritableStringObjectInspector) ndvField.getFieldObjectInspector(); + } + + // initialize output + if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) { + List foi = new ArrayList(); + foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + 
foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + + List fname = new ArrayList(); + fname.add("ColumnType"); + fname.add("Min"); + fname.add("Max"); + fname.add("CountNulls"); + fname.add("BitVector"); + + partialResult = new Object[5]; + partialResult[0] = new Text(); + partialResult[1] = new DoubleWritable(0); + partialResult[2] = new DoubleWritable(0); + partialResult[3] = new LongWritable(0); + partialResult[4] = new Text(); + + return ObjectInspectorFactory.getStandardStructObjectInspector(fname, + foi); + } else { + List foi = new ArrayList(); + foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + + List fname = new ArrayList(); + fname.add("ColumnType"); + fname.add("Min"); + fname.add("Max"); + fname.add("CountNulls"); + fname.add("NumDistinctValues"); + + result = new Object[5]; + result[0] = new Text(); + result[1] = new DoubleWritable(0); + result[2] = new DoubleWritable(0); + result[3] = new LongWritable(0); + result[4] = new LongWritable(0); + + return ObjectInspectorFactory.getStandardStructObjectInspector(fname, + foi); + } + } + + static class DoubleStatsAgg implements AggregationBuffer { + String columnType; + double min; /* Minimum value seen so far */ + double max; /* Maximum value seen so far */ + long countNulls; /* Count of number of null values seen so far */ + DoubleNumDistinctValueEstimator numDV; /* Distinct value estimator */ + }; + + @Override + public AggregationBuffer getNewAggregationBuffer() throws HiveException { + DoubleStatsAgg result = new DoubleStatsAgg(); + result.numDV = new DoubleNumDistinctValueEstimator(); + reset(result); + return result; + } + + @Override + public void reset(AggregationBuffer agg) throws HiveException { + DoubleStatsAgg myagg = (DoubleStatsAgg) agg; + myagg.columnType = new String("Double"); + myagg.min = 0.0; + myagg.max = 0.0; + myagg.countNulls = 0; + myagg.numDV.reset(); + } + + boolean warned = false; + + private void printDebugOutput(String functionName, AggregationBuffer agg) { + DoubleStatsAgg myagg = (DoubleStatsAgg) agg; + + LOG.debug(functionName); + + LOG.debug("Max Value:"); + LOG.debug(myagg.max); + + LOG.debug("Min Value:"); + LOG.debug(myagg.min); + + LOG.debug("Count of Null Values:"); + LOG.debug(myagg.countNulls); + + myagg.numDV.printNumDistinctValueEstimator(); + } + + @Override + public void iterate(AggregationBuffer agg, Object[] parameters) + throws HiveException { + assert (parameters.length == 1); + + Object p = parameters[0]; + DoubleStatsAgg myagg = (DoubleStatsAgg) agg; + + // Update null counter if a null value is seen + if (p == null) { + myagg.countNulls++; + } + else { + try { + double v = PrimitiveObjectInspectorUtils.getDouble(p, inputOI); + + // Update min counter if new value is less than min seen so far + if (v < myagg.min) { + myagg.min = v; + } + + // Update max counter if new value is greater than max seen so far + if (v > myagg.max) { + myagg.max = v; + } + + // Add value to NumDistinctValue Estimator + myagg.numDV.addToEstimator(v); + + }
catch (NumberFormatException e) { + if (!warned) { + warned = true; + LOG.warn(getClass().getSimpleName() + " " + + StringUtils.stringifyException(e)); + LOG.warn(getClass().getSimpleName() + + " ignoring similar exceptions."); + } + } + } + } + + @Override + public Object terminatePartial(AggregationBuffer agg) throws HiveException { + DoubleStatsAgg myagg = (DoubleStatsAgg) agg; + + // Serialize numDistinctValue Estimator + Text t = myagg.numDV.serialize(); + + // Serialize the rest of the values in the AggBuffer + ((Text) partialResult[0]).set(myagg.columnType); + ((DoubleWritable) partialResult[1]).set(myagg.min); + ((DoubleWritable) partialResult[2]).set(myagg.max); + ((LongWritable) partialResult[3]).set(myagg.countNulls); + ((Text) partialResult[4]).set(t); + + return partialResult; + } + + @Override + public void merge(AggregationBuffer agg, Object partial) + throws HiveException { + if (partial != null) { + DoubleStatsAgg myagg = (DoubleStatsAgg) agg; + + // Update min if min is lesser than the smallest value seen so far + Object partialValue = soi.getStructFieldData(partial, minField); + if (myagg.min > minFieldOI.get(partialValue)) { + myagg.min = minFieldOI.get(partialValue); + } + + // Update max if max is greater than the largest value seen so far + partialValue = soi.getStructFieldData(partial, maxField); + if (myagg.max < maxFieldOI.get(partialValue)) { + myagg.max = maxFieldOI.get(partialValue); + } + + // Update the null counter + partialValue = soi.getStructFieldData(partial, countNullsField); + myagg.countNulls += countNullsFieldOI.get(partialValue); + + // Merge numDistinctValue Estimators + partialValue = soi.getStructFieldData(partial, ndvField); + String v = ndvFieldOI.getPrimitiveJavaObject(partialValue); + + NumDistinctValueEstimator o = new NumDistinctValueEstimator(v); + myagg.numDV.mergeEstimators(o); + } + } + + @Override + public Object terminate(AggregationBuffer agg) throws HiveException { + DoubleStatsAgg myagg = (DoubleStatsAgg) agg; + long numDV = myagg.numDV.estimateNumDistinctValues(); + + // Serialize the result struct + ((Text) result[0]).set(myagg.columnType); + ((DoubleWritable) result[1]).set(myagg.min); + ((DoubleWritable) result[2]).set(myagg.max); + ((LongWritable) result[3]).set(myagg.countNulls); + ((LongWritable) result[4]).set(numDV); + + return result; + } + } + + /** + * GenericUDAFStringStatsEvaluator. + * + */ + public static class GenericUDAFStringStatsEvaluator extends GenericUDAFEvaluator { + + /* Object Inspector corresponding to the input parameter. + */ + PrimitiveObjectInspector inputOI; + + /* Partial aggregation result returned by TerminatePartial. Partial result is a struct + * containing a long field named "count". 
+ */ + Object[] partialResult; + + /* Object Inspectors corresponding to the struct returned by TerminatePartial and the + * fields within the struct - "maxLength", "sumLength", "count", "countNulls", "ndv" + */ + StructObjectInspector soi; + + StructField columnTypeField; + WritableStringObjectInspector columnTypeFieldOI; + + StructField maxLengthField; + WritableLongObjectInspector maxLengthFieldOI; + + StructField sumLengthField; + WritableLongObjectInspector sumLengthFieldOI; + + StructField countField; + WritableLongObjectInspector countFieldOI; + + StructField countNullsField; + WritableLongObjectInspector countNullsFieldOI; + + StructField ndvField; + WritableStringObjectInspector ndvFieldOI; + + /* Output of final result of the aggregation + */ + Object[] result; + + @Override + public ObjectInspector init(Mode m, ObjectInspector[] parameters) + throws HiveException { + assert (parameters.length == 1); + super.init(m, parameters); + + // initialize input + if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) { + inputOI = (PrimitiveObjectInspector) parameters[0]; + } else { + soi = (StructObjectInspector) parameters[0]; + + columnTypeField = soi.getStructFieldRef("ColumnType"); + columnTypeFieldOI = (WritableStringObjectInspector) columnTypeField.getFieldObjectInspector(); + + maxLengthField = soi.getStructFieldRef("MaxLength"); + maxLengthFieldOI = (WritableLongObjectInspector) maxLengthField.getFieldObjectInspector(); + + sumLengthField = soi.getStructFieldRef("SumLength"); + sumLengthFieldOI = (WritableLongObjectInspector) sumLengthField.getFieldObjectInspector(); + + countField = soi.getStructFieldRef("Count"); + countFieldOI = (WritableLongObjectInspector) countField.getFieldObjectInspector(); + + countNullsField = soi.getStructFieldRef("CountNulls"); + countNullsFieldOI = (WritableLongObjectInspector) countNullsField.getFieldObjectInspector(); + + ndvField = soi.getStructFieldRef("BitVector"); + ndvFieldOI = (WritableStringObjectInspector) ndvField.getFieldObjectInspector(); + + } + + // initialize output + if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) { + List foi = new ArrayList(); + foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + + List fname = new ArrayList(); + fname.add("ColumnType"); + fname.add("MaxLength"); + fname.add("SumLength"); + fname.add("Count"); + fname.add("CountNulls"); + fname.add("BitVector"); + + partialResult = new Object[6]; + partialResult[0] = new Text(); + partialResult[1] = new LongWritable(0); + partialResult[2] = new LongWritable(0); + partialResult[3] = new LongWritable(0); + partialResult[4] = new LongWritable(0); + partialResult[5] = new Text(); + + return ObjectInspectorFactory.getStandardStructObjectInspector(fname, + foi); + } else { + List foi = new ArrayList(); + foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + + List fname = new ArrayList(); 
+ fname.add("ColumnType"); + fname.add("MaxLength"); + fname.add("AvgLength"); + fname.add("CountNulls"); + fname.add("NumDistinctValues"); + + result = new Object[5]; + result[0] = new Text(); + result[1] = new LongWritable(0); + result[2] = new DoubleWritable(0); + result[3] = new LongWritable(0); + result[4] = new LongWritable(0); + + return ObjectInspectorFactory.getStandardStructObjectInspector(fname, + foi); + } + } + + static class StringStatsAgg implements AggregationBuffer { + String columnType; + long maxLength; /* Maximum length seen so far */ + long sumLength; /* Sum of lengths of all values seen so far */ + long count; /* Count of all values seen so far */ + long countNulls; /* Count of number of null values seen so far */ + StringNumDistinctValueEstimator numDV; /* Distinct value estimator */ + }; + + @Override + public AggregationBuffer getNewAggregationBuffer() throws HiveException { + StringStatsAgg result = new StringStatsAgg(); + result.numDV = new StringNumDistinctValueEstimator(); + reset(result); + return result; + } + + @Override + public void reset(AggregationBuffer agg) throws HiveException { + StringStatsAgg myagg = (StringStatsAgg) agg; + myagg.columnType = new String("String"); + myagg.maxLength = 0; + myagg.sumLength = 0; + myagg.count = 0; + myagg.countNulls = 0; + myagg.numDV.reset(); + } + + boolean warned = false; + + private void printDebugOutput(String functionName, AggregationBuffer agg) { + StringStatsAgg myagg = (StringStatsAgg) agg; + + LOG.debug(functionName); + + LOG.debug("Max Length:"); + LOG.debug(myagg.maxLength); + + LOG.debug("Sum of Length:"); + LOG.debug(myagg.sumLength); + + LOG.debug("Count of non-Null Values:"); + LOG.debug(myagg.count); + + LOG.debug("Count of Null Values:"); + LOG.debug(myagg.countNulls); + + myagg.numDV.printNumDistinctValueEstimator(); + } + + @Override + public void iterate(AggregationBuffer agg, Object[] parameters) + throws HiveException { + assert (parameters.length == 1); + + Object p = parameters[0]; + StringStatsAgg myagg = (StringStatsAgg) agg; + + // Update null counter if a null value is seen + if (p == null) { + myagg.countNulls++; + } + else { + try { + String v = PrimitiveObjectInspectorUtils.getString(p, inputOI); + + // Update max length if new length is greater than the ones seen so far + int len = v.length(); + if (len > myagg.maxLength) { + myagg.maxLength = len; + } + + // Update sum length with the new length + myagg.sumLength += len; + + // Increment count of values seen so far + myagg.count++; + + // Add string value to NumDistinctValue Estimator + myagg.numDV.addToEstimator(v); + + } catch (NumberFormatException e) { + if (!warned) { + warned = true; + LOG.warn(getClass().getSimpleName() + " " + + StringUtils.stringifyException(e)); + LOG.warn(getClass().getSimpleName() + + " ignoring similar exceptions."); + } + } + } + } + + @Override + public Object terminatePartial(AggregationBuffer agg) throws HiveException { + StringStatsAgg myagg = (StringStatsAgg) agg; + + // Serialize numDistinctValue Estimator + Text t = myagg.numDV.serialize(); + + // Serialize the rest of the values in the AggBuffer + ((Text) partialResult[0]).set(myagg.columnType); + ((LongWritable) partialResult[1]).set(myagg.maxLength); + ((LongWritable) partialResult[2]).set(myagg.sumLength); + ((LongWritable) partialResult[3]).set(myagg.count); + ((LongWritable) partialResult[4]).set(myagg.countNulls); + ((Text) partialResult[5]).set(t); + + return partialResult; + } + + @Override + public void merge(AggregationBuffer agg, 
Object partial) + throws HiveException { + if (partial != null) { + StringStatsAgg myagg = (StringStatsAgg) agg; + + // Update maxLength if length is greater than the largest value seen so far + Object partialValue = soi.getStructFieldData(partial, maxLengthField); + if (myagg.maxLength < maxLengthFieldOI.get(partialValue)) { + myagg.maxLength = maxLengthFieldOI.get(partialValue); + } + + // Update sum of the length of the values seen so far + partialValue = soi.getStructFieldData(partial, sumLengthField); + myagg.sumLength += sumLengthFieldOI.get(partialValue); + + // Update the count of the number of values seen so far + partialValue = soi.getStructFieldData(partial, countField); + myagg.count += countFieldOI.get(partialValue); + + // Update the null counter + partialValue = soi.getStructFieldData(partial, countNullsField); + myagg.countNulls += countNullsFieldOI.get(partialValue); + + // Merge numDistinctValue Estimators + partialValue = soi.getStructFieldData(partial, ndvField); + String v = ndvFieldOI.getPrimitiveJavaObject(partialValue); + NumDistinctValueEstimator o = new NumDistinctValueEstimator(v); + myagg.numDV.mergeEstimators(o); + } + } + + @Override + public Object terminate(AggregationBuffer agg) throws HiveException { + StringStatsAgg myagg = (StringStatsAgg) agg; + long numDV = myagg.numDV.estimateNumDistinctValues(); + double avgLength = (double)(myagg.sumLength/(1.0 * (myagg.count + myagg.countNulls))); + + // Serialize the result struct + ((Text) result[0]).set(myagg.columnType); + ((LongWritable) result[1]).set(myagg.maxLength); + ((DoubleWritable) result[2]).set(avgLength); + ((LongWritable) result[3]).set(myagg.countNulls); + ((LongWritable) result[4]).set(numDV); + + return result; + } + } + + /** + * GenericUDAFBinaryStatsEvaluator. + * + */ + public static class GenericUDAFBinaryStatsEvaluator extends GenericUDAFEvaluator { + + /* Object Inspector corresponding to the input parameter. + */ + PrimitiveObjectInspector inputOI; + + /* Partial aggregation result returned by TerminatePartial. Partial result is a struct + * containing a long field named "count". 
+ */ + Object[] partialResult; + + /* Object Inspectors corresponding to the struct returned by TerminatePartial and the + * fields within the struct - "maxLength", "sumLength", "count", "countNulls" + */ + StructObjectInspector soi; + + StructField columnTypeField; + WritableStringObjectInspector columnTypeFieldOI; + + StructField maxLengthField; + WritableLongObjectInspector maxLengthFieldOI; + + StructField sumLengthField; + WritableLongObjectInspector sumLengthFieldOI; + + StructField countField; + WritableLongObjectInspector countFieldOI; + + StructField countNullsField; + WritableLongObjectInspector countNullsFieldOI; + + /* Output of final result of the aggregation + */ + Object[] result; + + @Override + public ObjectInspector init(Mode m, ObjectInspector[] parameters) + throws HiveException { + assert (parameters.length == 1); + super.init(m, parameters); + + // initialize input + if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) { + inputOI = (PrimitiveObjectInspector) parameters[0]; + } else { + soi = (StructObjectInspector) parameters[0]; + + columnTypeField = soi.getStructFieldRef("ColumnType"); + columnTypeFieldOI = (WritableStringObjectInspector) columnTypeField.getFieldObjectInspector(); + + maxLengthField = soi.getStructFieldRef("MaxLength"); + maxLengthFieldOI = (WritableLongObjectInspector) maxLengthField.getFieldObjectInspector(); + + sumLengthField = soi.getStructFieldRef("SumLength"); + sumLengthFieldOI = (WritableLongObjectInspector) sumLengthField.getFieldObjectInspector(); + + countField = soi.getStructFieldRef("Count"); + countFieldOI = (WritableLongObjectInspector) countField.getFieldObjectInspector(); + + countNullsField = soi.getStructFieldRef("CountNulls"); + countNullsFieldOI = (WritableLongObjectInspector) countNullsField.getFieldObjectInspector(); + + } + + // initialize output + if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) { + List foi = new ArrayList(); + foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + + List fname = new ArrayList(); + fname.add("ColumnType"); + fname.add("MaxLength"); + fname.add("SumLength"); + fname.add("Count"); + fname.add("CountNulls"); + + partialResult = new Object[5]; + partialResult[0] = new Text(); + partialResult[1] = new LongWritable(0); + partialResult[2] = new LongWritable(0); + partialResult[3] = new LongWritable(0); + partialResult[4] = new LongWritable(0); + + return ObjectInspectorFactory.getStandardStructObjectInspector(fname, + foi); + } else { + List foi = new ArrayList(); + foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + + List fname = new ArrayList(); + fname.add("ColumnType"); + fname.add("MaxLength"); + fname.add("AvgLength"); + fname.add("CountNulls"); + + result = new Object[4]; + result[0] = new Text(); + result[1] = new LongWritable(0); + result[2] = new DoubleWritable(0); + result[3] = new LongWritable(0); + + return ObjectInspectorFactory.getStandardStructObjectInspector(fname, + foi); + } + } + + static class BinaryStatsAgg implements 
AggregationBuffer { + String columnType; + long maxLength; /* Maximum length seen so far */ + long sumLength; /* Sum of lengths of all values seen so far */ + long count; /* Count of all values seen so far */ + long countNulls; /* Count of number of null values seen so far */ + }; + + @Override + public AggregationBuffer getNewAggregationBuffer() throws HiveException { + BinaryStatsAgg result = new BinaryStatsAgg(); + reset(result); + return result; + } + + @Override + public void reset(AggregationBuffer agg) throws HiveException { + BinaryStatsAgg myagg = (BinaryStatsAgg) agg; + myagg.columnType = new String("Binary"); + myagg.maxLength = 0; + myagg.sumLength = 0; + myagg.count = 0; + myagg.countNulls = 0; + } + + boolean warned = false; + + @Override + public void iterate(AggregationBuffer agg, Object[] parameters) + throws HiveException { + assert (parameters.length == 1); + + Object p = parameters[0]; + BinaryStatsAgg myagg = (BinaryStatsAgg) agg; + + // Update null counter if a null value is seen + if (p == null) { + myagg.countNulls++; + } + else { + try { + BytesWritable v = PrimitiveObjectInspectorUtils.getBinary(p, inputOI); + + // Update max length if new length is greater than the ones seen so far + int len = v.getLength(); + if (len > myagg.maxLength) { + myagg.maxLength = len; + } + + // Update sum length with the new length + myagg.sumLength += len; + + // Increment count of values seen so far + myagg.count++; + + } catch (NumberFormatException e) { + if (!warned) { + warned = true; + LOG.warn(getClass().getSimpleName() + " " + + StringUtils.stringifyException(e)); + LOG.warn(getClass().getSimpleName() + + " ignoring similar exceptions."); + } + } + } + } + + private void printDebugOutput(String functionName, AggregationBuffer agg) { + BinaryStatsAgg myagg = (BinaryStatsAgg) agg; + + LOG.debug(functionName); + + LOG.debug("Max Length:"); + LOG.debug(myagg.maxLength); + + LOG.debug("Sum of Length:"); + LOG.debug(myagg.sumLength); + + LOG.debug("Count of non-Null Values:"); + LOG.debug(myagg.count); + + LOG.debug("Count of Null Values:"); + LOG.debug(myagg.countNulls); + } + + @Override + public Object terminatePartial(AggregationBuffer agg) throws HiveException { + BinaryStatsAgg myagg = (BinaryStatsAgg) agg; + + // Serialize AggBuffer + ((Text) partialResult[0]).set(myagg.columnType); + ((LongWritable) partialResult[1]).set(myagg.maxLength); + ((LongWritable) partialResult[2]).set(myagg.sumLength); + ((LongWritable) partialResult[3]).set(myagg.count); + ((LongWritable) partialResult[4]).set(myagg.countNulls); + + return partialResult; + } + + @Override + public void merge(AggregationBuffer agg, Object partial) + throws HiveException { + if (partial != null) { + BinaryStatsAgg myagg = (BinaryStatsAgg) agg; + + // Update maxLength if length is greater than the largest value seen so far + Object partialValue = soi.getStructFieldData(partial, maxLengthField); + if (myagg.maxLength < maxLengthFieldOI.get(partialValue)) { + myagg.maxLength = maxLengthFieldOI.get(partialValue); + } + + // Update sum of the length of the values seen so far + partialValue = soi.getStructFieldData(partial, sumLengthField); + myagg.sumLength += sumLengthFieldOI.get(partialValue); + + // Update the count of the number of values seen so far + partialValue = soi.getStructFieldData(partial, countField); + myagg.count += countFieldOI.get(partialValue); + + // Update the null counter + partialValue = soi.getStructFieldData(partial, countNullsField); + myagg.countNulls += countNullsFieldOI.get(partialValue); + 
} + } + + @Override + public Object terminate(AggregationBuffer agg) throws HiveException { + BinaryStatsAgg myagg = (BinaryStatsAgg) agg; + double avgLength = (double)(myagg.sumLength/(1.0 * (myagg.count + myagg.countNulls))); + + // Serialize the result struct + ((Text) result[0]).set(myagg.columnType); + ((LongWritable) result[1]).set(myagg.maxLength); + ((DoubleWritable) result[2]).set(avgLength); + ((LongWritable) result[3]).set(myagg.countNulls); + + return result; + } + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/LongNumDistinctValueEstimator.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/LongNumDistinctValueEstimator.java new file mode 100644 index 0000000..6311079 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/LongNumDistinctValueEstimator.java @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.udf.generic; + +public class LongNumDistinctValueEstimator extends NumDistinctValueEstimator { + + public LongNumDistinctValueEstimator() { + super(); + } + + public LongNumDistinctValueEstimator(String s) { + super(s); + } + + @Override + public void addToEstimator(long v) { + /* Update summary bitVector : + * Generate hash value of the long value and mod it by 2^bitVectorSize-1. + * In this implementation bitVectorSize is 31. + */ + super.addToEstimator(v); + } + + @Override + public void addToEstimatorPCSA(long v) { + super.addToEstimatorPCSA(v); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumDistinctValueEstimator.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumDistinctValueEstimator.java new file mode 100644 index 0000000..dda82f5 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumDistinctValueEstimator.java @@ -0,0 +1,326 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.udf.generic; +import java.util.Random; + +import javolution.util.FastBitSet; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.Text; + +public class NumDistinctValueEstimator { + + static final Log LOG = LogFactory.getLog(NumDistinctValueEstimator.class.getName()); + + private final int bitVectorSize = 32; + //TODO: make this a config param - take in accuracy percentage + private final int numBitVectors = 64; + + // Refer to Flajolet-Martin'86 for the value of phi + private final double phi = 0.77351; + + private int[] a; + private int[] b; + private FastBitSet[] bitVector = new FastBitSet[numBitVectors]; + + private Random aValue; + private Random bValue; + + /* Create a new distinctValueEstimator + * + */ + public NumDistinctValueEstimator() { + bitVector = new FastBitSet[numBitVectors]; + + for (int i=0; i< numBitVectors; i++) { + bitVector[i] = new FastBitSet(bitVectorSize); + } + + a = new int[numBitVectors]; + b = new int[numBitVectors]; + + aValue = new Random(79798); + bValue = new Random(34115); + + for (int i = 0; i < numBitVectors; i++) { + int randVal; + /* a and b shouldn't be even; If a and b are even, then none of the values + * will set bit 0 thus introducing errors in the estimate. Both a and b can be even + * 25% of the times and as a result 25% of the bit vectors could be inaccurate. To avoid this + * always pick odd values for a and b. + */ + do { + randVal = aValue.nextInt(); + } while (randVal % 2 == 0); + + a[i] = randVal; + + do { + randVal = bValue.nextInt(); + } while (randVal % 2 == 0); + + b[i] = randVal; + + if (a[i] < 0) { + a[i] = a[i] + (1 << (bitVectorSize -1)); + } + + if (b[i] < 0) { + b[i] = b[i] + (1 << (bitVectorSize -1)); + } + } + } + + public NumDistinctValueEstimator(String s) { + FastBitSet b[] = deserialize(s); + bitVector = new FastBitSet[numBitVectors]; + for(int i=0; i = '0' && c <= '9') { + String t = new String(); + t = t + c; + c = s.charAt(i); + i = i + 1; + + while (c != ',' && c!= '}') { + t = t + c; + c = s.charAt(i); + i = i + 1; + } + + int bitIndex = Integer.parseInt(t); + assert(bitIndex >= 0); + assert(vectorIndex < numBitVectors); + b[vectorIndex].set(bitIndex); + if (c == '}') { + vectorIndex = vectorIndex + 1; + } + } + } + return b; + } + + private int generateHash(long v, int hashNum) { + int mod = 1 << (bitVectorSize - 1) - 1; + long tempHash = a[hashNum] * v + b[hashNum]; + tempHash %= mod; + int hash = (int) tempHash; + + /* Hash function should map the long value to 0...2^L-1. + * Hence hash value has to be non-negative. + */ + if (hash < 0) { + hash = hash + mod + 1; + } + return hash; + } + + private int generateHashForPCSA(long v) { + int mod = 1 << (bitVectorSize - 1) - 1; + long tempHash = a[0] * v + b[0]; + tempHash %= mod; + int hash = (int) tempHash; + + /* Hash function should map the long value to 0...2^L-1. + * Hence hash value has to be non-negative. + */ + if (hash < 0) { + hash = hash + mod + 1; + } + return hash; + } + + public void addToEstimator(long v) { + /* Update summary bitVector : + * Generate hash value of the long value and mod it by 2^bitVectorSize-1. + * In this implementation bitVectorSize is 31. 
+ */ + + for (int i = 0; i < numBitVectors; i++) { + int hash = generateHash(v, i); + int index; + + // Find the index of the least significant bit that is 1 + for (index = 0; index < bitVectorSize; index++) { + if (hash % 2 != 0) { + break; + } + hash = hash >> 1; + } + + // Set bitvector[index] := 1 + bitVector[i].set(index); + } + } + + public void addToEstimatorPCSA(long v) { + int hash = generateHashForPCSA(v); + int rho = hash/numBitVectors; + int index; + + // Find the index of the least significant bit that is 1 + for (index = 0; index < bitVectorSize; index++) { + if (rho % 2 != 0) { + break; + } + rho = rho >> 1; + } + + // Set bitvector[index] := 1 + bitVector[hash%numBitVectors].set(index); + } + + public void mergeEstimators(NumDistinctValueEstimator o) { + // Bitwise OR the bitvector with the bitvector in the agg buffer + for (int i=0; i Map Operator Tree: + employee_part + TableScan + alias: employee_part + Select Operator + expressions: + expr: employeeid + type: int + outputColumnNames: employeeid + Group By Operator + aggregations: + expr: compute_stats(employeeid) + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: struct + Reduce Operator Tree: + Group By Operator + aggregations: + expr: compute_stats(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: struct + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Column Stats Work + + +PREHOOK: query: analyze table Employee_Part partition (employeeSalary='2000') compute statistics for columns employeeID +PREHOOK: type: QUERY +PREHOOK: Input: default@employee_part@employeesalary=2000 +#### A masked pattern was here #### +POSTHOOK: query: analyze table Employee_Part partition (employeeSalary='2000') compute statistics for columns employeeID +POSTHOOK: type: QUERY +POSTHOOK: Input: default@employee_part@employeesalary=2000 +#### A masked pattern was here #### +PREHOOK: query: explain +analyze table Employee_Part partition (employeeSalary='4000') compute statistics for columns employeeName, employeeID +PREHOOK: type: QUERY +POSTHOOK: query: explain +analyze table Employee_Part partition (employeeSalary='4000') compute statistics for columns employeeName, employeeID +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_ANALYZE (TOK_TAB (TOK_TABNAME Employee_Part) (TOK_PARTSPEC (TOK_PARTVAL employeeSalary '4000'))) (TOK_TABCOLNAME employeeName employeeID)) + +STAGE DEPENDENCIES: + Stage-0 is a root stage + Stage-1 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Map Reduce + Alias -> Map Operator Tree: + employee_part + TableScan + alias: employee_part + Select Operator + expressions: + expr: employeename + type: string + expr: employeeid + type: int + outputColumnNames: employeename, employeeid + Group By Operator + aggregations: + expr: compute_stats(employeename) + expr: compute_stats(employeeid) + bucketGroup: false + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: struct + expr: _col1 + type: struct + Reduce Operator Tree: + Group By Operator + aggregations: + expr: compute_stats(VALUE._col0) + expr: compute_stats(VALUE._col1) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: struct + expr: _col1 + type: struct + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output
format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Column Stats Work + + +PREHOOK: query: analyze table Employee_Part partition (employeeSalary='4000') compute statistics for columns employeeName, employeeID +PREHOOK: type: QUERY +PREHOOK: Input: default@employee_part@employeesalary=4000 +#### A masked pattern was here #### +POSTHOOK: query: analyze table Employee_Part partition (employeeSalary='4000') compute statistics for columns employeeName, employeeID +POSTHOOK: type: QUERY +POSTHOOK: Input: default@employee_part@employeesalary=4000 +#### A masked pattern was here #### diff --git ql/src/test/results/clientpositive/columnstats_tbllvl.q.out ql/src/test/results/clientpositive/columnstats_tbllvl.q.out new file mode 100644 index 0000000..e2afe6b --- /dev/null +++ ql/src/test/results/clientpositive/columnstats_tbllvl.q.out @@ -0,0 +1,119 @@ +PREHOOK: query: DROP TABLE IF EXISTS UserVisits_web_text_none +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS UserVisits_web_text_none +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE UserVisits_web_text_none ( + sourceIP string, + destURL string, + visitDate string, + adRevenue float, + userAgent string, + cCode string, + lCode string, + sKeyword string, + avgTimeOnSite int) +row format delimited fields terminated by '|' stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE UserVisits_web_text_none ( + sourceIP string, + destURL string, + visitDate string, + adRevenue float, + userAgent string, + cCode string, + lCode string, + sKeyword string, + avgTimeOnSite int) +row format delimited fields terminated by '|' stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@UserVisits_web_text_none +PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/UserVisits.dat" INTO TABLE UserVisits_web_text_none +PREHOOK: type: LOAD +PREHOOK: Output: default@uservisits_web_text_none +POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/UserVisits.dat" INTO TABLE UserVisits_web_text_none +POSTHOOK: type: LOAD +POSTHOOK: Output: default@uservisits_web_text_none +PREHOOK: query: explain +analyze table UserVisits_web_text_none compute statistics for columns sourceIP, avgTimeOnSite, adRevenue +PREHOOK: type: QUERY +POSTHOOK: query: explain +analyze table UserVisits_web_text_none compute statistics for columns sourceIP, avgTimeOnSite, adRevenue +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_ANALYZE (TOK_TAB (TOK_TABNAME UserVisits_web_text_none)) (TOK_TABCOLNAME sourceIP avgTimeOnSite adRevenue)) + +STAGE DEPENDENCIES: + Stage-0 is a root stage + Stage-1 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Map Reduce + Alias -> Map Operator Tree: + uservisits_web_text_none + TableScan + alias: uservisits_web_text_none + Select Operator + expressions: + expr: sourceip + type: string + expr: avgtimeonsite + type: int + expr: adrevenue + type: float + outputColumnNames: sourceip, avgtimeonsite, adrevenue + Group By Operator + aggregations: + expr: compute_stats(sourceip) + expr: compute_stats(avgtimeonsite) + expr: compute_stats(adrevenue) + bucketGroup: false + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: struct + expr: _col1 + type: struct + expr: _col2 + type: struct + Reduce Operator Tree: + Group By Operator + aggregations: + expr: compute_stats(VALUE._col0) + expr: compute_stats(VALUE._col1) + expr: compute_stats(VALUE._col2) + bucketGroup: false + mode: 
mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: struct + expr: _col1 + type: struct + expr: _col2 + type: struct + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Column Stats Work + + +PREHOOK: query: analyze table UserVisits_web_text_none compute statistics for columns sourceIP, avgTimeOnSite, adRevenue +PREHOOK: type: QUERY +PREHOOK: Input: default@uservisits_web_text_none +#### A masked pattern was here #### +POSTHOOK: query: analyze table UserVisits_web_text_none compute statistics for columns sourceIP, avgTimeOnSite, adRevenue +POSTHOOK: type: QUERY +POSTHOOK: Input: default@uservisits_web_text_none +#### A masked pattern was here #### diff --git ql/src/test/results/clientpositive/compute_stats_binary.q.out ql/src/test/results/clientpositive/compute_stats_binary.q.out new file mode 100644 index 0000000..4eec218 --- /dev/null +++ ql/src/test/results/clientpositive/compute_stats_binary.q.out @@ -0,0 +1,33 @@ +PREHOOK: query: create table tab_binary(a binary) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table tab_binary(a binary) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tab_binary +PREHOOK: query: -- insert some data +LOAD DATA LOCAL INPATH "../data/files/binary.txt" INTO TABLE tab_binary +PREHOOK: type: LOAD +PREHOOK: Output: default@tab_binary +POSTHOOK: query: -- insert some data +LOAD DATA LOCAL INPATH "../data/files/binary.txt" INTO TABLE tab_binary +POSTHOOK: type: LOAD +POSTHOOK: Output: default@tab_binary +PREHOOK: query: select count(*) from tab_binary +PREHOOK: type: QUERY +PREHOOK: Input: default@tab_binary +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from tab_binary +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab_binary +#### A masked pattern was here #### +10 +PREHOOK: query: -- compute statistical summary of data +select compute_stats(a) from tab_binary +PREHOOK: type: QUERY +PREHOOK: Input: default@tab_binary +#### A masked pattern was here #### +POSTHOOK: query: -- compute statistical summary of data +select compute_stats(a) from tab_binary +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab_binary +#### A masked pattern was here #### +{"columntype":"Binary","maxlength":36,"avglength":20.0,"countnulls":0} diff --git ql/src/test/results/clientpositive/compute_stats_boolean.q.out ql/src/test/results/clientpositive/compute_stats_boolean.q.out new file mode 100644 index 0000000..0a7f111 --- /dev/null +++ ql/src/test/results/clientpositive/compute_stats_boolean.q.out @@ -0,0 +1,33 @@ +PREHOOK: query: create table tab_bool(a boolean) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table tab_bool(a boolean) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tab_bool +PREHOOK: query: -- insert some data +LOAD DATA LOCAL INPATH "../data/files/bool.txt" INTO TABLE tab_bool +PREHOOK: type: LOAD +PREHOOK: Output: default@tab_bool +POSTHOOK: query: -- insert some data +LOAD DATA LOCAL INPATH "../data/files/bool.txt" INTO TABLE tab_bool +POSTHOOK: type: LOAD +POSTHOOK: Output: default@tab_bool +PREHOOK: query: select count(*) from tab_bool +PREHOOK: type: QUERY +PREHOOK: Input: default@tab_bool +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from tab_bool +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab_bool +#### A masked 
pattern was here #### +33 +PREHOOK: query: -- compute statistical summary of data +select compute_stats(a) from tab_bool +PREHOOK: type: QUERY +PREHOOK: Input: default@tab_bool +#### A masked pattern was here #### +POSTHOOK: query: -- compute statistical summary of data +select compute_stats(a) from tab_bool +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab_bool +#### A masked pattern was here #### +{"columntype":"Boolean","counttrues":13,"countfalses":19,"countnulls":1} diff --git ql/src/test/results/clientpositive/compute_stats_double.q.out ql/src/test/results/clientpositive/compute_stats_double.q.out new file mode 100644 index 0000000..6d9837a --- /dev/null +++ ql/src/test/results/clientpositive/compute_stats_double.q.out @@ -0,0 +1,33 @@ +PREHOOK: query: create table tab_double(a double) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table tab_double(a double) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tab_double +PREHOOK: query: -- insert some data +LOAD DATA LOCAL INPATH "../data/files/double.txt" INTO TABLE tab_double +PREHOOK: type: LOAD +PREHOOK: Output: default@tab_double +POSTHOOK: query: -- insert some data +LOAD DATA LOCAL INPATH "../data/files/double.txt" INTO TABLE tab_double +POSTHOOK: type: LOAD +POSTHOOK: Output: default@tab_double +PREHOOK: query: select count(*) from tab_double +PREHOOK: type: QUERY +PREHOOK: Input: default@tab_double +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from tab_double +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab_double +#### A masked pattern was here #### +16 +PREHOOK: query: -- compute statistical summary of data +select compute_stats(a) from tab_double +PREHOOK: type: QUERY +PREHOOK: Input: default@tab_double +#### A masked pattern was here #### +POSTHOOK: query: -- compute statistical summary of data +select compute_stats(a) from tab_double +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab_double +#### A masked pattern was here #### +{"columntype":"Double","min":-87.2,"max":435.33,"countnulls":2,"numdistinctvalues":9} diff --git ql/src/test/results/clientpositive/compute_stats_long.q.out ql/src/test/results/clientpositive/compute_stats_long.q.out new file mode 100644 index 0000000..0668115 --- /dev/null +++ ql/src/test/results/clientpositive/compute_stats_long.q.out @@ -0,0 +1,33 @@ +PREHOOK: query: create table tab_int(a int) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table tab_int(a int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tab_int +PREHOOK: query: -- insert some data +LOAD DATA LOCAL INPATH "../data/files/int.txt" INTO TABLE tab_int +PREHOOK: type: LOAD +PREHOOK: Output: default@tab_int +POSTHOOK: query: -- insert some data +LOAD DATA LOCAL INPATH "../data/files/int.txt" INTO TABLE tab_int +POSTHOOK: type: LOAD +POSTHOOK: Output: default@tab_int +PREHOOK: query: select count(*) from tab_int +PREHOOK: type: QUERY +PREHOOK: Input: default@tab_int +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from tab_int +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab_int +#### A masked pattern was here #### +12 +PREHOOK: query: -- compute statistical summary of data +select compute_stats(a) from tab_int +PREHOOK: type: QUERY +PREHOOK: Input: default@tab_int +#### A masked pattern was here #### +POSTHOOK: query: -- compute statistical summary of data +select compute_stats(a) from tab_int +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab_int +#### A masked pattern was here #### 
+{"columntype":"Long","min":0,"max":344,"countnulls":1,"numdistinctvalues":14} diff --git ql/src/test/results/clientpositive/compute_stats_string.q.out ql/src/test/results/clientpositive/compute_stats_string.q.out new file mode 100644 index 0000000..e6cf3cb --- /dev/null +++ ql/src/test/results/clientpositive/compute_stats_string.q.out @@ -0,0 +1,33 @@ +PREHOOK: query: create table tab_string(a string) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table tab_string(a string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tab_string +PREHOOK: query: -- insert some data +LOAD DATA LOCAL INPATH "../data/files/string.txt" INTO TABLE tab_string +PREHOOK: type: LOAD +PREHOOK: Output: default@tab_string +POSTHOOK: query: -- insert some data +LOAD DATA LOCAL INPATH "../data/files/string.txt" INTO TABLE tab_string +POSTHOOK: type: LOAD +POSTHOOK: Output: default@tab_string +PREHOOK: query: select count(*) from tab_string +PREHOOK: type: QUERY +PREHOOK: Input: default@tab_string +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from tab_string +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab_string +#### A masked pattern was here #### +10 +PREHOOK: query: -- compute statistical summary of data +select compute_stats(a) from tab_string +PREHOOK: type: QUERY +PREHOOK: Input: default@tab_string +#### A masked pattern was here #### +POSTHOOK: query: -- compute statistical summary of data +select compute_stats(a) from tab_string +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab_string +#### A masked pattern was here #### +{"columntype":"String","maxlength":11,"avglength":3.9,"countnulls":0,"numdistinctvalues":5} diff --git ql/src/test/results/clientpositive/show_functions.q.out ql/src/test/results/clientpositive/show_functions.q.out index 02f6a94..7f45981 100644 --- ql/src/test/results/clientpositive/show_functions.q.out +++ ql/src/test/results/clientpositive/show_functions.q.out @@ -36,6 +36,7 @@ ceil ceiling coalesce collect_set +compute_stats concat concat_ws context_ngrams @@ -182,6 +183,7 @@ ceil ceiling coalesce collect_set +compute_stats concat concat_ws context_ngrams diff --git ql/src/test/results/clientpositive/udaf_histogram.q.out ql/src/test/results/clientpositive/udaf_histogram.q.out new file mode 100644 index 0000000..8b13789 --- /dev/null +++ ql/src/test/results/clientpositive/udaf_histogram.q.out @@ -0,0 +1 @@ + diff --git serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java index 5430814..b049294 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java +++ serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java @@ -665,7 +665,7 @@ public final class PrimitiveObjectInspectorUtils { /** * Get the String value out of a primitive object. Note that * NullPointerException will be thrown if o is null. Note that - * NumberFormatException will be thrown if o is not a valid number. + * RuntimeException will be thrown if o is not a valid string. */ public static String getString(Object o, PrimitiveObjectInspector oi) {