diff --git beeline/src/java/org/apache/hive/beeline/HiveSchemaTool.java beeline/src/java/org/apache/hive/beeline/HiveSchemaTool.java
index 74591ac..7e5fee3 100644
--- beeline/src/java/org/apache/hive/beeline/HiveSchemaTool.java
+++ beeline/src/java/org/apache/hive/beeline/HiveSchemaTool.java
@@ -190,6 +190,36 @@ boolean validateLocations(Connection conn, URI[] defaultServers) throws HiveMeta
     return rtn;
   }
 
+  /**
+   * Validate the statistics stored in the TAB_COL_STATS table.
+   * Each column should have at most one row of statistics; the data is invalid otherwise.
+   */
+  boolean validateColumnStatistics(Connection conn) throws HiveMetaException {
+    System.out.println("Validating Column Statistics");
+    boolean rtn = true;
+    try {
+      Statement stmt = conn.createStatement();
+      String query = needsQuotedIdentifier ?
+          ("select \"DB_NAME\", \"TABLE_NAME\", \"COLUMN_NAME\", count(*) from \"TAB_COL_STATS\" group by \"DB_NAME\", \"TABLE_NAME\", \"COLUMN_NAME\" having count(*) > 1 ")
+          : ("select DB_NAME, TABLE_NAME, COLUMN_NAME, count(*) from TAB_COL_STATS group by DB_NAME, TABLE_NAME, COLUMN_NAME having count(*) > 1 ");
+
+      ResultSet res = stmt.executeQuery(query);
+      while (res.next()) {
+        String dbName = res.getString("DB_NAME");
+        String tableName = res.getString("TABLE_NAME");
+        String columnName = res.getString("COLUMN_NAME");
+        int statCount = res.getInt(4);
+        rtn = false;
+        System.err.println("TAB_COL_STATS should contain only ONE row for DB Name=" + dbName + ", Table Name=" + tableName + ", Column Name=" + columnName + ", count=" + statCount);
+      }
+
+      System.out.println((rtn ? "Succeeded" : "Failed") + " in column statistics validation for max rows per column.");
+    } catch (SQLException e) {
+      throw new HiveMetaException("Failed to validate column statistics for max number of rows ", e);
+    }
+    return rtn;
+  }
+
   private String getNameOrID(ResultSet res, int nameInx, int idInx) throws SQLException {
     String itemName = res.getString(nameInx);
     return (itemName == null || itemName.isEmpty()) ? "ID: " + res.getString(idInx) : "Name: " + itemName;
@@ -621,6 +651,11 @@ public void doValidate() throws HiveMetaException {
      } else {
        System.out.println("[WARN]\n");
      }
+      if (validateColumnStatistics(conn)) {
+        System.out.println("[SUCCESS]\n");
+      } else {
+        System.out.println("[WARN]\n");
+      }
    } finally {
      if (conn != null) {
        try {
diff --git itests/hive-unit/src/test/java/org/apache/hive/beeline/TestSchemaTool.java itests/hive-unit/src/test/java/org/apache/hive/beeline/TestSchemaTool.java
index 9f08693..3cf4b4d 100644
--- itests/hive-unit/src/test/java/org/apache/hive/beeline/TestSchemaTool.java
+++ itests/hive-unit/src/test/java/org/apache/hive/beeline/TestSchemaTool.java
@@ -155,6 +155,43 @@ public void testValidateSchemaTables() throws Exception {
     assertTrue(isValid);
   }
 
+  /**
+   * Test that there is only a single set of column statistics for a given column in the HMS TAB_COL_STATS table.
+   * More than a single row per column is invalid, and validation must fail.
+   * @throws Exception
+   */
+  public void testValidateColumnStatistics() throws Exception {
+    schemaTool.doInit();
+    boolean isValid = schemaTool.validateColumnStatistics(conn);
+    assertTrue(isValid);
+
+    // insert some good data
+    String[] scripts = new String[] {
+        "insert into DBS values(2, 'my db', 'hdfs://myhost.com:8020/user/hive/warehouse/mydb', 'mydb', 'public', 'role')",
+        "insert into SDS(SD_ID,CD_ID,INPUT_FORMAT,IS_COMPRESSED,IS_STOREDASSUBDIRECTORIES,LOCATION,NUM_BUCKETS,OUTPUT_FORMAT,SERDE_ID) values (1,null,'org.apache.hadoop.mapred.TextInputFormat','N','N','hdfs://myhost.com:8020/user/hive/warehouse/mydb',-1,'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',null)",
+        "insert into TBLS(TBL_ID,CREATE_TIME,DB_ID,LAST_ACCESS_TIME,OWNER,RETENTION,SD_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT,VIEW_ORIGINAL_TEXT,IS_REWRITE_ENABLED) values (2,1435255431,2,0,'hive',0,1,'mytable','MANAGED_TABLE',NULL,NULL,'n')",
+        "insert into TAB_COL_STATS(CS_ID,DB_NAME,TABLE_NAME,COLUMN_NAME,COLUMN_TYPE,TBL_ID,LONG_LOW_VALUE,LONG_HIGH_VALUE,NUM_NULLS,LAST_ANALYZED) values (1, 'mydb', 'mytable', 'col1', 'int', 2, 11111, 111111, 0, 1513187965)",
+        "insert into TAB_COL_STATS(CS_ID,DB_NAME,TABLE_NAME,COLUMN_NAME,COLUMN_TYPE,TBL_ID,LONG_LOW_VALUE,LONG_HIGH_VALUE,NUM_NULLS,LAST_ANALYZED) values (2, 'mydb', 'mytable', 'col2', 'int', 2, 22222, 222222, 0, 1513187965)",
+        "insert into TAB_COL_STATS(CS_ID,DB_NAME,TABLE_NAME,COLUMN_NAME,COLUMN_TYPE,TBL_ID,DOUBLE_HIGH_VALUE,DOUBLE_LOW_VALUE,NUM_NULLS,LAST_ANALYZED) values (3, 'mydb', 'mytable', 'col3', 'float', 2, 33333.33, 333333.33, 0, 1513187965)"
+    };
+
+    File scriptFile = generateTestScript(scripts);
+    schemaTool.runBeeLine(scriptFile.getPath());
+    isValid = schemaTool.validateColumnStatistics(conn);
+    assertTrue(isValid);
+
+    scripts = new String[] {
+        "insert into TAB_COL_STATS(CS_ID,DB_NAME,TABLE_NAME,COLUMN_NAME,COLUMN_TYPE,TBL_ID,LONG_LOW_VALUE,LONG_HIGH_VALUE,NUM_NULLS,LAST_ANALYZED) values (4, 'mydb', 'mytable', 'col1', 'int', 2, 44444, 444444, 0, 1513187965)",
+        "insert into TAB_COL_STATS(CS_ID,DB_NAME,TABLE_NAME,COLUMN_NAME,COLUMN_TYPE,TBL_ID,LONG_LOW_VALUE,LONG_HIGH_VALUE,NUM_NULLS,LAST_ANALYZED) values (5, 'mydb', 'mytable', 'col2', 'int', 2, 55555, 555555, 0, 1513187965)",
+        "insert into TAB_COL_STATS(CS_ID,DB_NAME,TABLE_NAME,COLUMN_NAME,COLUMN_TYPE,TBL_ID,DOUBLE_HIGH_VALUE,DOUBLE_LOW_VALUE,NUM_NULLS,LAST_ANALYZED) values (6, 'mydb', 'mytable', 'col3', 'float', 2, 66666.66, 666666.66, 0, 1513187965)"
+    };
+
+    scriptFile = generateTestScript(scripts);
+    schemaTool.runBeeLine(scriptFile.getPath());
+    isValid = schemaTool.validateColumnStatistics(conn);
+    assertFalse(isValid);
+  }
+
   /*
    * Test the validation of incorrect NULL values in the tables
    * @throws Exception