Index: contrib/src/java/org/apache/hadoop/hive/metastore/hooks/AuditMetaStoreEventListener.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/metastore/hooks/AuditMetaStoreEventListener.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/metastore/hooks/AuditMetaStoreEventListener.java (working copy) @@ -0,0 +1,301 @@ +package org.apache.hadoop.hive.metastore.hooks; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.commons.logging.Log; + +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.MetaStoreEventListener; +import org.apache.hadoop.hive.metastore.HiveMetaStore.HMSHandler; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.events.AddPartitionEvent; +import org.apache.hadoop.hive.metastore.events.AlterPartitionEvent; +import org.apache.hadoop.hive.metastore.events.AlterTableEvent; +import org.apache.hadoop.hive.metastore.events.CreateDatabaseEvent; +import org.apache.hadoop.hive.metastore.events.CreateTableEvent; +import org.apache.hadoop.hive.metastore.events.DropDatabaseEvent; +import org.apache.hadoop.hive.metastore.events.DropPartitionEvent; +import org.apache.hadoop.hive.metastore.events.DropTableEvent; +import org.apache.hadoop.hive.metastore.events.ListenerEvent; +import org.apache.hadoop.hive.metastore.events.LoadPartitionDoneEvent; +import org.apache.hadoop.hive.ql.hooks.BaseReplicationHook; +import org.apache.hadoop.hive.ql.hooks.ConnectionUrlFactory; +import org.apache.hadoop.hive.ql.hooks.HookUtils; +import org.apache.hadoop.hive.ql.hooks.ReplicationHook; +import org.apache.hadoop.hive.ql.hooks.ReadEntity; +import org.apache.hadoop.hive.ql.hooks.WriteEntity; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.thrift.TBase; +import org.apache.thrift.TException; +import org.apache.thrift.TSerializer; +import org.apache.thrift.protocol.TSimpleJSONProtocol; +import org.json.JSONException; +import org.json.JSONObject; + +/* + * MetaStoreEventListener that logs metastore operations to the audit log. + * The operations that this listener logs are only those from the thrift server, + * and not the CLI, because ReplicationHook currently logs queries from the CLI. 
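+ *
+ * Typically a listener like this is registered on the metastore side, for example by adding the
+ * class name to hive.metastore.event.listeners (assumed standard metastore configuration; the
+ * exact property may differ per deployment).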
+ */ +public class AuditMetaStoreEventListener extends MetaStoreEventListener { + public static final Log LOG = LogFactory.getLog(AuditMetaStoreEventListener.class); + + private static final String COMMAND_TYPE = "METASTORE_API"; + private static final String COMMAND_NAME = "name"; + private static final String ADD_PARTITION_COMMAND = "ADD_PARTITION"; + private static final String ALTER_PARTITION_COMMAND = "ALTER_PARTITION"; + private static final String ALTER_TABLE_COMMAND = "ALTER_TABLE"; + private static final String CREATE_TABLE_COMMAND = "CREATE_TABLE"; + private static final String DROP_PARTITION_COMMAND = "DROP_PARTITION"; + private static final String DROP_DATABASE_COMMAND = "DROP_DATABASE"; + private static final String DROP_TABLE_COMMAND = "DROP_TABLE"; + private static final String NEW_TABLE = "new_table"; + private static final String OLD_TABLE = "old_table"; + private static final String NEW_PARTITION = "new_partition"; + private static final String OLD_PARTITION = "old_partition"; + + private final TSerializer jsonSerializer; + + + protected ConnectionUrlFactory urlFactory = null; + + public AuditMetaStoreEventListener(Configuration config) throws Exception{ + super(config); + urlFactory = BaseReplicationHook.getReplicationMySqlUrl(); + jsonSerializer = new TSerializer(new TSimpleJSONProtocol.Factory()); + } + + private void insertToDB(Set inputs, + Set outputs, ListenerEvent event, String command) throws MetaException { + HiveConf conf = event.getHandler().getHiveConf(); + //if HIVEQUERYID is set, then this command came from a CLI + // (and will execute posthooks). We don't want to log such a command + if (conf.getVar(HiveConf.ConfVars.HIVEQUERYID) == null || + conf.getVar(HiveConf.ConfVars.HIVEQUERYID).isEmpty()) { + try { + ArrayList sqlParams = new ArrayList(); + sqlParams.add(command); + sqlParams.add(StringEscapeUtils.escapeJava(ReplicationHook.entitiesToString(inputs))); + sqlParams.add(ReplicationHook.entitiesToString(outputs)); + sqlParams.add(COMMAND_TYPE); + + // Assertion at beginning of method guarantees this string will remain empty + String sql = "insert into snc1_command_log set command = ?, inputs = ?, outputs = ?, command_type = ?"; + + String ipAddress = HMSHandler.getIpAddress(); + if (ipAddress != null) { + if (ipAddress.startsWith("/")) { + ipAddress = ipAddress.replaceFirst("/", ""); + } + sql += ", user_info = ?"; + sqlParams.add(ipAddress); + } + + HookUtils.runInsert(conf, urlFactory, sql, sqlParams, HookUtils + .getSqlNumRetry(conf)); + } catch (Exception e) { + throw new MetaException(e.getMessage()); + } + } + } + + private Table getTableFromPart(Partition p, ListenerEvent event) throws MetaException { + try { + return event.getHandler().get_table(p.getDbName(), p.getTableName()); + } catch (Exception e) { + throw new MetaException(e.getMessage()); + } + } + + private org.apache.hadoop.hive.ql.metadata.Table getQlTable(Table t) { + return new org.apache.hadoop.hive.ql.metadata.Table(t); + } + + private org.apache.hadoop.hive.ql.metadata.Partition getQlPartition(Table t, Partition p) + throws MetaException{ + try { + org.apache.hadoop.hive.ql.metadata.Table qlTable = getQlTable(t); + return new org.apache.hadoop.hive.ql.metadata.Partition(qlTable, p); + } catch (HiveException e) { + throw new MetaException(e.getMessage()); + } + } + + private ReadEntity getPartitionInput(Partition p, ListenerEvent event) throws MetaException { + Table mTable = getTableFromPart(p, event); + ReadEntity input = new ReadEntity(getQlPartition(mTable, p)); + return 
input; + } + + private WriteEntity getPartitionOutput(Partition p, ListenerEvent event) throws MetaException { + try { + Table mTable = event.getHandler().get_table(p.getDbName(), p.getTableName()); + WriteEntity output = new WriteEntity(getQlPartition(mTable, p)); + return output; + } catch (Exception e) { + throw new MetaException(e.getMessage()); + } + } + + private void logNoSuccess() { + LOG.info("ListenerEvent success is false"); + } + + private void addCommandNameToCommand(JSONObject command, String name) { + try { + command.put(COMMAND_NAME, name); + } catch (JSONException e) { + LOG.error("Could not add command name to JSON object", e); + } + } + + private void addTBaseToCommand(JSONObject command, TBase object, String objectName) { + try { + command.put(objectName, new JSONObject(jsonSerializer.toString(object))); + } catch (JSONException e) { + LOG.error("Could not add " + objectName + " to JSON object", e); + } catch (TException e) { + LOG.error("Could not serialize " + objectName + " to JSON", e); + } + } + + @Override + public void onAddPartition(AddPartitionEvent event) throws MetaException { + if (!event.getStatus()) { + logNoSuccess(); + return; + } + + Set inputs = new HashSet(); + Set outputs = new HashSet(); + JSONObject command = new JSONObject(); + inputs.add(new ReadEntity(getQlTable(getTableFromPart(event.getPartition(), event)))); + outputs.add(getPartitionOutput(event.getPartition(), event)); + addCommandNameToCommand(command, ADD_PARTITION_COMMAND); + + insertToDB(inputs, outputs, event, command.toString()); + } + + @Override + public void onAlterPartition(AlterPartitionEvent event) throws MetaException { + if (!event.getStatus()) { + logNoSuccess(); + return; + } + + Set inputs = new HashSet(); + Set outputs = new HashSet(); + JSONObject command = new JSONObject(); + inputs.add(getPartitionInput(event.getOldPartition(), event)); + outputs.add(getPartitionOutput(event.getNewPartition(), event)); + addCommandNameToCommand(command, ALTER_PARTITION_COMMAND); + addTBaseToCommand(command, event.getOldPartition(), OLD_PARTITION); + addTBaseToCommand(command, event.getNewPartition(), NEW_PARTITION); + + insertToDB(inputs, outputs, event, command.toString()); + } + + @Override + public void onDropPartition(DropPartitionEvent event) throws MetaException { + if (!event.getStatus()) { + logNoSuccess(); + return; + } + + Set inputs = new HashSet(); + Set outputs = new HashSet(); + JSONObject command = new JSONObject(); + inputs.add(new ReadEntity(getQlTable(getTableFromPart(event.getPartition(), event)))); + outputs.add(getPartitionOutput(event.getPartition(), event)); + addCommandNameToCommand(command, DROP_PARTITION_COMMAND); + + insertToDB(inputs, outputs, event, command.toString()); + } + + @Override + /* + * Currently, on the create database CLI command, nothing gets logged. 
+ */ + public void onCreateDatabase(CreateDatabaseEvent event) throws MetaException { + } + + @Override + public void onDropDatabase(DropDatabaseEvent event) throws MetaException { + if (!event.getStatus()) { + logNoSuccess(); + return; + } + + Set inputs = new HashSet(); + Set outputs = new HashSet(); + JSONObject command = new JSONObject(); + addCommandNameToCommand(command, DROP_DATABASE_COMMAND); + + insertToDB(inputs, outputs, event, command.toString()); + } + + @Override + public void onCreateTable(CreateTableEvent event) throws MetaException { + if (!event.getStatus()) { + logNoSuccess(); + return; + } + + Set inputs = new HashSet(); + Set outputs = new HashSet(); + JSONObject command = new JSONObject(); + outputs.add(new WriteEntity(getQlTable(event.getTable()))); + addCommandNameToCommand(command, CREATE_TABLE_COMMAND); + + insertToDB(inputs, outputs, event, command.toString()); + } + + @Override + public void onDropTable(DropTableEvent event) throws MetaException { + if (!event.getStatus()) { + logNoSuccess(); + return; + } + + Set inputs = new HashSet(); + Set outputs = new HashSet(); + JSONObject command = new JSONObject(); + inputs.add(new ReadEntity(getQlTable(event.getTable()))); + outputs.add(new WriteEntity(getQlTable(event.getTable()))); + addCommandNameToCommand(command, DROP_TABLE_COMMAND); + + insertToDB(inputs, outputs, event, command.toString()); + } + + @Override + public void onAlterTable(AlterTableEvent event) throws MetaException { + if (!event.getStatus()) { + logNoSuccess(); + return; + } + + Set inputs = new HashSet(); + Set outputs = new HashSet(); + JSONObject command = new JSONObject(); + inputs.add(new ReadEntity(getQlTable(event.getOldTable()))); + outputs.add(new WriteEntity(getQlTable(event.getOldTable()))); + outputs.add(new WriteEntity(getQlTable(event.getNewTable()))); + addCommandNameToCommand(command, ALTER_TABLE_COMMAND); + addTBaseToCommand(command, event.getOldTable(), OLD_TABLE); + addTBaseToCommand(command, event.getNewTable(), NEW_TABLE); + + insertToDB(inputs, outputs, event, command.toString()); + } + + @Override + public void onLoadPartitionDone(LoadPartitionDoneEvent lpe) throws MetaException { + } + +} Index: contrib/src/java/org/apache/hadoop/hive/metastore/hooks/CounterMetaStoreEndFunctionListener.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/metastore/hooks/CounterMetaStoreEndFunctionListener.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/metastore/hooks/CounterMetaStoreEndFunctionListener.java (working copy) @@ -0,0 +1,60 @@ +package org.apache.hadoop.hive.metastore.hooks; + +import java.util.AbstractMap; +import java.util.Map.Entry; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.MetaStoreEndFunctionContext; +import org.apache.hadoop.hive.metastore.MetaStoreEndFunctionListener; +import org.apache.hadoop.hive.ql.hooks.HookUtils; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; + +/* + * MetaStoreEndFunctionListener that uses the StatsManager to collect fb303 counters for + * the number of successes and failures for each metastore thrift function, bucketed by time. 
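+ *
+ * For example, a successful get_table call increments a counter named "get_table", while a failed
+ * call increments "get_table.failure" (see onEndFunction below).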
+ */ +public class CounterMetaStoreEndFunctionListener extends MetaStoreEndFunctionListener { + + StatsManager stats = null; + + public CounterMetaStoreEndFunctionListener(Configuration config) { + super(config); + String statsMgr = config.get(FBHiveConf.METASTORE_LISTENER_STATS_MANAGER); + if ((statsMgr == null) || (statsMgr.isEmpty())) { + return; + } + + stats = HookUtils.getObject(config, statsMgr); + } + + @Override + public void onEndFunction(String functionName, MetaStoreEndFunctionContext context) { + if (stats == null) { + return; + } + + // Construct the counter name, as for success + // and for failure + String statName = functionName + (context.isSuccess() ? "" : ".failure"); + + // If this is the first time this counter name has been seen, initialize it + if (!stats.containsKey(statName)) { + stats.addCountStatType(statName); + } + + stats.addStatValue(statName, 1); + } + + @Override + public void exportCounters(AbstractMap counters) { + if (stats == null) { + return; + } + + // For each counter the StatsManager has collected, add it to the map of fb303 counters + for (Entry entry : stats.getCounters().entrySet()) { + counters.put(entry.getKey(), entry.getValue()); + } + } + +} Index: contrib/src/java/org/apache/hadoop/hive/metastore/hooks/FbhiveAlterHandler.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/metastore/hooks/FbhiveAlterHandler.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/metastore/hooks/FbhiveAlterHandler.java (working copy) @@ -0,0 +1,65 @@ +package org.apache.hadoop.hive.metastore.hooks; + +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.metastore.HiveAlterHandler; +import org.apache.hadoop.hive.metastore.RawStore; +import org.apache.hadoop.hive.metastore.Warehouse; +import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; +import org.apache.hadoop.hive.metastore.api.InvalidObjectException; +import org.apache.hadoop.hive.metastore.api.InvalidOperationException; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.ql.hooks.HookUtils; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; + +/* + * Subclass of HiveAlterHandler. Checks that if the table, or partition's table has a + * creation_cluster set, that cluster matches the current cluster where the metastore is running. + * If so, or if the table or partition's table does not have creation_cluster set, it calls its + * super classes implementation of the alter method, i.e. it behaves normally. If not, it throws + * a MetaException. 
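+ *
+ * For example (illustrative cluster names): a table whose creation_cluster parameter is "silver"
+ * can only be altered by a metastore whose fbhive.package.name is also "silver"; alter calls from
+ * any other cluster fail with a MetaException.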
+ */ +public class FbhiveAlterHandler extends HiveAlterHandler { + public static final Log LOG = LogFactory.getLog(FbhiveAlterHandler.class); + + @Override + public Partition alterPartition(RawStore ms, Warehouse wh, String dbname, + String name, List part_vals, Partition new_part) + throws InvalidOperationException, InvalidObjectException, + AlreadyExistsException, MetaException { + + String exception = "Partition in table " + name + " cannot be altered."; + checkTableCluster(ms, dbname, name, exception); + + return super.alterPartition(ms, wh, dbname, name, part_vals, new_part); + } + + @Override + public void alterTable(RawStore msdb, Warehouse wh, String dbname, + String name, Table newt) throws InvalidOperationException, MetaException { + + String exception = "Table " + name + " cannot be altered."; + checkTableCluster(msdb, dbname, name, exception); + + super.alterTable(msdb, wh, dbname, name, newt); + } + + private void checkTableCluster(RawStore msdb, String dbName, String tableName, + String exception) throws MetaException{ + + Table oldt = msdb.getTable(dbName.toLowerCase(), tableName.toLowerCase()); + if (oldt != null) { + String creationCluster = oldt.getParameters().get(HookUtils.TABLE_CREATION_CLUSTER); + String currentCluster = hiveConf.get(FBHiveConf.FB_CURRENT_CLUSTER); + if (creationCluster != null && currentCluster != null && !creationCluster.equals(currentCluster)) { + throw new MetaException(exception + + " Table's cluster is " + creationCluster + "," + + " whereas current package is " + currentCluster); + } + } + } +} Index: contrib/src/java/org/apache/hadoop/hive/metastore/hooks/MysqlSmcHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/metastore/hooks/MysqlSmcHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/metastore/hooks/MysqlSmcHook.java (working copy) @@ -0,0 +1,52 @@ +package org.apache.hadoop.hive.metastore.hooks; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.hooks.ConnectionUrlFactory; +import org.apache.hadoop.hive.ql.hooks.HookUtils; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; + +public class MysqlSmcHook implements JDOConnectionURLHook { + + static final private Log LOG = + LogFactory.getLog("hive.metastore.hooks.MysqlSmcHook"); + + ConnectionUrlFactory urlFactory = null; + + @Override + public String getJdoConnectionUrl(Configuration conf) + throws Exception { + + String smcUrl = conf.get(FBHiveConf.METASTORE_SMC_URL); + if (smcUrl == null) { + throw new Exception(FBHiveConf.METASTORE_SMC_URL + " is not defined"); + } + String mysqlTier = conf.get(FBHiveConf.METASTORE_MYSQL_TIER_VAR_NAME); + if (mysqlTier == null) { + throw new Exception(FBHiveConf.METASTORE_MYSQL_TIER_VAR_NAME + " is not defined"); + } + String mysqlProps = conf.get(FBHiveConf.METASTORE_MYSQL_PROPS); + if (mysqlProps == null) { + throw new Exception(FBHiveConf.METASTORE_MYSQL_PROPS + " is not defined"); + } + if (urlFactory == null) { + urlFactory = HookUtils.getUrlFactory( + conf, + FBHiveConf.CONNECTION_FACTORY, + null, + FBHiveConf.METASTORE_MYSQL_TIER_VAR_NAME, + null, + FBHiveConf.METASTORE_MYSQL_PROPS); + } + + urlFactory.updateProps(smcUrl, mysqlTier, mysqlProps); + return urlFactory.getUrl(); + } + + @Override + public void notifyBadConnectionUrl(String url) { + LOG.error("Notified of a bad URL: " + url); + } + +} Index: 
contrib/src/java/org/apache/hadoop/hive/metastore/hooks/StatsManager.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/metastore/hooks/StatsManager.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/metastore/hooks/StatsManager.java (working copy) @@ -0,0 +1,11 @@ +package org.apache.hadoop.hive.metastore.hooks; + +import java.util.Map; + + +public interface StatsManager { + public boolean containsKey(String statName); + public void addCountStatType(String statName); + public void addStatValue(String statName, int value); + public Map getCounters(); +} \ No newline at end of file Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/FifoPoolHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/FifoPoolHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/FifoPoolHook.java (working copy) @@ -0,0 +1,136 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.QueryPlan; +import org.apache.hadoop.hive.ql.exec.DDLTask; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.hooks.HookUtils.InputInfo; +import org.apache.hadoop.hive.ql.session.SessionState; + +/** + * The implementation of the hook that based on the input size of the query + * submits big jobs into a fifo pool. + */ +public class FifoPoolHook { + + static final private Log LOG = LogFactory.getLog(FifoPoolHook.class.getName()); + static private boolean fifoed = false; + + static final private String failure = "FifoHook failure: "; + + public static class PreExec implements ExecuteWithHookContext { + @Override + public void run(HookContext hookContext) throws Exception { + assert(hookContext.getHookType() == HookContext.HookType.PRE_EXEC_HOOK); + SessionState sess = SessionState.get(); + Set inputs = hookContext.getInputs(); + Map inputToCS = hookContext.getInputPathToContentSummary(); + + QueryPlan queryPlan = hookContext.getQueryPlan(); + List> rootTasks = queryPlan.getRootTasks(); + + // If it is a pure DDL task, + if (rootTasks == null) { + return; + } + if (rootTasks.size() == 1) { + Task tsk = rootTasks.get(0); + if (tsk instanceof DDLTask) { + return; + } + } + + HiveConf conf = sess.getConf(); + + // In case posthook of the previous query was not triggered, + // we revert job tracker to clean state first. 
+ if (fifoed) { + conf.set("mapred.fairscheduler.pool", ""); + fifoed = false; + } + // if the pool is specified already - bailout + String poolValue = conf.get("mapred.fairscheduler.pool", null); + if ((poolValue != null) && !poolValue.isEmpty()){ + return; + } + + // if we are set on local mode execution (via user or auto) bail + if ("local".equals(conf.getVar(HiveConf.ConfVars.HADOOPJT))) { + return; + } + + // check if we need to run at all + if (!conf.getBoolean("fbhive.fifopool.auto", false)) { + return; + } + + long maxGigaBytes = conf.getLong("fbhive.fifopool.GigaBytes", 0L); + if (maxGigaBytes == 0) { + LOG.info (failure + "fifopool.GigaBytes = 0"); + return; + } + + long maxBytes = maxGigaBytes * 1024 * 1024 * 1024L; + + if (maxGigaBytes < 0) { + LOG.warn (failure + "fifopool.GigaBytes value of " + maxGigaBytes + + "is invalid"); + return; + } + + // Get the size of the input + Map pathToTopPercentage = new HashMap(); + Set nonSampledInputs = new HashSet(); + boolean isThereSampling = HookUtils.checkForSamplingTasks( + hookContext.getQueryPlan().getRootTasks(), + pathToTopPercentage, nonSampledInputs); + + InputInfo info = HookUtils.getInputInfo(inputs, inputToCS, conf, + isThereSampling, pathToTopPercentage, nonSampledInputs, + Long.MAX_VALUE, maxBytes); + + if (info.getSize() > maxBytes) { + LOG.info ("Submitting to the fifo pool since the input length of " + + info.getSize() + " is more than " + maxBytes); + } else { + LOG.info("Not submitting to the fifo pool since the input length " + + info.getSize() + " is less than " + maxBytes); + return; + } + + // The job meets at least one of the requirements to be submitted into the + // fifo pool + String fifoPool = conf.get("fbhive.fifopool.name", "fifo"); + fifoed = true; + conf.set("mapred.fairscheduler.pool", fifoPool); + } + } + + public static class PostExec implements ExecuteWithHookContext { + @Override + public void run(HookContext hookContext) throws Exception { + assert(hookContext.getHookType() == HookContext.HookType.POST_EXEC_HOOK); + SessionState ss = SessionState.get(); + this.run(ss); + } + + public void run(SessionState sess) throws Exception { + HiveConf conf = sess.getConf(); + + if (fifoed) { + conf.set("mapred.fairscheduler.pool", ""); + fifoed = false; + } + } + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/CreateTableChangeDFSHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/CreateTableChangeDFSHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/CreateTableChangeDFSHook.java (working copy) @@ -0,0 +1,96 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.io.Serializable; +import java.util.ArrayList; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.Warehouse; +import org.apache.hadoop.hive.ql.QueryPlan; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.plan.CreateTableDesc; +import org.apache.hadoop.hive.ql.plan.CreateTableLikeDesc; +import org.apache.hadoop.hive.ql.plan.DDLWork; + +/** + * Implementation of a pre execute hook that is used to change + * the location of the DFS. + * This is only applicable to new tables - this can be used to + * eventually spread the load evenly on more than 1 DFS. 
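+ *
+ * The rewrite only applies when the CREATE TABLE (or CREATE TABLE LIKE) statement has no explicit
+ * LOCATION and the table name starts with "tmp_" or "temp_"; such tables are placed under the
+ * warehouse directory named by fbhive.secondary.metastore.warehouse.dir. It is assumed to be
+ * registered as a pre-execution hook (e.g. via hive.exec.pre.hooks).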
+ */ +public class CreateTableChangeDFSHook implements ExecuteWithHookContext { + static final private Log LOG = LogFactory.getLog(CreateTableChangeDFSHook.class.getName()); + + public void run(HookContext hookContext) throws Exception { + assert(hookContext.getHookType() == HookContext.HookType.PRE_EXEC_HOOK); + + QueryPlan queryPlan = hookContext.getQueryPlan(); + + // This change is only needed when a new table is being created + ArrayList> rootTasks = queryPlan.getRootTasks(); + + if ((rootTasks == null) || (rootTasks.size() != 1)) { + return; + } + + Task tsk = rootTasks.get(0); + + if (!(tsk.getWork() instanceof DDLWork)) { + return; + } + + HiveConf conf = hookContext.getConf(); + DDLWork ddlWork = (DDLWork)tsk.getWork(); + + float pubPercent = conf.getFloat(FBHiveConf.ENABLE_PARTIAL_CHANGEDFS, 0); + + // if pubPercent == 0, make sure it returns. + if (!HookUtils.rollDice(pubPercent)) { + return; + } + + String newDir = conf.get(FBHiveConf.SECONDARYMETASTOREWAREHOUSE); + + if (ddlWork.getCreateTblDesc() != null) { + CreateTableDesc crtTblDesc = ddlWork.getCreateTblDesc(); + // The user has already specified the location + if (crtTblDesc.getLocation() != null) { + return; + } + + // This is only for tmp tables right now + if ((crtTblDesc.getTableName() == null) || + ((!crtTblDesc.getTableName().startsWith("tmp_")) && + (!crtTblDesc.getTableName().startsWith("temp_")))) { + return; + } + + String locn = (new Warehouse(conf)).getTablePath(newDir, crtTblDesc.getTableName()).toString(); + crtTblDesc.setLocation(locn); + LOG.info("change location for table " + crtTblDesc.getTableName()); + return; + } + + if (ddlWork.getCreateTblLikeDesc() != null) { + CreateTableLikeDesc crtTblLikeDesc = ddlWork.getCreateTblLikeDesc(); + // The user has already specified the location + if (crtTblLikeDesc.getLocation() != null) { + return; + } + + // This is only for tmp tables right now + if ((crtTblLikeDesc.getTableName() == null) || + ((!crtTblLikeDesc.getTableName().startsWith("tmp_")) && + (!crtTblLikeDesc.getTableName().startsWith("temp_")))) { + return; + } + + String locn = (new Warehouse(conf)).getTablePath(newDir, crtTblLikeDesc.getTableName()).toString(); + crtTblLikeDesc.setLocation(locn); + LOG.info("change location for table " + crtTblLikeDesc.getTableName()); + return; + } + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/ConfUrlFactory.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/ConfUrlFactory.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/ConfUrlFactory.java (working copy) @@ -0,0 +1,60 @@ +package org.apache.hadoop.hive.ql.hooks; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.conf.Configuration; + +/** + * + * This factory creates the connection URL from the supplied configuration. 
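+ * The named configuration variable is expected to hold a "host:database" pair, which getUrl()
+ * turns into "jdbc:mysql://host/database".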
+ * + */ + +public class ConfUrlFactory implements ConnectionUrlFactory { + + HiveConf conf = null; + String confVarName = null; + public ConfUrlFactory() { + this(new HiveConf(ConfUrlFactory.class), ""); + } + + public ConfUrlFactory(String confVarName) { + this(new HiveConf(ConfUrlFactory.class), confVarName); + } + + public ConfUrlFactory(HiveConf conf, String confVarName) { + this.conf = conf; + this.confVarName = confVarName; + } + + public boolean init(Configuration hconf) { + this.conf = (HiveConf)hconf; + return true; + } + + @Override + public void init(String param1Name, String param2Name) { + this.confVarName = param1Name; + } + + @Override + public String getUrl() throws Exception { + String dbstr = conf.get(confVarName); + String[] hostDatabases = dbstr.split(":"); + return "jdbc:mysql://" + hostDatabases[0] + "/" + hostDatabases[1]; + } + + @Override + public String getUrl(boolean isWrite) throws Exception { + return getUrl(); + } + + @Override + public String getValue(String param1, String param2) throws Exception { + return null; + } + + @Override + public void updateProps(String param1, String param2, String param3) { + return; + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/QueryPlanHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/QueryPlanHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/QueryPlanHook.java (working copy) @@ -0,0 +1,92 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.ExplainTask; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.plan.ExplainWork; +import org.apache.hadoop.hive.ql.plan.HiveOperation; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.json.JSONObject; + +/** + * A hook which populates the query_plan_log MySQL table with + * the query plan for the query. The query plan is recorded as a json string. + * the stats through it as well. 
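+ *
+ * Each row stores the query id (hive.query.id) together with the JSON-serialized EXPLAIN output
+ * of the query's root tasks (see run() below).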
+ */ +public class QueryPlanHook implements ExecuteWithHookContext { + + private ConnectionUrlFactory urlFactory = null; + private HiveConf conf; + + public static ConnectionUrlFactory getQueryPlanUrlFactory(HiveConf conf) { + return HookUtils.getUrlFactory( + conf, + FBHiveConf.CONNECTION_FACTORY, + FBHiveConf.QUERYPLAN_CONNECTION_FACTORY, + FBHiveConf.QUERYPLAN_MYSQL_TIER_VAR_NAME, + FBHiveConf.QUERYPLAN_HOST_DATABASE_VAR_NAME); + } + + public QueryPlanHook() throws Exception { + conf = new HiveConf(QueryPlanHook.class); + } + + @Override + public void run(HookContext hookContext) throws Exception { + + assert(hookContext.getHookType() == HookContext.HookType.POST_EXEC_HOOK); + + String queryId = ""; + SessionState sess = SessionState.get(); + + if (sess != null) { + conf = sess.getConf(); + queryId = conf.getVar(HiveConf.ConfVars.HIVEQUERYID); + HiveOperation op = sess.getHiveOperation(); + + // No need to log for DDLs + if ((op == null) || + ((!op.equals(HiveOperation.CREATETABLE_AS_SELECT)) && + (!op.equals(HiveOperation.CREATEVIEW)) && + (!op.equals(HiveOperation.LOAD)) && + (!op.equals(HiveOperation.QUERY)))) { + return; + } + } + // QueryId not present - nothing to do + else { + return; + } + + // Get the list of root tasks + List> rootTasks = hookContext.getQueryPlan().getRootTasks(); + if ((rootTasks == null) || (rootTasks.isEmpty())) { + return; + } + + ExplainWork explainWork = new ExplainWork(null, rootTasks, null, false, true); + JSONObject queryPlan = ExplainTask.getJSONPlan(null, explainWork); + + List sqlParams = new ArrayList(); + sqlParams.add(StringEscapeUtils.escapeJava(queryId)); + sqlParams.add(StringEscapeUtils.escapeJava(queryPlan.toString())); + + // Assertion at beginning of method guarantees this string will not remain empty + String sql = "insert into query_plan_log set queryId = ?, queryPlan = ?"; + if (urlFactory == null) { + urlFactory = getQueryPlanUrlFactory(conf); + if (urlFactory == null) { + throw new RuntimeException("DB parameters not set!"); + } + } + + HookUtils.runInsert(conf, urlFactory, sql, sqlParams, HookUtils + .getSqlNumRetry(conf)); + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/RegressionTestHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/RegressionTestHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/RegressionTestHook.java (working copy) @@ -0,0 +1,85 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.Set; +import java.util.ArrayList; +import java.sql.Connection; + +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.ql.hooks.ReadEntity; +import org.apache.hadoop.hive.ql.hooks.WriteEntity; +import org.apache.hadoop.hive.ql.hooks.PostExecute; +import org.apache.hadoop.hive.ql.hooks.LineageInfo; +import org.apache.hadoop.security.UserGroupInformation; + +/** + * Implementation of a post execute hook that prints out some more information + * to console to allow regression tests to check correctness. 
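+ *
+ * Output is printed only when the configuration variable "fbhive.regressiontesthook.swtich"
+ * (as defined below) is set to "1".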
+ */ +public class RegressionTestHook implements PostExecute { + static final private Log LOG = LogFactory + .getLog("hive.ql.hooks.RegressionTestHook"); + + final static String REGRESSION_TEST_PRINT_SWITCH_VAR_NAME = "fbhive.regressiontesthook.swtich"; + + public RegressionTestHook() throws Exception { + } + + public void run(SessionState sess, Set inputs, + Set outputs, LineageInfo lInfo, + UserGroupInformation ugi) throws Exception { + HiveConf conf = sess.getConf(); + + String hookSwitch = conf.get(REGRESSION_TEST_PRINT_SWITCH_VAR_NAME, ""); + + if (!hookSwitch.equals("1")) { + return; + } + + String inputStr = ""; + + if (inputs != null) { + StringBuilder inputsSB = new StringBuilder(); + + boolean first = true; + + for (ReadEntity inp : inputs) { + if (!first) + inputsSB.append(","); + first = false; + inputsSB.append(inp.toString()); + } + inputStr = StringEscapeUtils.escapeJava(inputsSB.toString()); + } + + String outputStr = ""; + + if (outputs != null) { + StringBuilder outputsSB = new StringBuilder(); + + boolean first = true; + + for (WriteEntity o : outputs) { + if (!first) + outputsSB.append(","); + first = false; + outputsSB.append(o.toString()); + } + outputStr = StringEscapeUtils.escapeJava(outputsSB.toString()); + } + + String queryId = conf.getVar(HiveConf.ConfVars.HIVEQUERYID); + + System.out + .println("++++++++++Regression Test Hook Output Start+++++++++"); + + System.out.println("+++queryId:" + queryId); + System.out.println("+++input:" + inputStr); + System.out.println("+++output:" + outputStr); + System.out + .println("++++++++++Regression Test Hook Output End+++++++++"); + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/SplitSizeHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/SplitSizeHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/SplitSizeHook.java (working copy) @@ -0,0 +1,60 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.Random; +import java.util.List; +import java.io.Serializable; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.parse.AbstractSemanticAnalyzerHook; +import org.apache.hadoop.hive.ql.parse.HiveSemanticAnalyzerHookContext; +import org.apache.hadoop.hive.ql.parse.HiveSemanticAnalyzerHookContextImpl; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.parse.HiveParser; + +/** + * Implementation of a compile time hook to set all split size parameters from + * mapred.min.split.size if it is CombineHiveInputFormat + * of queries + */ +public class SplitSizeHook extends AbstractSemanticAnalyzerHook { + final static String COMBINE_HIVE_INPUT_FORMAT = "org.apache.hadoop.hive.ql.io.CombineHiveInputFormat"; + final static String CONF_MAPRED_MAX_SPLIT_SIZE = "mapred.max.split.size"; + final static String CONF_MAPRED_MIN_SPLIT_PER_RACK = "mapred.min.split.size.per.rack"; + final static String CONF_MAPRED_MIN_SPLIT_PER_NODE = "mapred.min.split.size.per.node"; + + // If input format is CombineHiveInputFormat, set all 3 related split size parameter to + // mapred.min.split.size. mapred.max.split.size remains its old value if it is larger + // than the new value. 
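+ //
+ // Illustrative example (values in bytes): with mapred.min.split.size = 512000000 and
+ // mapred.max.split.size = 256000000, all of mapred.min.split.size.per.node,
+ // mapred.min.split.size.per.rack and mapred.max.split.size end up at 512000000; a larger
+ // existing mapred.max.split.size would be left as is.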
+ public ASTNode preAnalyze( + HiveSemanticAnalyzerHookContext context, + ASTNode ast) throws SemanticException { + HiveSemanticAnalyzerHookContextImpl ctx = (HiveSemanticAnalyzerHookContextImpl)context; + HiveConf conf = (HiveConf)ctx.getConf(); + + String hiveInputFormat = conf.getVar(HiveConf.ConfVars.HIVEINPUTFORMAT); + + if (!hiveInputFormat.equals(COMBINE_HIVE_INPUT_FORMAT)) { + return ast; + } + + long mapredMinSplitSize = conf.getLongVar(HiveConf.ConfVars.MAPREDMINSPLITSIZE); + + conf.setLong(CONF_MAPRED_MIN_SPLIT_PER_NODE, mapredMinSplitSize); + conf.setLong(CONF_MAPRED_MIN_SPLIT_PER_RACK, mapredMinSplitSize); + long maxSplit = conf.getLong(CONF_MAPRED_MAX_SPLIT_SIZE, (long)-1); + if (mapredMinSplitSize > maxSplit) { + conf.setLong(CONF_MAPRED_MAX_SPLIT_SIZE, mapredMinSplitSize); + } + + return ast; + } + + // Nothing to do + public void postAnalyze( + HiveSemanticAnalyzerHookContext context, + List> rootTasks) throws SemanticException { + // no nothing + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/AuditLocalModeHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/AuditLocalModeHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/AuditLocalModeHook.java (working copy) @@ -0,0 +1,71 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.QueryPlan; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.TaskRunner; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.session.SessionState; + +/** + * Implementation of a post execute hook that checks whether a partition is + * archived or not and also sets that query time for the partition. 
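+ *
+ * Concretely, the hook records, per query, the total number of tasks, the number of map-reduce
+ * tasks and the number of local-mode map-reduce tasks into the audit_local table (see run()
+ * below).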
+ */ +public class AuditLocalModeHook implements ExecuteWithHookContext { + static final private Log LOG = LogFactory + .getLog("hive.ql.hooks.AuditLocalModeHook"); + final static String MYSQL_TIER_VAR_NAME = "fbhive.audit.mysql.tier"; + final static String HOST_DATABASE_VAR_NAME = "fbhive.audit.mysql"; + + ConnectionUrlFactory urlFactory = null; + + public AuditLocalModeHook() throws Exception { + HiveConf conf = new HiveConf(AuditLocalModeHook.class); + urlFactory = HookUtils.getUrlFactory(conf, + FBHiveConf.CONNECTION_FACTORY, + FBHiveConf.AUDIT_CONNECTION_FACTORY, + FBHiveConf.AUDIT_MYSQL_TIER_VAR_NAME, + FBHiveConf.AUDIT_HOST_DATABASE_VAR_NAME); + } + + public void run(HookContext hookContext) throws Exception { + HiveConf conf = hookContext.getConf(); + String command = StringEscapeUtils.escapeJava(SessionState.get().getCmd()); + QueryPlan plan = hookContext.getQueryPlan(); + String queryID = StringEscapeUtils.escapeJava(plan.getQueryId()); + int numLocalModeTask = 0; + int numMapRedTask = 0; + int numTask = 0; + List list = hookContext.getCompleteTaskList(); + numTask = list.size(); + for (TaskRunner tskRunner : list) { + Task tsk = tskRunner.getTask(); + if(tsk.isMapRedTask()){ + if(tsk.isLocalMode()) { + numLocalModeTask++; + } + numMapRedTask++; + } + } + if(numLocalModeTask == 0){ + return ; + } + ArrayList sqlParams = new ArrayList(); + sqlParams.add(StringEscapeUtils.escapeJava(command)); + sqlParams.add(StringEscapeUtils.escapeJava(queryID)); + sqlParams.add(new Integer(numTask)); + sqlParams.add(new Integer(numMapRedTask)); + sqlParams.add(new Integer(numLocalModeTask)); + + String sql = "insert into audit_local set command = ?, query_id = ?, num_tasks = ?, num_mapred_tasks = ?," + + " num_local_mapred_tasks = ?"; + HookUtils.runInsert(conf, urlFactory, sql, sqlParams, HookUtils + .getSqlNumRetry(conf)); + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/HiveConfigLoggingHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/HiveConfigLoggingHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/HiveConfigLoggingHook.java (working copy) @@ -0,0 +1,69 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.plan.HiveOperation; +import org.apache.hadoop.hive.ql.session.SessionState; + +/** + * A post-execute hook that will log the overridden Hive Config + * for this query to the audit_log database. + */ +public class HiveConfigLoggingHook implements ExecuteWithHookContext{ + + protected ConnectionUrlFactory urlFactory = null; + + public static ConnectionUrlFactory getQueryConfigUrlFactory(HiveConf conf) { + // Writes to the same database as BaseReplicationHook. + return HookUtils.getUrlFactory(conf, + FBHiveConf.CONNECTION_FACTORY, + FBHiveConf.REPLICATION_CONNECTION_FACTORY, + FBHiveConf.REPLICATION_MYSQL_TIER_VAR_NAME, + FBHiveConf.REPLICATION_HOST_DATABASE_VAR_NAME); + } + + @Override + public void run(HookContext hookContext) throws Exception { + SessionState ss = SessionState.get(); + if (ss == null) { + // QueryId not present. Nothing to do. 
+ return; + } + HiveConf conf = ss.getConf(); + String queryId = conf.getVar(HiveConf.ConfVars.HIVEQUERYID); + HiveOperation op = ss.getHiveOperation(); + + if ((op == null) || + (!op.equals(HiveOperation.CREATETABLE_AS_SELECT) && + !op.equals(HiveOperation.LOAD) && + !op.equals(HiveOperation.QUERY))) { + return; + } + + Map overriddenConfig = ss.getOverriddenConfigurations(); + String sql = "insert into query_config_log set queryId = ?, config_key = ?, config_value = ?"; + List sqlParams = new ArrayList(); + urlFactory = getQueryConfigUrlFactory(conf); + if (urlFactory == null) { + throw new RuntimeException("DB parameters not set!"); + } + + for (Map.Entry e : overriddenConfig.entrySet()) { + String key = e.getKey(); + String val = conf.get(key); + if (val != null) { + sqlParams.clear(); + sqlParams.add(StringEscapeUtils.escapeJava(queryId)); + sqlParams.add(StringEscapeUtils.escapeJava(key)); + sqlParams.add(StringEscapeUtils.escapeJava(val)); + HookUtils.runInsert(conf, urlFactory, sql, sqlParams, HookUtils + .getSqlNumRetry(conf)); + } + } + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/ConnectionUrlFactory.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/ConnectionUrlFactory.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/ConnectionUrlFactory.java (working copy) @@ -0,0 +1,22 @@ +package org.apache.hadoop.hive.ql.hooks; + +/** + * + * Classes implementing this interface create JDBC connection URL's. + * This can also be used to store a parameters array + */ +public interface ConnectionUrlFactory { + + public void init(String param1Name, String param2Name); + + /** + * @return the JDBC connection URL + * @throws Exception + */ + String getUrl() throws Exception; + + String getUrl(boolean isWrite) throws Exception; + + String getValue(String param1, String param2) throws Exception; + void updateProps(String param1, String param2, String param3); +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/AbstractSmcConfigHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/AbstractSmcConfigHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/AbstractSmcConfigHook.java (working copy) @@ -0,0 +1,112 @@ +package org.apache.hadoop.hive.ql.hooks; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.thrift.TException; +import org.json.JSONException; +import org.json.JSONObject; + +/** + * An abstract class which should be extended by hooks which read configurations from an SMC + * config tier. + */ +public abstract class AbstractSmcConfigHook { + + static final private Log LOG = LogFactory.getLog(AbstractSmcConfigHook.class); + + private static final String CONFIG_FIELD = "config"; + private static final String ENABLED_FIELD = "enabled"; + private ThreadLocal urlFactory = null; + + /** + * Given a HiveConf, checks if the SMC hook enabled config is set to true + * + * @param conf + * @return + */ + protected boolean isEnabled(HiveConf conf) { + boolean enabled = conf.getBoolean(FBHiveConf.ENABLED_CONFIG, false); + + if (!enabled) { + LOG.error("SMC hook is not enabled."); + } + + return enabled; + } + + /** + * In each top level config object (jo) there is an enabled field. 
This method checks that that + * field exists, is set properly, and is set to true. + * + * @param jo + * @param packageName + * @return + * @throws JSONException + */ + protected boolean isConfigEnabled(JSONObject jo, String packageName) throws JSONException { + boolean enabled = false; + + Object enabledObj = null; + + if (jo.has(ENABLED_FIELD)) { + enabledObj = jo.get(ENABLED_FIELD); + } + + if (enabledObj == null || !(enabledObj instanceof Boolean) ) { + LOG.error("enabled not properly set!"); + return false; + } + + enabled = enabledObj.equals(Boolean.TRUE); + + if (!enabled) { + LOG.error("package " + packageName + " is not enabled"); + } + + return enabled; + } + + /** + * Given a HiveConf object, this method goes to the config tier and retrieves the underlying + * config object (whether that's an array, object, or any other type of JSON). It also performs + * checks that the tier can be retrieved, the package name is set, the config is enabled, etc. + * + * @param conf + * @return + * @throws JSONException + * @throws ServiceException + * @throws TException + */ + protected Object getConfigObject(HiveConf conf) + throws JSONException, Exception, TException { + + // Get the properties for this package + String packageName = conf.get(FBHiveConf.FB_CURRENT_CLUSTER); + if (packageName == null) { + LOG.error("Unable to use configs stored in SMC - no hive package set."); + return null; + } + + if (urlFactory == null) { + urlFactory = new ThreadLocal(); + urlFactory.set(HookUtils.getUrlFactory(conf, FBHiveConf.CONNECTION_FACTORY, null, null, null)); + } + + String s = urlFactory.get().getValue(conf.get(FBHiveConf.HIVE_CONFIG_TIER), packageName); + JSONObject jo = new JSONObject(s); + + Object configObj = null; + + if (!isConfigEnabled(jo, packageName)) { + return null; + } + + if (jo.has(CONFIG_FIELD)) { + configObj = jo.get(CONFIG_FIELD); + } + + return configObj; + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/FbUpdateInputAccessTimeHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/FbUpdateInputAccessTimeHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/FbUpdateInputAccessTimeHook.java (working copy) @@ -0,0 +1,247 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.HashMap; +import java.util.Set; +import java.util.LinkedHashMap; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.InvalidOperationException; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; + +/** + * Implementation of a pre AND post execute hook that updates the access times + * for all the inputs. + * + * It is required that this hook is put as the last pre-hook and the first + * post-hook. Invoked as pre-hook, it will start a background thread to update + * update time of all partitions and tables in the input set. Invoked as + * post-hook, it will wait the background thread to finish. And fail the query + * if the background thread fails. + */ +public class FbUpdateInputAccessTimeHook implements ExecuteWithHookContext { + static final private Log LOG = LogFactory + .getLog("hive.ql.hooks.FbUpdateInputAccessTimeHook"); + + private Hive db; + static private Object staticLock = new Object(); + static private HookThread hookThread = null; + static private HookContext lastHookContext = null; + + @Override + public void run(HookContext hookContext) throws Exception { + assert (hookContext != null); + + // if no input, there is no need to start the backgrond thread. + if (hookContext.getInputs() == null || hookContext.getInputs().isEmpty()) { + return; + } + + // This race condition should never happen. But since we use static + // member to keep some global states, we lock it in case it happens + // because of a bug, we won't produce unpredictable results + synchronized (staticLock) { + // there is no flag to determine it is pre-hook or post-hook. + // We just simply make the assumption that if one hook context + // is passed again, it is post hook. + if (lastHookContext == hookContext) { + lastHookContext = null; + runPosthook(hookContext); + } else if (lastHookContext != null || hookThread != null) { + // If we don't forget to put the hook in post-execution hooks, + // likely the previous task failed so that post-hook didn't have + // chance to be executed. + // + // Ideally this error message should print to SessionState's error + // stream if assigned. However, it is not in HookContext. + // We use standard error message for now. + System.err.println( + "WARNING: FbUpdateInputAccessTimeHook doesn't start with a clear " + + "state. Ignore this message if the previous query failed. 
If " + + "previous task succeeded, check whether " + + "FbUpdateInputAccessTimeHook is among the post-execution hooks"); + + if (hookThread != null) { + System.err.println("Waiting for pending background thread of " + + "FbUpdateInputAccessTimeHook to finish..."); + hookThread.join(); + System.err.println("Background thread of FbUpdateInputAccessTimeHook" + + " finished."); + hookThread = null; + } + lastHookContext = hookContext; + runPrehook(hookContext); + } else { + if (!hookContext.getCompleteTaskList().isEmpty()) { + throw new HiveException( + "FbUpdateInputAccessTimeHook is not a part of " + + "pre-execution hook?"); + } + lastHookContext = hookContext; + runPrehook(hookContext); + } + } + } + + private void runPrehook(HookContext hookContext) { + LOG.info("run as pre-execution hook"); + hookThread = new HookThread(hookContext.getConf(), hookContext.getInputs(), + hookContext.getOutputs()); + hookThread.start(); + } + + private void runPosthook(HookContext hookContext) throws HiveException { + LOG.info("run as post-execution hook"); + if (hookThread != null) { + HookThread pendingThread = hookThread; + try { + pendingThread.join(); + } catch (InterruptedException e) { + throw new HiveException( + "Background thread in FbUpdateInputAccessTimeHook failed", e); + } finally { + hookThread = null; + } + + if (!pendingThread.isSuccessful()) { + if (pendingThread.getHiveException() != null) { + throw new HiveException("FbUpdateInputAccessTimeHook failed", + pendingThread.getHiveException()); + } else if (pendingThread.getInvalidOperationException() != null) { + throw new HiveException("FbUpdateInputAccessTimeHook failed", + pendingThread.getInvalidOperationException()); + } else { + throw new HiveException("FbUpdateInputAccessTimeHook failed with " + + "Unhandled Exception."); + } + } + } else { + throw new HiveException( + "FbUpdateInputAccessTimeHook is not one of pre-execution hook, " + + "but it is one of the post-execution hook."); + } + } + + /** + * class for the background thread + * + * @author sdong + * + */ + class HookThread extends Thread { + Set inputs; + Set outputs; + HiveConf hiveConf; + boolean success; + + HiveException hiveException; + InvalidOperationException invalidOperationException; + + HookThread(HiveConf hiveConf, Set inputs, + Set outputs) { + this.hiveConf = hiveConf; + this.inputs = inputs; + this.outputs = outputs; + success = false; + } + + public boolean isSuccessful() { + return success; + } + + public HiveException getHiveException() { + return hiveException; + } + + public InvalidOperationException getInvalidOperationException() { + return invalidOperationException; + } + + private void updateTableAccessTime(HashMap tableMap, + Table table, int lastAccessTime) throws HiveException, + InvalidOperationException { + if (!tableMap.containsKey(table.getTableName())) { + Table t = db.getTable(table.getTableName()); + t.setLastAccessTime(lastAccessTime); + db.alterTable(t.getTableName(), t); + tableMap.put(table.getTableName(), t); + } + } + + public void run() { + try { + if (db == null) { + try { + db = Hive.get(hiveConf); + } catch (HiveException e) { + // ignore + db = null; + return; + } + } + + int lastAccessTime = (int) (System.currentTimeMillis() / 1000); + + HashMap tableMap = new HashMap (); + + for (ReadEntity re : inputs) { + // Set the last query time + ReadEntity.Type typ = re.getType(); + switch (typ) { + // It is possible that read and write entities contain a old + // version + // of the object, before it was modified by StatsTask. 
+ // Get the latest versions of the object + case TABLE: { + updateTableAccessTime(tableMap, re.getTable(), + lastAccessTime); + break; + } + case PARTITION: { + Partition p = re.getPartition(); + updateTableAccessTime(tableMap, p.getTable(), lastAccessTime); + // table already in the map after updating tables' access time + Table t = tableMap.get(p.getTable().getTableName()); + p = db.getPartition(t, p.getSpec(), false); + p.setLastAccessTime(lastAccessTime); + db.alterPartition(t.getTableName(), p); + break; + } + default: + // ignore dummy inputs + break; + } + } + success = true; + } catch (HiveException e) { + hiveException = e; + } catch (InvalidOperationException e) { + invalidOperationException = e; + } + } + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/CheckArchivedDataHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/CheckArchivedDataHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/CheckArchivedDataHook.java (working copy) @@ -0,0 +1,45 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.Set; + +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.hive.conf.HiveConf; + +/** + * Implementation of a pre execute hook that checks whether + * a partition is archived or not + */ +public class CheckArchivedDataHook { + + private static final String ARCHIVE_FLAG = "archivedFlag"; + final static String DISABLE_CHECK_ARCHIVAL_HOOK = "fbhive.disable.checkArchival.hook"; + + public static class PreExec implements PreExecute { + + public void run(SessionState sess, Set inputs, + Set outputs, UserGroupInformation ugi) + throws Exception { + + // Did the user explicitly ask to disable the hook + HiveConf conf = sess.getConf(); + String disableArch = conf.get(DISABLE_CHECK_ARCHIVAL_HOOK); + if ((disableArch != null) && (disableArch.compareToIgnoreCase("false") == 0)) { + return; + } + + //Go over the input paths and check if they are archived or not + for(ReadEntity re: inputs) { + boolean isArchived = false; + if (re.getParameters() != null) { + String archF = re.getParameters().get(ARCHIVE_FLAG); + if (archF != null) { + isArchived = archF.equals("true"); + if (isArchived) + throw new Exception("Path: " + re.getLocation().toString() + " needs to be unarchived."); + } + } + } + } + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/conf/FBHiveConf.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/conf/FBHiveConf.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/conf/FBHiveConf.java (working copy) @@ -0,0 +1,120 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.hooks.conf; + +/** + * Hive Configuration needed for testing various hooks + */ +public class FBHiveConf { + public final static String CONNECTION_FACTORY = "fbhive.urlfactory"; + + // StartFinishHook.java + public final static String STARTFINISH_CONNECTION_FACTORY = "fbhive.startfinishhook.urlfactory"; + public final static String STARTFINISH_HOST_DATABASE_VAR_NAME = "fbhive.startfinish.mysql"; + public final static String STARTFINISH_MYSQL_TIER_VAR_NAME = "fbhive.startfinish.mysql.tier"; + + // AuditJoinHook.java and AuditLocalModeHook.java + public final static String AUDIT_CONNECTION_FACTORY = "fbhive.audit.urlfactory"; + public final static String AUDIT_HOST_DATABASE_VAR_NAME = "fbhive.audit.mysql"; + public final static String AUDIT_MYSQL_TIER_VAR_NAME = "fbhive.audit.mysql.tier"; + + // BaseReplicationHook.java + public final static String REPLICATION_CONNECTION_FACTORY = "fbhive.replication.urlfactory"; + public final static String REPLICATION_HOST_DATABASE_VAR_NAME = "fbhive.replication.mysql"; + public final static String REPLICATION_MYSQL_TIER_VAR_NAME = "fbhive.replication.mysql.tier"; + + // JobStatsHook.java + public final static String JOBSTATS_CONNECTION_FACTORY = "fbhive.jobstats.urlfactory"; + public final static String JOBSTATS_HOST_DATABASE_VAR_NAME = "fbhive.jobstats.mysql"; + public final static String JOBSTATS_MYSQL_TIER_VAR_NAME = "fbhive.jobstats.mysql.tier"; + + // Lineage.java + public final static String LINEAGE_CONNECTION_FACTORY = "fbhive.lineage.urlfactory"; + public final static String LINEAGE_HOST_DATABASE_VAR_NAME = "fbhive.lineage.mysql"; + public final static String LINEAGE_MYSQL_TIER_VAR_NAME = "fbhive.lineage.mysql.tier"; + + // QueryPlanHook.java + public final static String QUERYPLAN_CONNECTION_FACTORY = "fbhive.queryplan.urlfactory"; + public final static String QUERYPLAN_HOST_DATABASE_VAR_NAME = "fbhive.queryplan.mysql"; + public final static String QUERYPLAN_MYSQL_TIER_VAR_NAME = "fbhive.queryplan.mysql.tier"; + + // ExternalInputsHook.java + public final static String METASTORE_CONNECTION_FACTORY = "fbhive.metastore.urlfactory"; + public final static String METASTORE_HOST_DATABASE_VAR_NAME = "fbhive.metastore.mysql"; + public final static String METASTORE_MYSQL_TIER_VAR_NAME = "fbhive.metastore.smc.tier"; + + // SMCStatsDBHook.java + public final static String STATS_CONNECTION_FACTORY = "fbhive.stats.urlfactory"; + public final static String STATS_HOST_DATABASE_VAR_NAME = "fbhive.stats.mysql"; + public final static String STATS_MYSQL_TIER_VAR_NAME = "fbhive.stats.mysql.tier"; + + // QueryDroppedPartitionsHook.java + public final static String QUERYDROPPED_PARTITIONS_CONNECTION_FACTORY = + "fbhive.querydropped.partitions.urlfactory"; + public final static String QUERYDROPPED_PARTITIONS_HOST_DATABASE_VAR_NAME = + "fbhive.querydropped.partitions.mysql"; + public final static String QUERYDROPPED_PARTITIONS_MYSQL_TIER_VAR_NAME = + "fbhive.querydropped.partitions.mysql.tier"; + + // JobTrackerHook.java -- all of them will be set to null + public final static String JOBTRACKER_CONNECTION_FACTORY = "fbhive.jobtracker.urlfactory"; + public final static String JOBTRACKER_HOST_DATABASE_VAR_NAME = "fbhive.jobtracker.mysql"; + public final static String JOBTRACKER_MYSQL_TIER_VAR_NAME = "fbhive.jobtracker.mysql.tier"; + + public final static String ENABLE_PARTIAL_CONCURRENCY = "fbhive.concurrency.percent"; + 
public final static String NO_RETENTION_WARNING_ONLY = "fbhive.retention.warningOnly"; + public static final String FB_CURRENT_CLUSTER = "fbhive.package.name"; + public static final String HIVE_CONFIG_TIER = "fbhive.config.tier"; + + public static final String HIVE_METRICS_PUBLISHER = "hive.metrics.publisher"; + + public static final String ENABLE_PARTIAL_CHANGEDFS = + "fbhive.changde.dfs.percent"; + public static final String SECONDARYMETASTOREWAREHOUSE = + "fbhive.secondary.metastore.warehouse.dir"; + + public static final String ENABLED_CONFIG = "fbhive.smc.config.hook.enabled"; + + // used by MysqlSmcHook + public static final String METASTORE_SMC_URL = "fbhive.smc.url"; + public static final String METASTORE_MYSQL_PROPS = + "fbhive.metastore.mysql.props"; + + // used by CounterMetaStoreEndFunctionListener + public static final String METASTORE_LISTENER_STATS_MANAGER = + "fbhive.metastore.listener.statsmanager"; + + public static final String FBHIVE_DB_USERNAME = + "fbhive.db.username"; + + public static final String FBHIVE_DB_PASSWORD = + "fbhive.db.password"; + + public static final String FBHIVE_SILVER_DFS_PREFIX = + "fbhive.silver.dfs.prefix"; + public static final String FBHIVE_SILVER_DFS_PREFIX2 = + "fbhive.silver.dfs.prefix2"; + public static final String FBHIVE_SILVER_DFS_PREFIX3 = + "fbhive.silver.dfs.prefix3"; + public static final String FBHIVE_PLATINUM_DFS_PREFIX = + "fbhive.platinum.dfs.prefix"; + + public static final String FBHIVE_BRONZE_JOBTRACKER = + "fbhive.bronze.jobtracker"; +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/ExternalInputsHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/ExternalInputsHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/ExternalInputsHook.java (working copy) @@ -0,0 +1,204 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.AbstractSemanticAnalyzerHook; +import org.apache.hadoop.hive.ql.parse.HiveSemanticAnalyzerHookContext; +import org.apache.hadoop.hive.ql.parse.HiveSemanticAnalyzerHookContextImpl; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +/** + * Implementation of a compile time hook that updates the inputs to include the managed objects + * which have data external inputs are pointing to + */ +public class ExternalInputsHook extends AbstractSemanticAnalyzerHook { + + private static final int SQL_NUM_RETRIES = 3 ; + private static final int RETRY_MAX_INTERVAL_SEC = 60; + + // Does nothing + @Override + public ASTNode preAnalyze( + HiveSemanticAnalyzerHookContext context, + ASTNode ast) throws SemanticException { + //do nothing + return ast; + } + + // Updates the inputs to include managed tables/partitions whose data is pointed to by external + // inputs + @Override + public void postAnalyze( + 
HiveSemanticAnalyzerHookContext context, + List> rootTasks) throws SemanticException { + + HiveSemanticAnalyzerHookContextImpl ctx = (HiveSemanticAnalyzerHookContextImpl)context; + HiveConf conf = (HiveConf)ctx.getConf(); + + Set externalLocations = new HashSet(); + + for (ReadEntity input : ctx.getInputs()) { + + // If this input is either an external table or a partition in an external table, add its + // location to the set of locations + if (input.getTable().getTableType() == TableType.EXTERNAL_TABLE) { + String location = null; + try { + location = input.getLocation().toString(); + } catch (Exception e) { + throw new SemanticException("GetLocation failed", e); + } + + // We assume all managed tables exist in /user/facebook/warehouse/ + // This helps to avoid having to look up if there are managed tables pointing to the data + // being pointed to by scrape and scribe staging tables, which point to directories like + // /user/facebook/scrape_staging (staging tables) and /user/facebook/scribe_staging + // (current tables) and /tmp (signal tables) + // We are also excluding inputs which are partitioned tables (without their partitions) + // If the input is a partitioned table, it can only be a metadata change, and hence only + // needs the external table, not the underlying managed table. If any data was queried + // the partition queried will also be in the inputs and we can get the managed + // table/partition from this. + if (location.contains("/user/facebook/warehouse/") && + (!input.getTable().isPartitioned() || input.getType() != ReadEntity.Type.TABLE)) { + externalLocations.add(location); + } + } + } + + // If there were some external inputs, get the managed tables/partitions whose data they + // point to + if (!externalLocations.isEmpty()) { + // The 2 cases in the select are as follows: + // d1.name, t1.tbl_name, p1.part_name + // 1) The external entity's location is such that there are one or more partitions whose + // location is a subdirectory, this includes if the external entity's location is the same + // as the location of a partitioned table, in which case all partitions whose location has + // the table's location as a prefix will be returned, not the table (If the location of + // the table was ever changed, this means only the subset of partitions created after the + // location was changed will be included) + // d2.name, t2.tbl_name, NULL + // 2) The external entity's location is such that there is an unpartitioned whose location is + // a prefix. In this case the table is returned. + + String sql = "SELECT IF(p1.part_name IS NOT NULL, d1.name, d2.name), " + + " IF(p1.part_name IS NOT NULL, t1.tbl_name, t2.tbl_name), " + + " p1.part_name " + + "FROM SDS s LEFT JOIN PARTITIONS p1 ON s.sd_id = p1.sd_id " + + "LEFT JOIN TBLS t1 ON t1.tbl_id = p1.tbl_id " + + "LEFT JOIN DBS d1 ON t1.db_id = d1.db_id " + + "LEFT JOIN TBLS t2 ON t2.sd_id = s.sd_id " + + "LEFT JOIN DBS d2 ON d2.db_id = t2.db_id " + + "LEFT JOIN PARTITION_KEYS k on t2.tbl_id = k.tbl_id " + + "WHERE ((p1.part_name IS NOT NULL AND t1.tbl_type = 'MANAGED_TABLE') OR " + + " (p1.part_name IS NULL AND t2.tbl_type = 'MANAGED_TABLE' AND" + + " k.tbl_id IS NULL)) AND ("; + + List sqlParams = new ArrayList(); + + boolean firstLocation = true; + for (String location : externalLocations) { + if (!firstLocation) { + sql += "OR "; + } else { + firstLocation = false; + } + + sql += "s.location LIKE ? "; + sql += "OR s.location = ? 
"; + // Adding the / ensures that we will only get locations which are subdirectories of the + // external entities location, rather than just having it as a prefix + sqlParams.add(location + "/%"); + // Also check if it is equal, in which case the final / will not be in the location or it + // will be captured by the LIKE + sqlParams.add(location); + } + + sql += ");"; + ConnectionUrlFactory metastoreDbUrlFactory = + HookUtils.getUrlFactory( + conf, + FBHiveConf.CONNECTION_FACTORY, + FBHiveConf.METASTORE_CONNECTION_FACTORY, + FBHiveConf.METASTORE_MYSQL_TIER_VAR_NAME, + FBHiveConf.METASTORE_HOST_DATABASE_VAR_NAME); + List> results = null; + try { + results = HookUtils.runInsertSelect(conf, + metastoreDbUrlFactory, sql, sqlParams, false, SQL_NUM_RETRIES, + RETRY_MAX_INTERVAL_SEC, false); + } catch (Exception e) { + throw new SemanticException("SQL query to retrieve names of managed tables/partitions " + + "pointed to by externals failed", e); + } + + // Construct a mapping to pass to updateInputs, the structure of the mapping is described in + // updateInputs's method description + Map> tableToPartitions = new HashMap>(); + + for (List result : results) { + + String[] dbTable = {(String)result.get(0), (String)result.get(1)}; + if (!tableToPartitions.containsKey(dbTable)) { + tableToPartitions.put(dbTable, new ArrayList()); + } + + String partitionName = (String)result.get(2); + if (partitionName != null) { + tableToPartitions.get(dbTable).add(partitionName); + } + } + + try { + updateInputs(ctx.getInputs(), tableToPartitions, ctx.getHive()); + } catch (HiveException e) { + throw new SemanticException("Failed to retrieve managed Table(s)/Partition(s) mapped to " + + "by externals from the metastore.", e); + } + } + } + + /** + * Given a set of inputs and a map from db/table name to a list of partition names, and an + * instance of Hive it updates the inputs to include for each db/table name the partitions, or if + * the list of partitions is empty, the table + * @param inputs A set of ReadEntities + * @param tableToPartitions A map, whose keys are arrays of strings of length 2, the first index + * should correspond to the db name and the second to the table name, + * the values are lists of Strings representing partition names, if the + * list is empty it is assumed the table is unpartitioned + * @param db An instance of Hive, used to connect to the metastore. 
+ * @throws HiveException + */ + private void updateInputs(Set inputs, Map> tableToPartitions, Hive db) throws HiveException { + for (Entry> entry : tableToPartitions.entrySet()) { + + Table table = db.getTable(entry.getKey()[0], entry.getKey()[1]); + + if (entry.getValue().isEmpty()) { + inputs.add(new ReadEntity(table)); + } else { + List partitions = db.getPartitionsByNames(table, entry.getValue()); + for (Partition partition : partitions) { + inputs.add(new ReadEntity(partition)); + } + } + } + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/CheckRetentionsHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/CheckRetentionsHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/CheckRetentionsHook.java (working copy) @@ -0,0 +1,201 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.QueryPlan; +import org.apache.hadoop.hive.ql.exec.DDLTask; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.plan.CreateTableDesc; +import org.apache.hadoop.hive.ql.plan.DDLWork; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.security.UserGroupInformation; + + +/** + * Implementation of a pre execute hook that checks the table RETENTION is set. + */ +public class CheckRetentionsHook { + + private static final Log LOG = LogFactory.getLog(CheckRetentionsHook.class.getName()); + private static HiveConf conf; + + // If warningOnly = true, we print out some warnnings without fail + // the CREATE TABLE DDL. 
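Unlike the execution hooks elsewhere in this patch, the ExternalInputsHook above is a compile-time (semantic analyzer) hook, so it is picked up from hive.semantic.analyzer.hook rather than the pre/post hook lists. A minimal wiring sketch, shown programmatically for illustration:

import org.apache.hadoop.hive.conf.HiveConf;

public class ExternalInputsHookWiring {
  public static void main(String[] args) {
    HiveConf conf = new HiveConf();
    conf.set("hive.semantic.analyzer.hook",
        "org.apache.hadoop.hive.ql.hooks.ExternalInputsHook");
  }
}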
+ private static boolean warningOnly = false; + + // required table parameters + private static final String RETENTION_FLAG = "RETENTION"; + private static final String RETENTION_PLATINUM_FLAG = "RETENTION_PLATINUM"; + + // wiki page URL that explains the policies + private static final String wikiURL = + "https://www.intern.facebook.com/intern/wiki/index.php/Data/Hive/" + + "Retention_on_new_tables"; + + private static String retentionKey = null; + + private static String ErrMsg(String str) { + return str + "\n Here's how to add retention: " + wikiURL; + } + + public static class PreExec implements ExecuteWithHookContext { + + public void run(HookContext hookContext) + throws Exception { + + assert(hookContext.getHookType() == HookContext.HookType.PRE_EXEC_HOOK); + + SessionState sess = SessionState.get(); + Set inputs = hookContext.getInputs(); + Set outputs = hookContext.getOutputs(); + UserGroupInformation ugi = hookContext.getUgi(); + conf = sess.getConf(); + + warningOnly = conf.getBoolean(FBHiveConf.NO_RETENTION_WARNING_ONLY, true); + + // figure out if we are on silver or platinum + String whDir = HiveConf.getVar(conf, HiveConf.ConfVars.METASTOREWAREHOUSE); + if (whDir == null) { + throw new Exception(ErrMsg("Cannot determine which cluster this query is running on: " + + "hive.metastore.warehouse.dir is not set!")); + } + + Path p = new Path(whDir); + String hostName = p.toUri().getHost(); + + if (hostName.equals(conf.get(FBHiveConf.FBHIVE_SILVER_DFS_PREFIX)) || + hostName.equals(conf.get(FBHiveConf.FBHIVE_SILVER_DFS_PREFIX2)) || + hostName.equals(conf.get(FBHiveConf.FBHIVE_SILVER_DFS_PREFIX3))) { + retentionKey = RETENTION_FLAG; + } else if (hostName.equals(conf.get(FBHiveConf.FBHIVE_PLATINUM_DFS_PREFIX))) { + retentionKey = RETENTION_PLATINUM_FLAG; + } else { + throw new Exception(ErrMsg("Cannot determine which cluster this query is running on: " + + "hive.metastore.warehouse.dir=" + whDir + + "; does not seems to belong to either silver or platinum!")); + } + + Set> tasks = new HashSet>(); + getReachableTasks(tasks, hookContext.getQueryPlan()); + + for (Task task: tasks) { + if (task instanceof DDLTask) { + DDLWork work = (DDLWork) task.getWork(); + if (work.getCreateTblDesc() != null) { + checkRetention(work.getCreateTblDesc(), retentionKey); + } + } + } + } + + private void getReachableTasks(Set> tasks, QueryPlan qp) { + ArrayList> rootTasks = qp.getRootTasks(); + for (Task task: rootTasks) { + getReachableTasks(tasks, task); + } + } + + /** + * Recursively traverse the task dependence tree and gather all tasks into + * the set. + */ + private void getReachableTasks(Set> tasks, + Task rootTask) { + if (!tasks.contains(rootTask)) { + tasks.add(rootTask); + if (rootTask.getDependentTasks() != null) { + for (Task child: rootTask.getDependentTasks()) { + getReachableTasks(tasks, child); + } + } + } + } + + private void warnOrFail(boolean warning, String mesg) throws Exception { + if (warning) { + // shout loud on stderr! + System.err.println("\n ----------"); + System.err.println("| WARNING: | "); + System.err.println(" ----------"); + System.err.println(" This command does NOT comply with the RETENTION " + + "policies. This command will fail in the near future. \n" + + mesg); + } else { + throw new Exception(mesg); + } + } + + + /** + * Check if the CREATE TABLE statement has retention and data growth + * estimation set. If not throw an exception. 
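Concretely, a newly created managed table passes this check on silver if its DDL carries something like TBLPROPERTIES ('RETENTION' = '90'), where the value 90 is purely illustrative (any integer greater than or equal to -1 is accepted); on platinum the key that is checked is RETENTION_PLATINUM instead.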
+ */ + private void checkRetention(CreateTableDesc desc, String retentionKey) + throws Exception { + + // exclude EXTERNAL tables + if (desc.isExternal()) { + return; + } + + // TODO: remove this whenever it becomes feasible + // exclude table name starts with tmp, temp, or test: tmp tables should be set default retention + if (desc.getTableName().startsWith("tmp") || + desc.getTableName().startsWith("temp") || + desc.getTableName().startsWith("test")) { + return; + } + + // check if table already exists + if (tableExists(desc.getTableName())) { + return; + } + + String tableNeedsRetention = "Newly created tables have to have " + retentionKey + + " set unless the table name has one of the prefixes \"tmp\", \"temp\", or \"test\"."; + + String tableRetentionFormat = "The value of the " + retentionKey + " parameter must be an " + + "integer greater than or equal to -1, i.e. -1,0,1,..."; + + // check 'RETENTION' parameter exists + String retentionValue = ""; + if (desc.getTblProps() == null || + (retentionValue = desc.getTblProps().get(retentionKey)) == null) { + warnOrFail(warningOnly, ErrMsg("Table " + desc.getTableName() + " does not have " + + retentionKey + " parameter set. " + tableNeedsRetention + " " + tableRetentionFormat)); + return; + } + + // check 'RETENTION' parameter is set to a value in the range -1,0,1,... + int retentionIntValue; + try { + retentionIntValue = Integer.parseInt(retentionValue); + } catch (Exception e) { + // retentionValue is not a valid integer, set retentionIntValue to an invalid value + retentionIntValue = Integer.MIN_VALUE; + } + + if (retentionIntValue < -1) { + warnOrFail(warningOnly, ErrMsg("Table " + desc.getTableName() + " has an invalid value " + + retentionValue + " for the parameter " + retentionKey + ". " + + tableRetentionFormat + " " + tableNeedsRetention)); + } + } + + private boolean tableExists(String tabName) throws Exception { + Hive db = Hive.get(conf); + Table table = db.getTable("default", tabName, false); + return table != null; + } + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/JobStatsHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/JobStatsHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/JobStatsHook.java (working copy) @@ -0,0 +1,296 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.MapRedStats; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.TaskRunner; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.plan.HiveOperation; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.ql.stats.HiveStatsMetricsPublisher; +import org.apache.hadoop.hive.ql.stats.HiveStatsMetricsPublisher.QueryTag; +import org.apache.hadoop.mapred.Counters; +import org.json.JSONObject; + +/** + * A hook which populates the job_stats_log MySQL table with + * stats for each job which has run for this query, the query ID, + * and whether or not the query succeeded. + * + * It also sets the query attributes in HiveStatsMetricsPublisher and logs + * the stats through it as well. 
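The conditionalAddJobStat overloads defined further down only record a counter when the handle is non-null and its value is non-negative. Distilled into a standalone sketch (the group/name shown in the usage comment is one of the counters used below):

import org.apache.hadoop.mapred.Counters;

public class CounterGuardSketch {
  // Returns null when the counter is absent or reports a negative value,
  // mirroring the guard in conditionalAddJobStat.
  static Long readCounter(Counters ctrs, String group, String name) {
    if (ctrs == null) {
      return null;
    }
    Counters.Counter c = ctrs.findCounter(group, name);
    if (c == null || c.getValue() < 0) {
      return null;
    }
    return Long.valueOf(c.getValue());
  }

  // e.g. readCounter(ctrs, "FileSystemCounters", "HDFS_BYTES_READ")
}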
+ */ +public class JobStatsHook implements ExecuteWithHookContext { + + public static final String HIVE_QUERY_SOURCE = "hive.query.source"; + + public static ConnectionUrlFactory getJobStatsUrlFactory(HiveConf conf){ + return HookUtils.getUrlFactory(conf, + FBHiveConf.CONNECTION_FACTORY, + FBHiveConf.JOBSTATS_CONNECTION_FACTORY, + FBHiveConf.JOBSTATS_MYSQL_TIER_VAR_NAME, + FBHiveConf.JOBSTATS_HOST_DATABASE_VAR_NAME); + } + + @Override + public void run(HookContext hookContext) throws Exception { + + assert(hookContext.getHookType() == HookContext.HookType.POST_EXEC_HOOK || + hookContext.getHookType() == HookContext.HookType.ON_FAILURE_HOOK); + + String queryId = ""; + String querySrc = ""; + String queryTagsStr = ""; + String statsString = ""; + SessionState sess = SessionState.get(); + String queryFailed = hookContext.getHookType() == HookContext.HookType.ON_FAILURE_HOOK ? "1" + : "0"; + HiveConf conf = sess.getConf(); + HiveStatsMetricsPublisher metricsPublisher = + (HiveStatsMetricsPublisher)HookUtils.getObject(conf, conf.get(FBHiveConf.HIVE_METRICS_PUBLISHER)); + if (metricsPublisher == null) { + return; + } + + metricsPublisher.extractAndOverwriteQueryAttributes(hookContext); + + JSONObject jobStats = new JSONObject(); + + ConnectionUrlFactory urlFactory = getJobStatsUrlFactory(conf); + if (urlFactory == null) { + throw new RuntimeException("DB parameters for audit_log not set!"); + } + + if (sess != null) { + queryId = conf.getVar(HiveConf.ConfVars.HIVEQUERYID); + querySrc = conf.get(HIVE_QUERY_SOURCE, ""); + + List completedTasks = hookContext.getCompleteTaskList(); + Map jobToStageMap = new HashMap(); + + if (completedTasks != null) { + for (TaskRunner taskRunner : completedTasks) { + Task task = taskRunner.getTask(); + // If the Job ID is null, this indicates the task is not a map + // reduce task, or it was run locally + if (task.getJobID() != null) { + String jobID = StringEscapeUtils.escapeJava(task.getJobID()); + String stageID = StringEscapeUtils.escapeJava(task.getId()); + jobToStageMap.put(jobID, stageID); + } + } + } + + List listStats = sess.getLastMapRedStatsList(); + if (listStats != null && listStats.size() > 0) { + Map[] perJobStats = new Map[listStats.size()]; + for (int i = 0; i < listStats.size(); i++) { + MapRedStats mps = listStats.get(i); + Counters ctrs = mps.getCounters(); + Map counterList = new HashMap(); + Map metrics = new HashMap(); + + counterList.put("job_ID", mps.getJobId()); + + if (jobToStageMap.containsKey(mps.getJobId())) { + counterList.put("stage", jobToStageMap.get(mps.getJobId())); + } + + addJobStat(counterList, metrics, "cpu_msec", "cpu_sec", mps.getCpuMSec(), 1000); + addJobStat(counterList, metrics, "map", mps.getNumMap()); + addJobStat(counterList, metrics, "reduce", mps.getNumReduce()); + if (ctrs != null) { + conditionalAddJobStat(counterList, metrics, "hdfs_read_bytes", "hdfs_read_mbytes", + ctrs.findCounter("FileSystemCounters", "HDFS_BYTES_READ"), 1000000); + conditionalAddJobStat(counterList, metrics, "hdfs_write_bytes", "hdfs_write_mbytes", + ctrs.findCounter("FileSystemCounters", "HDFS_BYTES_WRITTEN"), 1000000); + conditionalAddJobStat(counterList, metrics, "hdfs_local_read_bytes", + "hdfs_read_local_mbytes", ctrs.findCounter("FileSystemCounters", + "HDFS_BYTES_READ_LOCAL"), 1000000); + conditionalAddJobStat(counterList, metrics, "hdfs_rack_read_bytes", + "hdfs_rack_read_mbytes", ctrs.findCounter("FileSystemCounters", + "HDFS_BYTES_READ_RACK"), 1000000); + conditionalAddJobStat(counterList, metrics, "hdfs_read_exceptions", + 
ctrs.findCounter("FileSystemCounters", "HDFS_READ_EXCEPTIONS")); + conditionalAddJobStat(counterList, metrics, + "hdfs_write_exceptions", + ctrs.findCounter("FileSystemCounters", "HDFS_WRITE_EXCEPTIONS")); + conditionalAddJobStat(counterList, metrics, "map_input_records", + "map_input_million_records", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS"), 1000000); + conditionalAddJobStat(counterList, metrics, "map_output_records", + "map_output_million_records", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_OUTPUT_RECORDS"), + 1000000); + conditionalAddJobStat(counterList, metrics, "reduce_input_records", + "reduce_input_million_records", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_INPUT_RECORDS"), + 1000000); + conditionalAddJobStat(counterList, metrics, "reduce_output_records", + "reduce_output_million_records", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS"), + 1000000); + conditionalAddJobStat(counterList, metrics, "shuffle_bytes", "shuffle_mbytes", + ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_SHUFFLE_BYTES"), 1000000); + conditionalAddJobStat(counterList, metrics, "map_input_bytes", "map_input_mbytes", + ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_BYTES"), 1000000); + conditionalAddJobStat(counterList, metrics, "map_spill_cpu_msecs", + "map_spill_cpu_secs", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_SPILL_CPU"), 1000); + conditionalAddJobStat(counterList, metrics, "map_spill_wallclock_msecs", + "map_spill_walllclock_secs", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_SPILL_WALLCLOCK"), 1000); + conditionalAddJobStat(counterList, metrics, "map_spill_number", "map_spill_number", + ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_SPILL_NUMBER"), 1); + conditionalAddJobStat(counterList, metrics, "map_spill_bytes", "map_spill_mbytes", + ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_SPILL_BYTES"), 1000000); + conditionalAddJobStat(counterList, metrics, "map_mem_sort_cpu_msecs", + "map_mem_sort_cpu_secs", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_MEM_SORT_CPU"), 1000); + conditionalAddJobStat(counterList, metrics, "map_mem_sort_wallclock_msecs", + "map_mem_sort_wallclock_secs", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_MEM_SORT_WALLCLOCK"), + 1000); + conditionalAddJobStat(counterList, metrics, "map_merge_cpu_msecs", + "map_merge_cpu_secs", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_MERGE_CPU"), 1000); + conditionalAddJobStat(counterList, metrics, "map_merge_wallclock_msecs", + "map_merge_wallclock_secs", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_MERGE_WALLCLOCK"), 1000); + conditionalAddJobStat(counterList, metrics, "reduce_copy_cpu_msecs", + "reduce_copy_cpu_secs", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_COPY_CPU"), 1000); + conditionalAddJobStat(counterList, metrics, "reduce_copy_wallclock_msecs", + "reduce_copy_wallclock_secs", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_COPY_WALLCLOCK"), + 1000); + conditionalAddJobStat(counterList, metrics, "reduce_sort_cpu_msecs", + "reduce_sort_cpu_secs", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_SORT_CPU"), 1000); + conditionalAddJobStat(counterList, metrics, "redcue_sort_wallclock_msecs", + "reduce_sort_wallclock_secs", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", 
"REDUCE_SORT_WALLCLOCK"), + 1000); + conditionalAddJobStat(counterList, metrics, "map_task_wallclock_msecs", + "map_task_wallclock_secs", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_TASK_WALLCLOCK"), 1000); + conditionalAddJobStat(counterList, metrics, "reduce_task_wallclock_msecs", + "reduce_task_wallclock_secs", ctrs.findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_TASK_WALLCLOCK"), + 1000); + conditionalAddJobStat(counterList, metrics, "slots_millis_maps", "slots_secs_maps", + ctrs.findCounter("org.apache.hadoop.mapred.JobInProgress$Counter", + "SLOTS_MILLIS_MAPS"), 1000); + conditionalAddJobStat(counterList, metrics, "slots_millis_reduces", + "slots_secs_reduces", ctrs.findCounter( + "org.apache.hadoop.mapred.JobInProgress$Counter", "SLOTS_MILLIS_REDUCES"), + 1000); + } + addJobStat(counterList, metrics, "success", mps.isSuccess() ? 1 : 0); + perJobStats[i] = counterList; + + metricsPublisher.publishMetricsWithQueryTags(metrics); + } + jobStats.put("per_job_stats", perJobStats); + } + } + + HiveOperation op = sess.getHiveOperation(); + + // If input was read, log the input and output size + if ((op != null) && + ((op.equals(HiveOperation.CREATETABLE_AS_SELECT)) || + (op.equals(HiveOperation.LOAD)) || + (op.equals(HiveOperation.QUERY)))) { + + // We are depending on the stats to be present in the metastore. + // If that is not true, we might end up calling getContentSummary for + // all the inputs and outputs, which may create a problem for HDFS + // Allow the user to manually turn it off. + if (!conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { + if (SessionState.get().getOverriddenConfigurations().containsKey( + HiveConf.ConfVars.HIVESTATSAUTOGATHER.varname)) { + SessionState.getConsole().printInfo("WARNING: hive.stats.autogather is set to false." + + " Stats were not populated for any outputs of this query. If any tables or " + + "partitions were overwritten as part of this query, their stats may be incorrect"); + } else { + throw new RuntimeException("hive.stats.autogather is set to false"); + } + } + + // Log the total size and the individual sizes for each input and output + HookUtils.ObjectSize inputSizes = + HookUtils.getObjectSize(conf, + new HashSet(hookContext.getInputs()), + false); + jobStats.put("input_size", String.valueOf(inputSizes.getTotalSize())); + if (!inputSizes.getObjectTypeLengths().isEmpty()) { + jobStats.put("inputs", inputSizes.getObjectTypeLengths()); + } + + // Log the pool specified in the conf. May be overwritten by the conf + // if we enable the feature on the JT to disallow non-standard pools. + String specifiedPool = conf.get("mapred.fairscheduler.pool", ""); + if (specifiedPool.length() > 0) { + jobStats.put("pool", conf.get("mapred.fairscheduler.pool")); + } + + if (hookContext.getHookType() != HookContext.HookType.ON_FAILURE_HOOK) { + // The object for the outputs was created before the statistics in that + // object was populated. 
So, reload the outputs from the metastore to get + // the size for outputs + HookUtils.ObjectSize outputSizes = + HookUtils.getObjectSize(conf, + new HashSet(hookContext.getOutputs()), + true); + jobStats.put("output_size", String.valueOf(outputSizes.getTotalSize())); + if (!outputSizes.getObjectTypeLengths().isEmpty()) { + jobStats.put("outputs", outputSizes.getObjectTypeLengths()); + } + } + } + + statsString = jobStats.toString(); + + Set queryTags = metricsPublisher.getQueryAttributes(); + queryTagsStr = StringUtils.join(queryTags, ','); + + List sqlParams = new ArrayList(); + sqlParams.add(StringEscapeUtils.escapeJava(queryId)); + sqlParams.add(StringEscapeUtils.escapeJava(querySrc)); + sqlParams.add(queryFailed); + sqlParams.add(queryTagsStr); + sqlParams.add(statsString); + + // Assertion at beginning of method guarantees this string will not remain empty + String sql = "insert into job_stats_log set queryId = ?, query_src = ?, query_failed = ?, " + + "query_tags = ?, job_stats = ?"; + + HookUtils.runInsert(conf, urlFactory, sql, sqlParams, HookUtils + .getSqlNumRetry(conf)); + } + + private void conditionalAddJobStat(Map counterList, Map metrics, + String key, Counters.Counter cntr) { + conditionalAddJobStat(counterList, metrics, key, key, cntr, 1); + } + + private void conditionalAddJobStat(Map counterList, Map metrics, + String exactKey, String approximateKey, Counters.Counter cntr, int divisor) { + if (cntr != null) { + conditionalAddJobStat(counterList, metrics, exactKey, approximateKey, cntr.getValue(), divisor); + } + } + + private void conditionalAddJobStat(Map counterList, Map metrics, + String exactKey, String approximateKey, long cntrValue, int divisor) { + if (cntrValue >= 0) { + addJobStat(counterList, metrics, exactKey, approximateKey, cntrValue, divisor); + } + } + + private void addJobStat(Map counterList, Map metrics, String key, long value) { + addJobStat(counterList, metrics, key, key, value, 1); + } + + // Method that adds a key value pair to a map, as well as to a list of OdsKeyValuePairs with average aggregation + private void addJobStat(Map counterList, Map metrics, String exactKey, + String approximatedKey, long value, int divisor) { + counterList.put(exactKey, String.valueOf(value)); + metrics.put(approximatedKey, (double)value/divisor); + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/StartFinishHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/StartFinishHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/StartFinishHook.java (working copy) @@ -0,0 +1,180 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.ArrayList; +import java.util.Set; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.security.UserGroupInformation; + +/** + * This hook records the approx. start and finish times of queries into a table + * in MySQL (query_time_log). Useful for debugging. Possibly for performance + * measurement. + * + * - Relies on query_id to update rows with the finish time. + * - Old entries in this table should be cleaned out on a regular basis. 
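The class comment above leaves the cleanup of old rows to an external job. A hedged JDBC sketch of such a job; the connection URL, credentials, and the 30-day window are placeholders (a real deployment would resolve the URL through the same ConnectionUrlFactory machinery used by the hook):

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

public class QueryTimeLogCleanup {
  public static void main(String[] args) throws Exception {
    Class.forName("com.mysql.jdbc.Driver");
    // Placeholder URL and credentials, for illustration only.
    Connection conn = DriverManager.getConnection(
        "jdbc:mysql://dbhost/hive_hooks", "user", "password");
    try {
      PreparedStatement stmt = conn.prepareStatement(
          "DELETE FROM query_time_log WHERE start_time < NOW() - INTERVAL 30 DAY");
      try {
        int purged = stmt.executeUpdate();
        System.out.println("Purged " + purged + " old query_time_log rows");
      } finally {
        stmt.close();
      }
    } finally {
      conn.close();
    }
  }
}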
+ */ + +/* +Example table schema: + +CREATE TABLE `query_time_log` ( + `query_id` varchar(512) DEFAULT NULL, + `start_time` timestamp NULL DEFAULT NULL, + `finish_time` timestamp NULL DEFAULT NULL, + `query` mediumtext, + `query_type` varchar(32) DEFAULT NULL, + `inputs` mediumtext, + `outputs` mediumtext, + `user_info` varchar(512) DEFAULT NULL, + PRIMARY KEY (`query_id`), + INDEX(start_time), + INDEX(finish_time), + INDEX(inputs(256)), + INDEX(outputs(256)) + ) ENGINE=InnoDB DEFAULT CHARSET=latin1; +*/ + +public class StartFinishHook implements PreExecute, PostExecute { + + + + ConnectionUrlFactory urlFactory = null; + + public StartFinishHook() throws Exception { + HiveConf conf = new HiveConf(StartFinishHook.class); + + + urlFactory = HookUtils.getUrlFactory(conf, + FBHiveConf.CONNECTION_FACTORY, + FBHiveConf.STARTFINISH_CONNECTION_FACTORY, + FBHiveConf.STARTFINISH_MYSQL_TIER_VAR_NAME, + FBHiveConf.STARTFINISH_HOST_DATABASE_VAR_NAME); + } + + /** + * Returns a list of strings with following values extracted from the state: + * command, commandType, inputStr, outputStr, queryId, userInfo + * + * @param sess + * @param inputs + * @param outputs + * @param ugi + * @return + */ + private static ArrayList extractValues(SessionState sess, + Set inputs, Set outputs, UserGroupInformation ugi) { + String command = sess.getCmd(); + String commandType = sess.getCommandType(); + String userInfo = ""; + if (ugi != null) { + userInfo = ugi.getUserName(); + } + String inputStr = ""; + + if (inputs != null) { + StringBuilder inputsSB = new StringBuilder(); + + boolean first = true; + + for (ReadEntity inp : inputs) { + if (!first) { + inputsSB.append(","); + } + first = false; + inputsSB.append(inp.toString()); + } + inputStr = inputsSB.toString(); + } + + String outputStr = ""; + + if (outputs != null) { + StringBuilder outputsSB = new StringBuilder(); + + boolean first = true; + + for (WriteEntity o : outputs) { + if (!first) { + outputsSB.append(","); + } + first = false; + outputsSB.append(o.toString()); + } + outputStr = outputsSB.toString(); + } + + String queryId = getQueryId(sess); + + ArrayList values = new ArrayList(); + values.add(command); + values.add(commandType); + values.add(inputStr); + values.add(outputStr); + values.add(queryId); + values.add(userInfo); + + return values; + } + + private static String getQueryId(SessionState sess) { + HiveConf conf = sess.getConf(); + String queryId = conf.getVar(HiveConf.ConfVars.HIVEQUERYID); + return queryId; + + } + + /** + * For PreExecute + */ + @Override + public void run(SessionState sess, Set inputs, + Set outputs, UserGroupInformation ugi) throws Exception { + ArrayList values = StartFinishHook.extractValues(sess, + inputs, outputs, ugi); + String query = "INSERT INTO query_time_log SET " + + "query = ?, " + + "query_type = ?, " + + "inputs = ?, " + + "outputs = ?, " + + "query_id = ?, " + + "user_info = ?, " + + "start_time = now()"; + + HiveConf conf = sess.getConf(); + // pre-hook doesn't need to retry many times and can fail faster. + HookUtils.runInsert(conf, urlFactory, query, values, 5); + } + + /** + * For PostExecute + */ + @Override + public void run(SessionState sess, Set inputs, + Set outputs, LineageInfo info, UserGroupInformation ugi) + throws Exception { + ArrayList values = StartFinishHook.extractValues(sess, + inputs, outputs, ugi); + // Duplicate values for update statement + values.addAll(values); + // The ON DUPLICATE.. 
ensures that start_time is preserved for normal cases + // where start_time was recorded + String valueString = + "query = ?, " + + "query_type = ?, " + + "inputs = ?, " + + "outputs = ?, " + + "query_id = ?, " + + "user_info = ?, " + + "finish_time = now()"; + String query = "INSERT INTO query_time_log SET " + valueString + + " ON DUPLICATE KEY UPDATE " + valueString ; + + HiveConf conf = sess.getConf(); + HookUtils.runInsert(conf, urlFactory, query, values, HookUtils + .getSqlNumRetry(conf)); + } + +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/AlterTableRestrictHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/AlterTableRestrictHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/AlterTableRestrictHook.java (working copy) @@ -0,0 +1,84 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.Map; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.plan.DDLWork; +import org.apache.hadoop.hive.ql.plan.HiveOperation; +import org.apache.hadoop.hive.ql.session.SessionState; + +/** + * Adding a hook to stop platinum tables from getting modified in silver + */ +public class AlterTableRestrictHook implements ExecuteWithHookContext { + static final private Log LOG = LogFactory.getLog("hive.ql.hooks.AlterTableRestrictHook"); + + String current_cluster = null; + + public AlterTableRestrictHook() throws Exception { + HiveConf conf = new HiveConf(AlterTableRestrictHook.class); + current_cluster = conf.get(FBHiveConf.FB_CURRENT_CLUSTER); + } + + /** + * Restrict the alter table command if the current cluster is not the same + * as the creation cluster + * + */ + public void run(HookContext hookContext) throws Exception { + SessionState ss = SessionState.get(); + + if ((current_cluster == null) || (ss == null)) { + return; + } + + HiveOperation commandType = ss.getHiveOperation(); + + // This check is only for alter table + if (!((commandType == HiveOperation.ALTERTABLE_ADDCOLS) || + (commandType == HiveOperation.ALTERTABLE_REPLACECOLS) || + (commandType == HiveOperation.ALTERTABLE_RENAMECOL) || + (commandType == HiveOperation.ALTERTABLE_RENAMEPART) || + (commandType == HiveOperation.ALTERTABLE_RENAME) || + (commandType == HiveOperation.ALTERTABLE_PROPERTIES) || + (commandType == HiveOperation.ALTERTABLE_SERIALIZER) || + (commandType == HiveOperation.ALTERTABLE_SERDEPROPERTIES) || + (commandType == HiveOperation.ALTERTABLE_CLUSTER_SORT) || + (commandType == HiveOperation.ALTERTABLE_FILEFORMAT))) { + return; + } + + // If the creation cluster is being modified to be the current cluster the alter should not be + // restricted + if (commandType == HiveOperation.ALTERTABLE_PROPERTIES) { + Map newProps = + ((DDLWork)(hookContext.getQueryPlan().getRootTasks().get(0).getWork())) + .getAlterTblDesc().getProps(); + if (newProps.containsKey(HookUtils.TABLE_CREATION_CLUSTER) && + (newProps.get(HookUtils.TABLE_CREATION_CLUSTER).equals(current_cluster))) { + return; + } + } + + Set outputs = hookContext.getOutputs(); + for (WriteEntity output : outputs) { + Table table = output.getT(); + if (table != null) { + String tableCreationCluster = table.getProperty(HookUtils.TABLE_CREATION_CLUSTER); + if (tableCreationCluster != null && + 
!tableCreationCluster.isEmpty() && + !tableCreationCluster.equals(current_cluster)) { + String exception = "Table " + table.getTableName() + " cannot be modified."; + exception += " Table's cluster is " + tableCreationCluster + ","; + exception += "whereas current package is " + current_cluster; + throw new Exception(exception); + } + } + } + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/QueryDroppedPartitionsHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/QueryDroppedPartitionsHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/QueryDroppedPartitionsHook.java (working copy) @@ -0,0 +1,156 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.session.SessionState; + +/** + * A hook which is used to prevent people from querying dropped partitions in + * silver. The list of wrongly dropped partitions is in cdb.datahawk - if the + * query uses any of these partitions, it should fail. + */ +public class QueryDroppedPartitionsHook implements ExecuteWithHookContext { + static final private Log LOG = + LogFactory.getLog(QueryDroppedPartitionsHook.class); + +// private static final String SMC_DATABASE_NAME = "cdb.datahawk"; + + @Override + public void run(HookContext hookContext) throws Exception { + + assert(hookContext.getHookType() == HookContext.HookType.PRE_EXEC_HOOK); + SessionState sess = SessionState.get(); + HiveConf conf = sess.getConf(); + String commandType = StringEscapeUtils.escapeJava(sess.getCommandType()); + + // Only check for queries + if ((commandType == null) || + (!commandType.equals("QUERY") && + !commandType.equals("CREATETABLE_AS_SELECT"))) { + return; + } + + Set inputs = hookContext.getInputs(); + + // Nothing to check + if ((inputs == null) || (inputs.isEmpty())) { + return; + } + + String inputString = getInputs(inputs); + if ((inputString == null) || (inputString.isEmpty())) { + return; + } + + ConnectionUrlFactory urlFactory = + HookUtils.getUrlFactory(conf, + FBHiveConf.CONNECTION_FACTORY, + FBHiveConf.QUERYDROPPED_PARTITIONS_CONNECTION_FACTORY, + FBHiveConf.QUERYDROPPED_PARTITIONS_MYSQL_TIER_VAR_NAME, + FBHiveConf.QUERYPLAN_HOST_DATABASE_VAR_NAME); + + // Return silently if you cannot connect for some reason + if ((FBHiveConf.QUERYDROPPED_PARTITIONS_MYSQL_TIER_VAR_NAME == null) || + FBHiveConf.QUERYDROPPED_PARTITIONS_MYSQL_TIER_VAR_NAME.isEmpty()) { + LOG.warn(FBHiveConf.QUERYPLAN_MYSQL_TIER_VAR_NAME + " is null"); + return; + } + + if (urlFactory == null) { + LOG.warn("unable to access " + conf.get(FBHiveConf.QUERYPLAN_MYSQL_TIER_VAR_NAME)); + return; + } + + List sqlParams = new ArrayList(); + sqlParams.add(inputString); + + LOG.info("QueryDroppedPartitionsHook input string: " + inputString); + + // Does the query reference a dropped partition + String sql = "select count(*) from 0114_dropped_parts3 " + + "where (recovered is null or recovered != 1) and ?"; + + List> result = + HookUtils.runInsertSelect(conf, urlFactory, sql, sqlParams, false); + 
+ Long numberDroppedPartitions = null; + + if (!result.isEmpty() && result.get(0).get(0) != null) { + numberDroppedPartitions = (Long)result.get(0).get(0); + } + + if ((numberDroppedPartitions != null) && + (numberDroppedPartitions > 0)) { + String exception = "You cannot select from " + inputString + "."; + exception += "Look at "; + exception += + "https://our.intern.facebook.com/intern/sevmanager/prod/sev/137261279725248"; + exception += " for details "; + throw new Exception(exception); + } + + } + + private String getInputs(Set inputs) { + StringBuilder sb = new StringBuilder(); + + Map> inputMap = new HashMap>(); + + for (ReadEntity input : inputs) { + Partition inputPartition = input.getP(); + if (inputPartition == null) { + continue; + } + + if (!inputMap.containsKey(inputPartition.getTable().getTableName())) { + inputMap.put(inputPartition.getTable().getTableName(), new HashSet()); + } + inputMap.get(inputPartition.getTable().getTableName()).add(inputPartition.getName().split("/")[0]); + } + + if (inputMap.isEmpty()) { + return ""; + } + + sb.append("("); + boolean firstTable = true; + + for (Entry> entry : inputMap.entrySet()) { + if (!firstTable) { + sb.append(" OR "); + } else { + firstTable = false; + } + + sb.append("(table_name = '" + entry.getKey() + "' AND ds IN ("); + + boolean firstPartition = true; + for (String part : entry.getValue()) { + if (!firstPartition) { + sb.append(", "); + } else { + firstPartition = false; + } + + sb.append("'" + part + "'"); + } + sb.append("))"); + } + sb.append(")"); + + return sb.toString(); + } + +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/SuggestionPrintingHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/SuggestionPrintingHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/SuggestionPrintingHook.java (working copy) @@ -0,0 +1,95 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.Map; +import java.util.Set; +import java.util.Random; +import java.util.ArrayList; +import java.util.Map.Entry; +import java.io.Serializable; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.hooks.HookContext; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.DDLTask; +import org.apache.hadoop.hive.ql.QueryPlan; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +/** + * Implementation of a pre execute hook that prints out a suggestion for users + * to use TABLESAMPLE when inputs are large. 
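To make the suggestion concrete: one valid form of Hive's sampling syntax is appending TABLESAMPLE(BUCKET 1 OUT OF 32 ON rand()) to the table reference, which samples roughly 1/32 of the rows. The size check that triggers the hint reduces to the following sketch (the 32 GB default mirrors the fbhive.suggest.tablesample.gigabytes setting read below):

public class TableSampleHintSketch {
  // True when the combined input size of the query exceeds the configured
  // threshold and the hint should be printed.
  static boolean shouldSuggestTableSample(long inputSizeBytes, int maxGigaBytes) {
    if (maxGigaBytes < 0) {
      return false;  // invalid configuration, matches the hook's error path
    }
    long maxBytes = maxGigaBytes * 1024L * 1024L * 1024L;
    return inputSizeBytes > maxBytes;
  }
}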
+ */ +public class SuggestionPrintingHook implements ExecuteWithHookContext { + + static final private Log LOG = LogFactory.getLog(SuggestionPrintingHook.class + .getName()); + + static private int timesReported = 0; + + @Override + public void run(HookContext hookContext) throws Exception { + assert (hookContext.getHookType() == HookContext.HookType.PRE_EXEC_HOOK); + SessionState sess = SessionState.get(); + if (sess.getIsSilent()) { + return; + } + SessionState.LogHelper console = new SessionState.LogHelper(LOG); + + QueryPlan queryPlan = hookContext.getQueryPlan(); + ArrayList> rootTasks = queryPlan.getRootTasks(); + + // If it is a pure DDL task, + if (rootTasks == null) { + return; + } + if (rootTasks.size() == 1) { + Task tsk = rootTasks.get(0); + if (tsk instanceof DDLTask) { + return; + } + } + + // do some simple query matching to not to show the suggestion for some + // queries. + String command = SessionState.get().getCmd().toUpperCase().replace('\n', + ' ').replace('\t', ' '); + if ((timesReported > 0 && HookUtils.rollDice(0.9f)) || + !command.contains("SELECT ") || command.contains(" TABLESAMPLE") + || command.contains(" JOIN ") || command.contains(" LIMIT ")) { + return; + } + + Set inputs = hookContext.getInputs(); + Map inputToCS = hookContext + .getInputPathToContentSummary(); + + HiveConf conf = sess.getConf(); + + int maxGigaBytes = conf.getInt("fbhive.suggest.tablesample.gigabytes", 32); + + long maxBytes = maxGigaBytes * 1024 * 1024 * 1024L; + + if (maxGigaBytes < 0) { + console.printError("maxGigaBytes value of " + maxGigaBytes + + "is invalid"); + return; + } + + long inputSize = HookUtils.getInputSize(inputs, inputToCS, conf); + + if (inputSize > maxBytes) { + console.printInfo(""); + console + .printInfo("*** This queries over " + + Math.round(maxBytes / 1024D / 1024D / 1024D) + + " GB data. Consider TABLESAMPLE: fburl.com/?key=2001210"); + console.printInfo(""); + timesReported++; + } + } +} \ No newline at end of file Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/ArchiverHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/ArchiverHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/ArchiverHook.java (working copy) @@ -0,0 +1,185 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; +import org.apache.hadoop.hive.metastore.MetaStoreUtils; +import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.security.UserGroupInformation; + +/** + * Implementation of a pre execute hook that checks whether + * a partition is archived or not and also sets that query + * time for the partition. + */ +public class ArchiverHook implements PreExecute { + + private static final String ARCHIVE_FLAG = "archiveFlag"; + private static final String LAST_QUERY_TIME = "lastQueryTime"; + + static final private Log LOG = LogFactory.getLog("hive.ql.hooks.ArchiverHook"); + + /** + * The metastore client. + */ + private HiveMetaStoreClient ms; + + /** + * The archiver hook constructor. 
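The setLastQueryTime helpers below only rewrite the lastQueryTime parameter when the stored value is missing or more than an hour old, so a busy table does not trigger a metastore alter on every query. In isolation the guard is simply:

public class LastQueryTimeGuardSketch {
  // Sketch of the check performed by setLastQueryTime before altering
  // the table or partition in the metastore.
  static boolean needsLastQueryTimeUpdate(String storedMillis, long nowMillis) {
    if (storedMillis == null) {
      return true;
    }
    long lastMillis = Long.parseLong(storedMillis);
    return nowMillis - lastMillis >= 60L * 60L * 1000L;
  }
}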
+ */ + public ArchiverHook() throws Exception { + ms = new HiveMetaStoreClient(new HiveConf(this.getClass())); + } + + private Map modifyParams(Map old_map, String key, String value) { + Map new_map = old_map; + if (new_map == null) + new_map = new LinkedHashMap(); + new_map.put(key, value); + return new_map; + } + + private boolean setLastQueryTime(Table t) throws Exception { + Map old_map = t.getParameters(); + if (old_map != null) { + String timeStr = old_map.get(LAST_QUERY_TIME); + if (timeStr != null) { + long time = Long.parseLong(timeStr); + long cur_time = System.currentTimeMillis(); + if (cur_time - time < 1*60*60*1000) { + // lastQueryTime was recently set + return false; + } + } + } + t.setParameters(modifyParams(old_map, LAST_QUERY_TIME, Long.toString(System.currentTimeMillis()))); + return true; + } + + private boolean setArchiveFlag(Table t) { + Map old_map = t.getParameters(); + if (old_map != null) { + String archF = old_map.get(ARCHIVE_FLAG); + if (archF != null) { + if(archF.equals("false")) { + return false; + } + } + } + t.setParameters(modifyParams(t.getParameters(), ARCHIVE_FLAG, "false")); + return true; + } + + private boolean setLastQueryTime(Partition p) throws Exception { + Map old_map = p.getParameters(); + if (old_map != null) { + String timeStr = old_map.get(LAST_QUERY_TIME); + if (timeStr != null) { + long time = Long.parseLong(timeStr); + long cur_time = System.currentTimeMillis(); + if (cur_time - time < 1*60*60*1000) { + // lastQueryTime was recently set + return false; + } + } + } + p.setParameters(modifyParams(old_map, LAST_QUERY_TIME, Long.toString(System.currentTimeMillis()))); + return true; + } + + private boolean setArchiveFlag(Partition p) { + Map old_map = p.getParameters(); + if (old_map != null) { + String archF = old_map.get(ARCHIVE_FLAG); + if (archF != null) { + if(archF.equals("false")) { + return false; + } + } + } + p.setParameters(modifyParams(p.getParameters(), ARCHIVE_FLAG, "false")); + return true; + } + + public void run(SessionState sess, Set inputs, + Set outputs, UserGroupInformation ugi) + throws Exception { + + //Go over the input paths and check if they are archived or not + for(ReadEntity re: inputs) { + boolean isArchived = false; + if (re.getParameters() != null) { + String archF = re.getParameters().get(ARCHIVE_FLAG); + if (archF != null) { + isArchived = archF.equals("true"); + } + } + + if (isArchived) + throw new Exception("Path: " + re.getLocation().toString() + " needs to be unarchived."); + + // Set the last query time + ReadEntity.Type typ = re.getType(); + switch(typ) { + case TABLE: + Table t = re.getTable().getTTable(); + if(setLastQueryTime(t)) { + LOG.debug("Setting LastQueryTime for table " + re); + ms.alter_table(MetaStoreUtils.DEFAULT_DATABASE_NAME, t.getTableName(), t); + } + break; + case PARTITION: + Partition p = re.getPartition().getTPartition(); + if (setLastQueryTime(p)) { + LOG.debug("Setting LastQueryTime for partition " + re); + ms.alter_partition(MetaStoreUtils.DEFAULT_DATABASE_NAME, p.getTableName(), p); + } + break; + default: + throw new Exception("Unknown type for input: " + re.toString()); + } + } + + // Go over the write paths and set the archived flag to false + for(WriteEntity we: outputs) { + WriteEntity.Type typ = we.getType(); + boolean q, a; + + switch(typ) { + case TABLE: + Table t = we.getTable().getTTable(); + q = setLastQueryTime(t); + a = setArchiveFlag(t); + if(q || a) { + LOG.debug("Altering dest table for archiver " + we); + ms.alter_table(MetaStoreUtils.DEFAULT_DATABASE_NAME, 
t.getTableName(), t); + } + break; + case PARTITION: + Partition p = we.getPartition().getTPartition(); + q = setLastQueryTime(p); + a = setArchiveFlag(p); + if(q || a) { + if (ms.getPartition(MetaStoreUtils.DEFAULT_DATABASE_NAME, p.getTableName(), p.getValues()) != null) { + LOG.debug("Altering dest partition for archiver " + we); + ms.alter_partition(MetaStoreUtils.DEFAULT_DATABASE_NAME, p.getTableName(), p); + } + } + break; + case DFS_DIR: + case LOCAL_DIR: + break; + default: + throw new Exception("Unknown type for output: " + we.toString()); + } + } + } + +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/PyRulesHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/PyRulesHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/PyRulesHook.java (working copy) @@ -0,0 +1,92 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.io.Reader; +import java.io.File; +import java.io.FileReader; + +import javax.script.ScriptEngine; +import javax.script.ScriptEngineManager; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; + +/** + * This hook executes python code to update the configuration in + * the ContextHook using java scripting abstraction. + * + * fbhive.pyrules.property has the path of the python file that is to + * be executed + * Python code has to define a method updateConf that accepts hookContext + * as a parameter + * + * Python code has to also provide a revertConf method that accepts hookContext + * and the old configuration object and reverts the changes made to the + * configuration in the updateConf + */ +public class PyRulesHook implements ExecuteWithHookContext { + + static final private Log LOG = LogFactory.getLog(PyRulesHook.class); + static private HiveConf savedConf = null; + @Override + + public void run(HookContext hookContext) throws Exception { + HiveConf conf = hookContext.getConf(); + PyRulesHook.savedConf = new HiveConf(conf); + ScriptEngine pythonMod = getPythonModifier(hookContext); + if (pythonMod == null) { + return; + } + conf.setBoolean("fbhive.pyrules.modified", true); + try { + pythonMod.put("hookContext", hookContext); + pythonMod.eval("updateConf(hookContext)"); + } catch (Exception ex) { + LOG.error("Error updating the conf", ex); + } + } + + private static ScriptEngine getPythonModifier(HookContext hookContext) + throws Exception { + String pyFilePath = hookContext.getConf().get("fbhive.pyrules.file"); + if (pyFilePath == null) + return null; + + File pyFile = new File(pyFilePath); + if (!pyFile.exists()) { + LOG.warn("The python conf file " + pyFile + " does not exist"); + return null; + } + + Reader reader = new FileReader(pyFile); + try { + ScriptEngine eng = new ScriptEngineManager().getEngineByName("python"); + if (eng == null) { + LOG.warn("Could not initialize jython engine"); + return null; + } + eng.eval(reader); + + return eng; + } catch (Exception ex) { + LOG.warn("Error updating the conf using python hook", ex); + return null; + } + } + public static class CleanupHook implements ExecuteWithHookContext { + public void run(HookContext hookContext) throws Exception { + if (!hookContext.getConf().getBoolean("fbhive.pyrules.modified", false)) { + return; + } else { + try { + ScriptEngine pythonRevert = getPythonModifier(hookContext); + pythonRevert.put("hookContext", hookContext); + pythonRevert.put("oldConf", PyRulesHook.savedConf); + 
pythonRevert.eval("revertConf(hookContext, oldConf)"); + } catch (Exception ex) { + LOG.error("Error reverting config", ex); + } + } + } + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/LineageHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/LineageHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/LineageHook.java (working copy) @@ -0,0 +1,168 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo; +import org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency; +import org.apache.hadoop.hive.ql.hooks.LineageInfo.DependencyKey; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.security.UserGroupInformation; +import org.json.JSONArray; +import org.json.JSONObject; + +/** + * Implementation of a post execute hook that simply prints out its parameters + * to standard output. + */ +public class LineageHook implements PostExecute { + + ConnectionUrlFactory urlFactory = null; + public LineageHook() throws Exception { + HiveConf conf = new HiveConf(LineageHook.class); + urlFactory = HookUtils.getUrlFactory(conf, + FBHiveConf.CONNECTION_FACTORY, + FBHiveConf.LINEAGE_CONNECTION_FACTORY, + FBHiveConf.LINEAGE_MYSQL_TIER_VAR_NAME, + FBHiveConf.LINEAGE_HOST_DATABASE_VAR_NAME); + } + + @Override + public void run(SessionState sess, Set inputs, + Set outputs, LineageInfo linfo, + UserGroupInformation ugi) throws Exception { + + HiveConf conf = sess.getConf(); + + if (linfo != null) { + + Iterator> iter = linfo.entrySet().iterator(); + while(iter.hasNext()) { + Map.Entry it = iter.next(); + Dependency dep = it.getValue(); + DependencyKey depK = it.getKey(); + + /** + * Generate json values of the following format: + * + * {"value": { + * "type":"SIMPLE", + * "baseCols":[{ + * "column":{ + * "name":"col", + * "comment":"from serde", + * "type":"array" + * }, + * "tabAlias":{ + * "alias":"athusoo_tmp", + * "table":{ + * "dbName":"default", + * "tableName":"athusoo_tmp" + * } + * } + * }] + * }, + * "key":{ + * "fieldSchema":{ + * "name":"col", + * "comment":"from deserializer", + * "type":"array" + * }, + * "dataContainer":{ + * "isPartition":false, + * "table":{ + * "dbName":"default", + * "tableName":"athusoo_tmp2" + * } + * } + * } + *} + */ + JSONObject out_json = new JSONObject(); + JSONObject depk_json = new JSONObject(); + JSONObject field_json = new JSONObject(); + + field_json.put("name", depK.getFieldSchema().getName()); + field_json.put("type", depK.getFieldSchema().getType()); + field_json.put("comment", depK.getFieldSchema().getComment()); + depk_json.put("fieldSchema", field_json); + + JSONObject dc_json = new JSONObject(); + dc_json.put("isPartition", depK.getDataContainer().isPartition()); + JSONObject tab_json = new JSONObject(); + if (depK.getDataContainer().isPartition()) { + JSONObject part_json = new JSONObject(); + Partition part = depK.getDataContainer().getPartition(); + part_json.put("values", part.getValues()); + + tab_json.put("tableName", depK.getDataContainer().getTable().getTableName()); + 
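+ // For a partition, the dataContainer JSON nests the owning table (its name,
+ // database and partition key schema) under "partition", alongside the
+ // partition's values, instead of directly under "table".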
tab_json.put("dbName", depK.getDataContainer().getTable().getDbName()); + JSONArray fs_array = new JSONArray(); + for (FieldSchema fs : depK.getDataContainer().getTable().getPartitionKeys()) { + field_json = new JSONObject(); + field_json.put("name", fs.getName()); + field_json.put("type", fs.getType()); + field_json.put("comment", fs.getComment()); + + fs_array.put(field_json); + } + tab_json.put("partitionKeys", fs_array); + part_json.put("table", tab_json); + dc_json.put("partition", part_json); + } + else { + tab_json.put("tableName", depK.getDataContainer().getTable().getTableName()); + tab_json.put("dbName", depK.getDataContainer().getTable().getDbName()); + dc_json.put("table", tab_json); + } + depk_json.put("dataContainer", dc_json); + out_json.put("key", depk_json); + + JSONObject dep_json = new JSONObject(); + dep_json.put("type", dep.getType().toString()); + dep_json.put("expr", dep.getExpr()); + JSONArray basecol_array = new JSONArray(); + for(BaseColumnInfo col: dep.getBaseCols()) { + JSONObject col_json = new JSONObject(); + + field_json = new JSONObject(); + // A column can be null in the case of aggregations like count(1) + // where the value is dependent on the entire row. + if (col.getColumn() != null) { + field_json.put("name", col.getColumn().getName()); + field_json.put("type", col.getColumn().getType()); + field_json.put("comment", col.getColumn().getComment()); + } + col_json.put("column", field_json); + + JSONObject tabAlias_json = new JSONObject(); + tabAlias_json.put("alias", col.getTabAlias().getAlias()); + + tab_json = new JSONObject(); + tab_json.put("tableName", col.getTabAlias().getTable().getTableName()); + tab_json.put("dbName", col.getTabAlias().getTable().getDbName()); + + tabAlias_json.put("table", tab_json); + col_json.put("tabAlias", tabAlias_json); + basecol_array.put(col_json); + } + dep_json.put("baseCols", basecol_array); + out_json.put("value", dep_json); + + ArrayList sqlParams = new ArrayList(); + sqlParams.add(StringEscapeUtils.escapeJava(out_json.toString())); + String sql = "insert into lineage_log set info = ?"; + + HookUtils.runInsert(conf, urlFactory, sql, sqlParams, HookUtils + .getSqlNumRetry(conf)); + } + } + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/BaseReplicationHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/BaseReplicationHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/BaseReplicationHook.java (working copy) @@ -0,0 +1,62 @@ +/** + * + */ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.ArrayList; + +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; + +/** + * Seperating out some functionality for so that Hive1 can share code. 
+ */ +public class BaseReplicationHook { + static final private Log LOG = LogFactory.getLog("hive.ql.hooks.BaseReplicationHook"); + + protected ConnectionUrlFactory urlFactory = null; + HiveConf conf = null; + + public static ConnectionUrlFactory getReplicationMySqlUrl() { + HiveConf conf = new HiveConf(BaseReplicationHook.class); + return HookUtils.getUrlFactory(conf, + FBHiveConf.CONNECTION_FACTORY, + FBHiveConf.REPLICATION_CONNECTION_FACTORY, + FBHiveConf.REPLICATION_MYSQL_TIER_VAR_NAME, + FBHiveConf.REPLICATION_HOST_DATABASE_VAR_NAME); + } + + public BaseReplicationHook() throws Exception { + urlFactory = getReplicationMySqlUrl(); + conf = new HiveConf(BaseReplicationHook.class); + } + + /** + * Simplified call used by hive1 to insert into the audit log + * + * @param command + * @param commandType + * @param inputs + * @param outputs + * @param userInfo + * @throws Exception + */ + public void run(String command, String commandType, String inputs, + String outputs, String userInfo) throws Exception { + ArrayList sqlParams = new ArrayList(); + sqlParams.add(StringEscapeUtils.escapeJava(commandType)); + sqlParams.add(StringEscapeUtils.escapeJava(inputs)); + sqlParams.add(outputs); + sqlParams.add(StringEscapeUtils.escapeJava(userInfo)); + + String sql = "insert into snc1_command_log set command_type = ?, " + + "inputs = ?, outputs = ?, user_info = ?"; + if (conf == null) { + conf = new HiveConf(BaseReplicationHook.class); + } + HookUtils.runInsert(conf, urlFactory, sql, sqlParams); + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/SMCStatsDBHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/SMCStatsDBHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/SMCStatsDBHook.java (working copy) @@ -0,0 +1,48 @@ +package org.apache.hadoop.hive.ql.hooks; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.AbstractSemanticAnalyzerHook; +import org.apache.hadoop.hive.ql.parse.HiveSemanticAnalyzerHookContext; + +public class SMCStatsDBHook extends AbstractSemanticAnalyzerHook { + + @Override + public ASTNode preAnalyze(HiveSemanticAnalyzerHookContext context, ASTNode ast) { + HiveConf conf; + try { + conf = (HiveConf) context.getConf(); + } catch (ClassCastException e) { + // Statistics won't be collected for this query, + // warning about it will be supplied later, by JDBCStatsPublisher + return ast; + } + ConnectionUrlFactory urlFactory = + HookUtils.getUrlFactory(conf, + FBHiveConf.CONNECTION_FACTORY, + FBHiveConf.STATS_CONNECTION_FACTORY, + FBHiveConf.STATS_MYSQL_TIER_VAR_NAME, + FBHiveConf.STATS_HOST_DATABASE_VAR_NAME); + String databaseHostName; + + try { + databaseHostName = urlFactory.getUrl(); + } catch (Exception e) { + // Statistics won't be collected for this query, + // warning about it will be supplied later, by JDBCStatsPublisher + return ast; + } + + conf.setVar( + HiveConf.ConfVars.HIVESTATSDBCONNECTIONSTRING, + getUpdatedConnectionString(conf.getVar(HiveConf.ConfVars.HIVESTATSDBCONNECTIONSTRING), + databaseHostName)); + return ast; + } + + // default visibility for the sake of TestSMCStatsDBHook + String getUpdatedConnectionString(String initialConnectionString, String addressFromSMC) { + return initialConnectionString.replaceAll("jdbc.*\\?", addressFromSMC); + } +} Index: 
contrib/src/java/org/apache/hadoop/hive/ql/hooks/SmcConfigHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/SmcConfigHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/SmcConfigHook.java (working copy) @@ -0,0 +1,67 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.json.JSONObject; +/** + * Retrieves and sets Hive config key/values based on a config stored in the + * properties of an SMC tier. This is useful for quick changes to the config + * that should apply to a particular package of Hive. (e.g. silver.trunk). The + * advantage over a XML file edit is that it's persistent between hotfixes and + * we have a history of what changes were made. But since this is a hook that + * runs after query compilation, it is limited in what values it can effectively + * change. + * + * The configs are supposed to be stored in the properties of an SMC tier. The + * name of the property corresponds to the hive package. The value of the + * property is a JSON object that holds 1) an enabled field that controls + * whether the key-value pairs should be applied 2) a config field that holds + * the actual key-value pairs. + * + * (Property)hivePackageName -> {enabled : boolean, + * configs : {key1 : value1, + * key2 : value2.. + * } + * } + * + * The primary application of this hook is to modify the behavior of the + * jobtracker hook. For the configs to apply to the hook, it must be listed + * before the jobtracker hook in hive.exec.pre.hooks + */ +public class SmcConfigHook extends AbstractSmcConfigHook implements ExecuteWithHookContext { + + static final private Log LOG = LogFactory.getLog(SmcConfigHook.class); + + @Override + public void run(HookContext hookContext) throws Exception { + HiveConf conf = hookContext.getConf(); + + if (!isEnabled(conf)) { + return; + } + + Object configObj = getConfigObject(conf); + + if (configObj == null || !(configObj instanceof JSONObject) ) { + LOG.error("config not properly set!"); + return; + } + + // Sanity checks pass, apply all the configs. + JSONObject configJson = (JSONObject) configObj; + @SuppressWarnings("unchecked") + Iterator i = (Iterator) configJson.keys(); + while(i.hasNext()) { + String key = i.next(); + Object valueObj = configJson.get(key); + String value = valueObj.toString(); + + conf.set(key, value); + LOG.debug("Setting " + key + " to " + value); + } + } + +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/SmcConfigDriverRunHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/SmcConfigDriverRunHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/SmcConfigDriverRunHook.java (working copy) @@ -0,0 +1,145 @@ +package org.apache.hadoop.hive.ql.hooks; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.HiveDriverRunHook; +import org.apache.hadoop.hive.ql.HiveDriverRunHookContext; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.json.JSONArray; +import org.json.JSONObject; + +/** + * Retrieves and sets Hive config key/values based on a config stored in the + * properties of an SMC tier. 
This is useful for quick changes to the config + * that should apply to a particular package of Hive. (e.g. silver.trunk). The + * advantage over a XML file edit is that it's persistent between hotfixes and + * we have a history of what changes were made. But since this is a hook that + * runs at the very beginning of the Driver.run method, before compilation, + * it should be able to effectively change most values that affect query + * processing and execution. + * + * The configs are supposed to be stored in the properties of an SMC tier. The + * name of the property corresponds to the hive package. The value of the + * property is a JSON object that holds 1) an enabled field that controls + * whether the key-value pairs should be applied 2) a config field that holds + * an array of Objects + * + * (Property)hivePackageName -> {enabled : boolean, + * configs : [ + * {key : key1, + * value : value1, + * percentage : 50, + * enforce : true + * }, + * {key : key2, + * value : value2 + * }, ... + * ] + * } + * + * The key is the config variables key, value is the config variables value, + * percentage is optional, if set, the change will only be applied to + * approximately that percentage of queries, and enforce is also optional, if + * true, even if the user explicitely set this config variable, it will be + * overwritten. + * + * The primary application of this hook is to modify the behavior of Hive clients dynamically, + * without a push, and for incremental rollouts of config changes. E.g. if a feature is broken and + * can be turned off using a config variable, this hook can be used to turn it off without rolling + * back the push. Also, if there is a change and we are not sure how it will perform at scale and + * it can be controlled via a config, we can turn it on for increasing percentages of users using + * this hook. + */ +public class SmcConfigDriverRunHook extends AbstractSmcConfigHook implements HiveDriverRunHook { + + static final private Log LOG = LogFactory.getLog(SmcConfigDriverRunHook.class); + static final private String KEY_FIELD = "key"; + static final private String VALUE_FIELD = "value"; + static final private String PERCENTAGE_FIELD = "percentage"; + static final private String ENFORCE_FIELD = "enforce"; + + @Override + public void preDriverRun(HiveDriverRunHookContext hookContext) throws Exception { + HiveConf conf = (HiveConf) hookContext.getConf(); + if (!isEnabled(conf)) { + return; + } + + Object configObj = getConfigObject(conf); + + if (configObj == null || !(configObj instanceof JSONArray) ) { + LOG.error("config not properly set!"); + return; + } + + // Sanity checks pass, apply all the configs. 
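+ // Each entry below is handled independently: the key/value pair is read, the
+ // entry is skipped when enforce is false (the default) and the user explicitly
+ // overrode that key in this session, and when a percentage below 100 is given
+ // the change is applied to only roughly that share of queries via
+ // HookUtils.rollDice.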
+ JSONArray configEntries = (JSONArray) configObj; + for (int i = 0; i < configEntries.length(); i++) { + JSONObject configEntry = configEntries.getJSONObject(i); + Object percentage = new Integer(100); + Object enforce = new Boolean(false); + + // Get the config key and value + String key = configEntry.getString(KEY_FIELD); + String value = configEntry.get(VALUE_FIELD).toString(); + + LOG.debug("SmcConfigHook found configuration KEY: " + key + " VALUE: " + value); + + // If enforce is set to true, even if the user has set the value of this config variable + // explicitely, we will overwrite it + if (configEntry.has(ENFORCE_FIELD)) { + enforce = configEntry.get(ENFORCE_FIELD); + } + + LOG.debug("Enforce for key " + key + " is " + enforce.toString()); + + if (!(enforce instanceof Boolean)) { + LOG.error("enforce is not properly set for " + key); + continue; + } + + if (!(Boolean)enforce && SessionState.get() != null && + SessionState.get().getOverriddenConfigurations().containsKey(key)) { + continue; + } + + // If the percentage field is set to some number n, the configuration change will be made + // to approximately n% of queries + if (configEntry.has(PERCENTAGE_FIELD)) { + percentage = configEntry.getInt(PERCENTAGE_FIELD); + } + + LOG.debug("Percentage for key " + key + " is " + percentage.toString()); + + if (!(percentage instanceof Integer)) { + LOG.error("percentage is not properly set for " + key); + continue; + } + + if ((Integer)percentage != 100) { + boolean diceRoll = false; + + try { + diceRoll = HookUtils.rollDice(((Integer)percentage).intValue()/100f); + } catch (Exception e) { + LOG.error("percentage is not properly set for " + key); + LOG.error(e.getMessage()); + } + + if (!diceRoll) { + continue; + } + } + + conf.set(key, value); + } + } + + @Override + public void postDriverRun(HiveDriverRunHookContext hookContext) + throws Exception { + // Do nothing + } + +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/Triple.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/Triple.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/Triple.java (working copy) @@ -0,0 +1,128 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.hooks; + +/** + * A generic class for triples. + * @param + * @param + * @param + */ +public class Triple +{ + protected T1 first = null; + protected T2 second = null; + protected T3 third = null; + + /** + * Default constructor. 
+ */ + public Triple() + { + } + + /** + * Constructor + * @param a operand + * @param b operand + * @param c operand + */ + public Triple(T1 a, T2 b, T3 c) + { + this.first = a; + this.second = b; + this.third = c; + } + + /** + * Replace the first element of the triple. + * @param a operand + */ + public void setFirst(T1 a) + { + this.first = a; + } + + /** + * Replace the second element of the triple. + * @param b operand + */ + public void setSecond(T2 b) + { + this.second = b; + } + + /** + * Replace the third element of the triple. + * @param c operand + */ + public void setThird(T3 c) + { + this.third = c; + } + + /** + * Return the first element stored in the triple. + * @return T1 + */ + public T1 getFirst() + { + return first; + } + + /** + * Return the second element stored in the triple. + * @return T2 + */ + public T2 getSecond() + { + return second; + } + + /** + * Return the third element stored in the triple. + * @return T3 + */ + public T3 getThird() + { + return third; + } + + private boolean equals(Object x, Object y) + { + return (x == null && y == null) || (x != null && x.equals(y)); + } + + @Override + @SuppressWarnings("unchecked") + public boolean equals(Object other) + { + return other instanceof Triple && + equals(first, ((Triple)other).first) && + equals(second, ((Triple)other).second) && + equals(third, ((Triple)other).third); + } + + @Override + public String toString() + { + return "{" + getFirst() + "," + getSecond() + "," + getThird() + "}"; + } +} + Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/JobTrackerHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/JobTrackerHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/JobTrackerHook.java (working copy) @@ -0,0 +1,464 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.QueryPlan; +import org.apache.hadoop.hive.ql.exec.DDLTask; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.hooks.HookUtils.InputInfo; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.thrift.TException; +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; + +/** + * Implementation of a pre execute hook that decides what + * cluster to send a given query to based on the size of + * query inputs + * + * TODO: this needs to be optimized once HIVE-1507 is in + * place to reuse the patch->summary cache maintained in hive + * + * TODO: this encodes hadoop cluster info in code. very + * undesirable. Need to figure this out better (SMC?) + */ +public class JobTrackerHook { + + static final private Log LOG = LogFactory.getLog(JobTrackerHook.class.getName()); + + // store the prior location of the hadoop executable. 
switching this doesn't + // matter unless we are using the 'submit via child' feature + private static String preHadoopBin = null; + + private static String preJobTracker = null; + + private static Map savedValues = null; + + public static class PreExec implements ExecuteWithHookContext { + + private final String dislike = "Not choosing Bronze/Corona because "; + + static final private String POOLS = "pools"; + + /** + * If the job is on an SLA pool, do not redirect this job. + * + * @return True if the pool matches an SLA pool, false otherwise + */ + private boolean isOnSlaPool(HiveConf conf) { + String pool = conf.get("mapred.fairscheduler.pool"); + + // Nothing to be done if pool is not specified + if ((pool == null) || (pool.isEmpty())) { + return false; + } + + // Make sure that SLA jobs are not redirected + String[] slaPoolArray = + conf.getStrings("mapred.jobtracker.hook.sla.pools"); + if ((slaPoolArray == null) || (slaPoolArray.length == 0)) { + slaPoolArray = new String[]{"rootsla", "incrementalscraping"}; + } + for (int i = 0; i < slaPoolArray.length; ++i) { + if (slaPoolArray[i].equals(pool)) { + LOG.debug("Pool " + pool + " is on an sla pool"); + return true; + } + } + + LOG.debug("Pool " + pool + " is not on an sla pool"); + return false; + } + + /* + * The user has specified a mapping table in hive.configs, which is + * essentially of the form: pool -> + * Since, cluster will be repeated a lot in these scenarios, the exact + * mapping is: cluster -> + * Going forward, multiple clusters will be used in these mappings, once + * silver is broken into silver and silver2. No code changes will be + * required, only configuration change. + * @return Whether to use the cluster from the smc + */ + private boolean useClusterFromSmcConfig(HiveConf conf) { + try { + String pool = conf.get("mapred.fairscheduler.pool"); + + // Nothing to be done if pool is not specified + if ((pool == null) || (pool.isEmpty())) { + return false; + } + + ConnectionUrlFactory connectionUrlFactory = + HookUtils.getUrlFactory(conf, + FBHiveConf.CONNECTION_FACTORY, + FBHiveConf.JOBTRACKER_CONNECTION_FACTORY, + FBHiveConf.JOBTRACKER_MYSQL_TIER_VAR_NAME, + FBHiveConf.JOBTRACKER_HOST_DATABASE_VAR_NAME); + + if (connectionUrlFactory == null) { + return false; + } + + String s = connectionUrlFactory.getValue(conf.get(FBHiveConf.HIVE_CONFIG_TIER), POOLS); + if (s == null) { + return false; + } + + JSONObject poolsJSON = new JSONObject(s); + + Iterator i = (Iterator) poolsJSON.keys(); + while(i.hasNext()) { + + String clusterName = i.next(); + JSONObject jo = (JSONObject)poolsJSON.get(clusterName); + + String hadoopHome = null; + String jobTracker = null; + JSONArray poolsObj = null; + + boolean isCorona = false; + if (jo.has("isCorona")) { + isCorona = jo.getBoolean("isCorona"); + } + + if (!jo.has("hadoopHome") || !jo.has("pools")) { + LOG.error("hadoopHome and pools need to be specified for " + + clusterName); + return false; + } else { + hadoopHome = jo.getString("hadoopHome"); + poolsObj = (JSONArray)jo.get("pools"); + } + if (!isCorona && !jo.has("jobTracker")) { + LOG.error( + "jobTracker needs to be specified for non-corona cluster " + + clusterName); + return false; + } else { + if (jo.has("jobTracker")) { + jobTracker = jo.getString("jobTracker"); + } + } + + // Do the pool match + for (int idx = 0; idx < poolsObj.length(); idx++) { + if (pool.equals(poolsObj.getString(idx))) { + + LOG.info ("Run it on " + clusterName + " due to pool " + pool); + + if (isCorona) { + // Parameters are taken from 
configuration. + runCorona(conf, hadoopHome); + } else { + // Run it on "clusterName" + preHadoopBin = conf.getVar(HiveConf.ConfVars.HADOOPBIN); + conf.setVar(HiveConf.ConfVars.HADOOPBIN, + hadoopHome + "/bin/hadoop"); + preJobTracker = conf.getVar(HiveConf.ConfVars.HADOOPJT); + conf.setVar(HiveConf.ConfVars.HADOOPJT, jobTracker); + } + + return true; + } + } + } + + // Found nothing + return false; + } catch (TException e) { + return false; + } catch (JSONException e) { + return false; + } catch (Exception e) { + return false; + } + } + + @Override + public void run(HookContext hookContext) throws Exception { + assert(hookContext.getHookType() == HookContext.HookType.PRE_EXEC_HOOK); + SessionState sess = SessionState.get(); + Set inputs = hookContext.getInputs(); + Set outputs = hookContext.getOutputs(); + UserGroupInformation ugi = hookContext.getUgi(); + Map inputToCS = hookContext.getInputPathToContentSummary(); + + QueryPlan queryPlan = hookContext.getQueryPlan(); + List> rootTasks = queryPlan.getRootTasks(); + + // If it is a pure DDL task, + if (rootTasks == null) { + return; + } + if (rootTasks.size() == 1) { + Task tsk = rootTasks.get(0); + if (tsk instanceof DDLTask) { + return; + } + } + + HiveConf conf = sess.getConf(); + + // In case posthook of the previous query was not triggered, + // we revert job tracker to clean state first. + if (preHadoopBin != null) { + conf.setVar(HiveConf.ConfVars.HADOOPBIN, preHadoopBin); + preHadoopBin = null; + } + + if (preJobTracker != null) { + conf.setVar(HiveConf.ConfVars.HADOOPJT, preJobTracker); + preJobTracker = null; + } + + // A map from a path to the highest percentage that it is sampled by a + // map reduce task. If any map reduce task which uses this path does not + // sample, this percentage is 100. + Map pathToTopPercentage = new HashMap(); + // A set of inputs we know were not sampled for some task, so we should + // ignore any entries for them in pathToTopPercentage + Set nonSampledInputs = new HashSet(); + boolean isThereSampling = false; + if (!hookContext.getQueryPlan().getQueryStr().toUpperCase(). + contains(" JOIN ")) { + isThereSampling = HookUtils.checkForSamplingTasks( + hookContext.getQueryPlan().getRootTasks(), + pathToTopPercentage, + nonSampledInputs); + } + + // if we are set on local mode execution (via user or auto) bail + if ("local".equals(conf.getVar(HiveConf.ConfVars.HADOOPJT))) { + return; + } + + // The smc hive.configs contains a mapping of pools to the map-reduce + // cluster. If the user has specified a pool, and the pool belongs to one + // of the clusters for the smc, use that cluster + if (useClusterFromSmcConfig(conf)) { + return; + } + + // If this is an SLA pool, bail + if (isOnSlaPool(conf)) { + return; + } + + // check if we need to run at all + if (! 
"true".equals(conf.get("fbhive.jobtracker.auto", ""))) { + return; + } + + int bronzePercentage = conf.getInt("fbhive.jobtracker.bronze.percentage", + 0); + boolean isCoronaEnabled = conf.getBoolean("fbhive.jobtracker.corona.enabled", false); + int coronaPercentage = 0; + if (isCoronaEnabled) { + coronaPercentage = conf.getInt("fbhive.jobtracker.corona.percentage", + 0); + } + + int percents [] = {bronzePercentage, coronaPercentage}; + int roll = rollDice(percents); + LOG.debug("Dice roll is " + roll); + boolean tryBronze = false; + boolean tryCorona = false; + + if (roll == -1) { + // Don't run bronze/corona + LOG.info(dislike + "because the coin toss said so"); + return; + } else if (roll == 0) { + tryBronze = true; + } else if (roll == 1) { + tryCorona = true; + } else { + throw new RuntimeException("Invalid roll! Roll was " + roll); + } + + int maxGigaBytes = conf.getInt("fbhive.jobtracker.bronze.maxGigaBytes", 0); + if (maxGigaBytes == 0) { + LOG.info (dislike + "maxGigaBytes = 0"); + return; + } + long maxBytes = maxGigaBytes * 1024L * 1024 * 1024; + + if (maxGigaBytes < 0) { + LOG.warn (dislike + "maxGigaBytes value of " + maxGigaBytes + "is invalid"); + return; + } + + String bronzeHadoopHome = conf.get("fbhive.jobtracker.bronze.hadoopHome", + "/mnt/vol/hive/sites/bronze/hadoop"); + + String bronzeJobTracker = conf.get("fbhive.jobtracker.bronze.tracker", + conf.get(FBHiveConf.FBHIVE_BRONZE_JOBTRACKER)); + + // assuming we are using combinehiveinputformat - we know the # of splits will _at least_ + // be >= number of partitions/tables. by indicating the max input size - the + // admin is also signalling the max # of splits (maxGig*1000/256MB). So we limit the number + // of partitions to the max # of splits. + + int maxSplits = conf.getInt("fbhive.jobtracker.bronze.maxPartitions", maxGigaBytes * 4); + + if (!isThereSampling && inputs.size() > maxSplits) { + LOG.info (dislike + "number of input tables/partitions: " + inputs.size() + + " exceeded max splits: " + maxSplits); + return; + } + + if (conf.getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS) > maxSplits) { + LOG.info (dislike + "number of reducers: " + conf.getVar(HiveConf.ConfVars.HADOOPNUMREDUCERS) + + " exceeded max reducers: " + maxSplits); + return; + } + + InputInfo info = HookUtils.getInputInfo(inputs, inputToCS, conf, + isThereSampling, pathToTopPercentage, nonSampledInputs, maxSplits, + maxBytes); + + if (info.getEstimatedNumSplits() > maxSplits) { + LOG.info (dislike + "the estimated number of input " + + "tables/partitions exceeded max splits: " + maxSplits); + return; + } + + if (info.getSize() > maxBytes) { + LOG.info (dislike + "input length of " + info.getSize() + + " is more than " + maxBytes); + return; + } + + // we have met all the conditions to switch to bronze/corona cluster + + if (tryBronze) { + // Run it on Bronze + preHadoopBin = conf.getVar(HiveConf.ConfVars.HADOOPBIN); + conf.setVar(HiveConf.ConfVars.HADOOPBIN, bronzeHadoopHome + + "/bin/hadoop"); + preJobTracker = conf.getVar(HiveConf.ConfVars.HADOOPJT); + conf.setVar(HiveConf.ConfVars.HADOOPJT, bronzeJobTracker); + } else if (tryCorona){ + String coronaHadoopHome = conf.get( + "fbhive.jobtracker.corona.hadoopHome", + "/mnt/vol/hive/sites/corona/hadoop"); + runCorona(conf, coronaHadoopHome); + } + } + + private void runCorona(HiveConf conf, String hadoopHome) { + // Run it on Corona + preHadoopBin = conf.getVar(HiveConf.ConfVars.HADOOPBIN); + conf.setVar(HiveConf.ConfVars.HADOOPBIN, hadoopHome + "/bin/hadoop"); + // No need to set the JT as it's done 
through the conf + Configuration coronaConf = new Configuration(false); + // Read the configuration, save old values, replace with new ones + coronaConf.addResource("mapred-site-corona.xml"); + savedValues = new HashMap(); + for (Entry e : coronaConf) { + String key = e.getKey(); + String value = e.getValue(); + LOG.debug("Saving " + key + "(" + conf.get(key) + ")"); + savedValues.put(key, conf.get(key)); + LOG.debug("Setting " + key + "(" + key + ")"); + conf.set(key, value); + } + } + } + + /** + * Randomly picks an index with chance that is indicated by the value in + * percentages. Returns -1 for the remaining percentage + * + * E.g. [60, 20] will return 0 (60% of the time) and 1 (20% of the time) and + * -1 (20% of the time) + * @param percentages + * @return + */ + private static int rollDice(int [] percentages) { + + Random randGen = new Random(); + int randVal = randGen.nextInt(100) + 1; + + // Make sure that percentages add up to <= 100% + int sum = 0; + for (int i=0; i < percentages.length; i++) { + sum += percentages[i]; + if (percentages[i] < 0) { + throw new RuntimeException("Percentages must be >=0. Got " + + percentages[i]); + } + } + if (sum > 100) { + throw new RuntimeException("Percentages add up to > 100!"); + } + + for (int i=0; i < percentages.length; i++) { + if (randVal <= percentages[i]) { + return i; + } + randVal = randVal - percentages[i]; + } + return -1; + } + + public static class PostExec implements ExecuteWithHookContext { + @Override + public void run(HookContext hookContext) throws Exception { + assert(hookContext.getHookType() == HookContext.HookType.POST_EXEC_HOOK); + SessionState ss = SessionState.get(); + Set inputs = hookContext.getInputs(); + Set outputs = hookContext.getOutputs(); + LineageInfo linfo = hookContext.getLinfo(); + UserGroupInformation ugi = hookContext.getUgi(); + this.run(ss,inputs,outputs,linfo,ugi); + } + + public void run(SessionState sess, Set inputs, + Set outputs, LineageInfo lInfo, + UserGroupInformation ugi) throws Exception { + HiveConf conf = sess.getConf(); + + if (preHadoopBin != null) { + conf.setVar(HiveConf.ConfVars.HADOOPBIN, preHadoopBin); + preHadoopBin = null; + } + + if (preJobTracker != null) { + conf.setVar(HiveConf.ConfVars.HADOOPJT, preJobTracker); + preJobTracker = null; + } + + // Restore values set for Corona + if (savedValues != null) { + for (Entry e : savedValues.entrySet()) { + String key = e.getKey(); + String value = e.getValue(); + LOG.debug("Restoring " + key + "(" + value + ")"); + if (value != null) { + conf.set(key, value); + } else { + conf.set(key, ""); + } + } + } + } + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/AuditJoinHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/AuditJoinHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/AuditJoinHook.java (working copy) @@ -0,0 +1,111 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.QueryPlan; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.TaskRunner; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.session.SessionState; + +/** + * Implementation of a post execute hook that checks 
whether a partition is + * archived or not and also sets that query time for the partition. + */ +public class AuditJoinHook implements ExecuteWithHookContext { + static final private Log LOG = LogFactory + .getLog("hive.ql.hooks.AuditJoinHook"); + + ConnectionUrlFactory urlFactory = null; + + public AuditJoinHook() throws Exception { + HiveConf conf = new HiveConf(AuditJoinHook.class); + urlFactory = HookUtils.getUrlFactory(conf, + FBHiveConf.CONNECTION_FACTORY, + FBHiveConf.AUDIT_CONNECTION_FACTORY, + FBHiveConf.AUDIT_MYSQL_TIER_VAR_NAME, + FBHiveConf.AUDIT_HOST_DATABASE_VAR_NAME); + } + + public void run(HookContext hookContext) throws Exception { + HiveConf conf = hookContext.getConf(); + boolean enableConvert = HiveConf.getBoolVar(conf, + HiveConf.ConfVars.HIVECONVERTJOIN); + if (!enableConvert) { + return; + } + String command = StringEscapeUtils.escapeJava(SessionState.get() + .getCmd()); + QueryPlan plan = hookContext.getQueryPlan(); + String queryID = StringEscapeUtils.escapeJava(plan.getQueryId()); + // String query = SessionState.get().getCmd(); + + int convertedMapJoin = 0; + int commonJoin = 0; + int backupCommonJoin = 0; + int convertedLocalMapJoin = 0; + int localMapJoin = 0; + + List list = hookContext.getCompleteTaskList(); + for (TaskRunner tskRunner : list) { + Task tsk = tskRunner.getTask(); + int tag = tsk.getTaskTag(); + switch (tag) { + case Task.COMMON_JOIN: + commonJoin++; + break; + case Task.CONVERTED_LOCAL_MAPJOIN: + convertedLocalMapJoin++; + break; + case Task.CONVERTED_MAPJOIN: + convertedMapJoin++; + break; + case Task.BACKUP_COMMON_JOIN: + backupCommonJoin++; + break; + case Task.LOCAL_MAPJOIN: + localMapJoin++; + break; + } + } + + // nothing to do + if ((convertedMapJoin == 0) && (commonJoin == 0) && (backupCommonJoin == 0) && (convertedLocalMapJoin == 0) + && (localMapJoin == 0)) { + return; + } + + ArrayList sqlParams = new ArrayList(); + sqlParams.add(StringEscapeUtils.escapeJava(command)); + sqlParams.add(StringEscapeUtils.escapeJava(queryID)); + sqlParams.add(new Integer(convertedLocalMapJoin)); + sqlParams.add(new Integer(convertedMapJoin)); + sqlParams.add(new Integer(localMapJoin)); + sqlParams.add(new Integer(commonJoin)); + sqlParams.add(new Integer(backupCommonJoin)); + + String sql = "insert into audit_join set command = ?, query_id = ?, converted_local_mapjoin = ?, converted_map_join = ?," + + " local_mapjoin = ?, common_join = ?, backup_common_join = ?"; + + if (urlFactory == null) { + urlFactory = HookUtils.getUrlFactory( + conf, + FBHiveConf.CONNECTION_FACTORY, + FBHiveConf.AUDIT_CONNECTION_FACTORY, + FBHiveConf.AUDIT_MYSQL_TIER_VAR_NAME, + FBHiveConf.AUDIT_HOST_DATABASE_VAR_NAME); + if (urlFactory == null) { + throw new RuntimeException("DB parameters not set!"); + } + } + + HookUtils.runInsert(conf, + urlFactory, sql, sqlParams, HookUtils + .getSqlNumRetry(conf)); + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/SampleConcurrencyHook.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/SampleConcurrencyHook.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/SampleConcurrencyHook.java (working copy) @@ -0,0 +1,67 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.io.Serializable; +import java.util.List; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import 
org.apache.hadoop.hive.ql.parse.AbstractSemanticAnalyzerHook; +import org.apache.hadoop.hive.ql.parse.HiveParser; +import org.apache.hadoop.hive.ql.parse.HiveSemanticAnalyzerHookContext; +import org.apache.hadoop.hive.ql.parse.HiveSemanticAnalyzerHookContextImpl; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +/** + * Implementation of a compile time hook to enable concurrency for a subset + * of queries + */ +public class SampleConcurrencyHook extends AbstractSemanticAnalyzerHook { + + + // Set concurrency for a sample of the queries + @Override + public ASTNode preAnalyze( + HiveSemanticAnalyzerHookContext context, + ASTNode ast) throws SemanticException { + HiveSemanticAnalyzerHookContextImpl ctx = (HiveSemanticAnalyzerHookContextImpl)context; + HiveConf conf = (HiveConf)ctx.getConf(); + + // If concurrency is disabled, nothing to do + boolean supportConcurrency = conf.getBoolVar(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY); + if (!supportConcurrency) { + return ast; + } + + // Do nothing is the statement is show locks + if (ast.getToken().getType() == HiveParser.TOK_SHOWLOCKS) { + return ast; + } + + // + // based on sample rate, decide whether to gather stats + // + float pubPercent = conf.getFloat(FBHiveConf.ENABLE_PARTIAL_CONCURRENCY, 0); + + try { + if (!HookUtils.rollDice(pubPercent)) { + conf.setBoolVar(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, false); + return ast; + } + } catch (Exception e) { + throw new SemanticException(e); + } + + conf.setBoolVar(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, true); + return ast; + } + + // Nothing to do + @Override + public void postAnalyze( + HiveSemanticAnalyzerHookContext context, + List> rootTasks) throws SemanticException { + // no nothing + } +} Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/HookUtils.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/ql/hooks/HookUtils.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/HookUtils.java (working copy) @@ -0,0 +1,748 @@ +package org.apache.hadoop.hive.ql.hooks; + +import java.io.IOException; +import java.io.Serializable; +import java.net.URI; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.plan.MapredWork; +import org.apache.hadoop.util.ReflectionUtils; +import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf; + +/** + * Utilities for writing hooks. 
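+ *
+ * Includes helpers for running parameterized inserts/selects against the hook
+ * database with retries, building ConnectionUrlFactory instances from tier
+ * configuration, and estimating query input sizes (with sampling taken into
+ * account). A typical caller wires them together roughly like this (a sketch
+ * only, using the audit tier names that appear elsewhere in this patch;
+ * sqlParams is a list of bind values):
+ *
+ *   ConnectionUrlFactory urlFactory = HookUtils.getUrlFactory(conf,
+ *       FBHiveConf.CONNECTION_FACTORY,
+ *       FBHiveConf.AUDIT_CONNECTION_FACTORY,
+ *       FBHiveConf.AUDIT_MYSQL_TIER_VAR_NAME,
+ *       FBHiveConf.AUDIT_HOST_DATABASE_VAR_NAME);
+ *   HookUtils.runInsert(conf, urlFactory,
+ *       "insert into audit_join set command = ?",
+ *       sqlParams, HookUtils.getSqlNumRetry(conf));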
+ */ +public class HookUtils { + static final private Log LOG = LogFactory.getLog(HookUtils.class.getName()); + + public static final String TABLE_CREATION_CLUSTER = "creation_cluster"; + + static final String POST_HOOK_DB_MAX_RETRY_VAR_NAME = + "fbhive.posthook.mysql.max_retries"; + + // The default value is to retry 20 times with maximum retry interval + // 60 seconds. The expectation is about 22 minutes. After 7 retries, it + // reaches 60 seconds. + static final int DEFAULT_SQL_NUM_RETRIES = 20; + static final int DEFAULT_RETRY_MAX_INTERVAL_SEC = 60; + + private static Connection getConnection(HiveConf conf, String url) + throws SQLException { + return DriverManager.getConnection( + url, + conf.get(FBHiveConf.FBHIVE_DB_USERNAME), + conf.get(FBHiveConf.FBHIVE_DB_PASSWORD)); + } + + public static int getSqlNumRetry(HiveConf conf) { + return conf.getInt(POST_HOOK_DB_MAX_RETRY_VAR_NAME, 30); + } + + public static void runInsert(HiveConf conf, + ConnectionUrlFactory urlFactory, + String sql, + List sqlParams) throws Exception { + runInsert(conf, urlFactory, sql, sqlParams, DEFAULT_SQL_NUM_RETRIES); + } + + public static List> runInsertSelect(HiveConf conf, + ConnectionUrlFactory urlFactory, + String sql, + List sqlParams) + throws Exception { + return runInsertSelect(conf, urlFactory, sql, sqlParams, true); + + } + + public static List> runInsertSelect(HiveConf conf, + ConnectionUrlFactory urlFactory, + String sql, + List sqlParams, + boolean isWrite) + throws Exception { + return runInsertSelect(conf, urlFactory, sql, sqlParams, isWrite, + DEFAULT_SQL_NUM_RETRIES, + DEFAULT_RETRY_MAX_INTERVAL_SEC, false); + } + + public static void runInsert(HiveConf conf, + ConnectionUrlFactory urlFactory, + String sql, + List sqlParams, + int numRetries) + throws Exception { + runInsertSelect(conf, urlFactory, sql, sqlParams, true, numRetries, + DEFAULT_RETRY_MAX_INTERVAL_SEC, true); + } + + /* + * @param conf - + * @param parentTierName - the factory to create + * @param tierName - the factory to create + * @param tierParam1Name - the name of the first parameter + * @param tierParam2Name - the name of the second parameter + */ + public static ConnectionUrlFactory getUrlFactory( + HiveConf conf, + String parentTierName, + String childTierName, + String tierParam1Name, + String tierParam2Name) { + return getUrlFactory(conf, parentTierName, childTierName, tierParam1Name, tierParam2Name, null); + } + + /* + * @param conf - + * @param parentTierName - the factory to create + * @param tierName - the factory to create + * @param tierParam1Name - the name of the first parameter + * @param tierParam2Name - the name of the second parameter + */ + public static ConnectionUrlFactory getUrlFactory( + Configuration conf, + String parentTierName, + String childTierName, + String tierParam1Name, + String tierParam2Name, + String commonParam) { + + String parentTierValue = + parentTierName == null ? null : conf.get(parentTierName); + String childTierValue = + childTierName == null ? null : conf.get(childTierName); + + String tierValue = + childTierValue != null && !childTierValue.isEmpty() ? childTierValue : + (parentTierValue != null && !parentTierValue.isEmpty() ? parentTierValue : + null); + + if (tierValue == null) { + return null; + } + + ConnectionUrlFactory conn = + (ConnectionUrlFactory)getObject(conf, tierValue); + + String tierParamValue = + tierParam1Name == null ? null : conf.get(tierParam1Name); + + if ((tierParamValue == null) || tierParamValue.isEmpty()) { + tierParamValue = tierParam2Name == null ? 
null : conf.get(tierParam2Name); + } + + String commonParamValue = + commonParam == null ? null : conf.get(commonParam); + + conn.init(tierParamValue, commonParamValue); + return conn; + } + + // In the case of a select returns a list of lists, where each inner list represents a row + // returned by the query. In the case of an insert, returns null. + public static List> runInsertSelect( + HiveConf conf, + ConnectionUrlFactory urlFactory, String sql, + List sqlParams, boolean isWrite, int numRetries, + int retryMaxInternalSec, boolean insert) + throws Exception { + + // throwing an exception + int waitMS = 300; // wait for at least 300 msec before next retry. + Random rand = new Random(); + for (int i = 0; i < numRetries; ++i) { + try { + String url = urlFactory.getUrl(isWrite); + LOG.info("Attepting connection with URL " + url); + Connection conn = getConnection(conf, url); + PreparedStatement pstmt = conn.prepareStatement(sql); + int pos = 1; + for (Object param : sqlParams) { + if (param instanceof Integer) { + pstmt.setInt(pos++, ((Integer) param).intValue()); + } else { + pstmt.setString(pos++, (String) param); + } + } + if (insert) { + int recordsUpdated = pstmt.executeUpdate(); + LOG.info("rows inserted: " + recordsUpdated + " sql: " + sql); + pstmt.close(); + return null; + } + else { + ResultSet result = pstmt.executeQuery(); + List> results = new ArrayList>(); + int numColumns = result.getMetaData().getColumnCount(); + while (result.next()) { + List row = new ArrayList(); + results.add(row); + for (int index = 1; index <= numColumns; index++) { + row.add(result.getObject(index)); + } + } + pstmt.clearBatch(); + pstmt.close(); + + LOG.info("rows selected: " + results.size() + " sql: " + sql); + return results; + } + } catch (Exception e) { + // We should catch a better exception than Exception, but since + // ConnectionUrlFactory.getUrl() defines throws Exception, it's hard + // for us to figure out the complete set it can throw. We follow + // ConnectionUrlFactory.getUrl()'s definition to catch Exception. + // It shouldn't be a big problem as after numRetries, we anyway exit. + LOG.info("Exception " + e + ". Will retry " + (numRetries - i) + + " times."); + // Introducing a random factor to the wait time before another retry. + // The wait time is dependent on # of failures and a random factor. + // At the first time of getting a SQLException, the wait time + // is a random number between [0,300] msec. If the first retry + // still fails, we will wait 300 msec grace period before the 2nd retry. + // Also at the second retry, the waiting window is expanded to 600 msec + // alleviating the request rate from the server. Similarly the 3rd retry + // will wait 600 msec grace period before retry and the waiting window + // is + // expanded to 1200 msec. + + waitMS += waitMS; + if (waitMS > retryMaxInternalSec * 1000) { + waitMS = retryMaxInternalSec * 1000; + } + double waitTime = waitMS + waitMS * rand.nextDouble(); + Thread.sleep((long) waitTime); + if (i + 1 == numRetries) { + LOG.error("Still got Exception after " + numRetries + " retries.", + e); + throw e; + } + } + } + return null; + } + + /** + * Populates inputToCS with a mapping from the input paths to their respective ContentSummary + * objects. If an input is in a subdirectory of another's location, or in the same location, + * the input is not included in the total size of the inputs. If it is not already present in + * the mapping, it will not be added. 
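+ *
+ * In other words, overlapping locations are not double counted: when one input's
+ * location is nested under (or equal to) another's, only the outermost location
+ * contributes to the returned total.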
+ * + * @param inputs + * @param inputToCS + * @param conf + * @throws IOException + * @throws Exception + */ + public static long getInputSize(Set inputs, + Map inputToCS, HiveConf conf) + throws IOException, Exception { + + URI defaultPathUri = new URI(conf.getVar(HiveConf.ConfVars.METASTOREWAREHOUSE)); + String defaultPath = defaultPathUri.getPath(); + String defaultPrefix = defaultPathUri.toString().substring(0, defaultPathUri.toString().lastIndexOf(defaultPath)); + + // A mapping from the location as a String, formatted as a String for sorting, to the original + // path of the object + Map locationToPath = new HashMap(); + + for (ReadEntity re : inputs) { + Path p = null; + switch (re.getType()) { + case TABLE: + Table table = re.getTable(); + + if (table.isPartitioned()) { + // If the input is a partitioned table, do not include its content summary, as data will + // never be read from a partitioned table, only its partitions, so it must be a metadata + // change to the table. + continue; + } + if (table.isView()) { + // If the input is a view, it does not have a content summary as it is only a logical + // construct. + continue; + } + + p = table.getPath(); + break; + case PARTITION: + Partition partition = re.getPartition(); + + if (partition.getTable().isView()) { + // If the input is a partition of a view, it does not have a content summary as it is + // only a logical construct. + continue; + } + + p = partition.getPartitionPath(); + break; + default: + continue; + } + + String location = re.getLocation().toString(); + + // If the location is something like /user/facebook/warehouse/ we want it to start with + // hdfs://... to make ensure using prefixes we can identify subdirectories + if (location.equals(defaultPath) || + location.startsWith(defaultPath.endsWith("/") ? defaultPath : defaultPath + "/")) { + location = defaultPrefix + location; + } + + // If the location does not end with / add it, this ensures /a/b/cd is not considered a + // subdirectory of /a/b/c + if (!location.endsWith("/")) { + location += "/"; + } + + locationToPath.put(location, p); + } + + String[] locations = new String[locationToPath.size()]; + locations = locationToPath.keySet().toArray(locations); + Arrays.sort(locations); + + String lastLocation = null; + long totalInputSize = 0; + for (String formattedLocation : locations) { + + // Since the locations have been sorted, if this location is a subdirectory of another, that + // directory must either be immediately before this location, or every location in between is + // also a subdirectory + if (lastLocation != null && formattedLocation.startsWith(lastLocation)) { + continue; + } + + Path p = locationToPath.get(formattedLocation); + lastLocation = formattedLocation; + + String pathStr = p.toString(); + if(LOG.isDebugEnabled()) { + LOG.debug("Finding from cache Content Summary for " + pathStr); + } + + ContentSummary cs = (inputToCS == null) ? null : inputToCS.get(pathStr); + if (cs == null) { + if(LOG.isDebugEnabled()) { + LOG.debug("Fetch Content Summary for " + pathStr); + } + FileSystem fs = p.getFileSystem(conf); + cs = fs.getContentSummary(p); + inputToCS.put(pathStr, cs); + } + + totalInputSize += cs.getLength(); + + if (LOG.isDebugEnabled()) { + LOG.debug("Length for file: " + pathStr + " = " + cs.getLength()); + } + } + + return totalInputSize; + } + + /** + * Goes through the list of tasks, and populates a map from each path used + * by a mapRedTask to the highest percentage to which it is sampled, or 100 + * if it is ever not sampled. 
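+ *
+ * For example, if one map-reduce stage reads a path with a 10 percent split
+ * sample and another stage reads the same path without sampling, the path is
+ * treated as read in full (100).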
+ * + * Also, if a task is not a map reduce task or has a null or empty + * NameToSplitSample map, it adds all of its inputs to a + * set so they can be treated as unsampled. + * + * Calls itself recursively on each task's list of dependent tasks + * + * @return whether or not there is any sampling performed in the query + */ + static public boolean checkForSamplingTasks( + List> tasks, + Map topPercentages, + Set nonSampledInputs) { + boolean isThereSampling = false; + + for (Task task : tasks) { + MapredWork work; + + // Only look for sampled inputs in MapRedTasks with non-null, non-empty + // NameToSplitSample maps + if (task.getWork() instanceof MapredWork && + (work = (MapredWork)task.getWork()).getNameToSplitSample() != null && + !work.getNameToSplitSample().isEmpty()) { + + isThereSampling = true; + + // If the task is a map reduce task, go through each of the paths + // used by its work, if it is sampled check if it is the highest + // sampling percentage yet seen for that path. If it is not + // sampled, set the highest percentage to 100. + for (Map.Entry> entry : work.getPathToAliases().entrySet()) { + double percentage = 0; + + for (String alias : entry.getValue()) { + if (work.getNameToSplitSample().containsKey(alias)) { + if (work.getNameToSplitSample().get(alias).getPercent() > percentage) { + percentage = work.getNameToSplitSample().get(alias).getPercent(); + } + } else { + percentage = 100; + break; + } + } + + String path = entry.getKey(); + if (!topPercentages.containsKey(path) || percentage > topPercentages.get(path)) { + topPercentages.put(path, percentage); + } + } + } else if (task.getQueryPlan() != null) { + nonSampledInputs.addAll(task.getQueryPlan().getInputs()); + } + + if (task.getDependentTasks() != null) { + isThereSampling |= checkForSamplingTasks(task.getDependentTasks(), + topPercentages, + nonSampledInputs); + } + } + + return isThereSampling; + } + + /** + * Helper class used to pass from getObjectSize back to the caller. + * This contains the total size of the objects passed in, as well as + * the type, size and number of files for each object. 
For eg, if a query + * references 2 partitions T1@p1 and T1@p2 of size 10 and 20, and 2 and 5 + * files respectively, the totalSize will be 30, and the object map will be + * like: + * T1@p1 -> > objectTypeLengths; + + ObjectSize() { + } + + ObjectSize(long totalSize, + Map> objectTypeLengths) { + this.totalSize = totalSize; + this.objectTypeLengths = objectTypeLengths; + } + + long getTotalSize() { + return totalSize; + } + + void setTotalSize(long totalSize) { + this.totalSize = totalSize; + } + + Map> getObjectTypeLengths() { + return objectTypeLengths; + } + + void setObjectTypeLengths(Map> objectTypeLengths) { + this.objectTypeLengths = objectTypeLengths; + } + } + + + static public HookUtils.ObjectSize getObjectSize(HiveConf conf, + Set objects, + boolean loadObjects) + throws Exception { + // The objects may need to be loaded again since StatsTask is executed after + // the move task, and the object in the write entity may not have the size + long totalSize = 0; + Map> objectLengths = + new HashMap>(); + Hive db = null; + if (loadObjects) { + db = Hive.get(); + } + + for (Entity object: objects) { + // We are computing sizes only for tables and partitions + Entity.Type objectType = object.getTyp(); + Table table = null; + String size = null; + String numFiles = null; + Path path = null; + + switch (objectType) { + case TABLE: + table = object.getTable(); + + if (table.isPartitioned() && !table.isView()) { + // If the input is a partitioned table, do not include its content summary, as data will + // never be read from a partitioned table, only its partitions, so it must be a metadata + // change to the table. + // + // However, if the table is a view, a view's partitions are not included in the inputs, + // so do not skip it so that we have some record of it. + continue; + } + + if (loadObjects) { + table = db.getTable(table.getTableName()); + } + + if (table.isView()) { + // Views are logical, so they have no size or files + path = null; + size = "0"; + numFiles = "0"; + } else { + path = table.getPath(); + size = table.getProperty("totalSize"); + numFiles = table.getProperty("numFiles"); + } + break; + case PARTITION: + Partition partition = object.getPartition(); + + if (loadObjects) { + partition = + db.getPartition(partition.getTable(), partition.getSpec(), false); + } + table = partition.getTable(); + + if (table.isView()) { + // Views are logical, so they have no size or files + // Currently view partitions are not included in the inputs, but this is included so + // that if that changes in open source, it will not cause an NPE. It should not cause + // any double counting as the size of the view and its partitions are both 0. + path = null; + size = "0"; + numFiles = "0"; + } else { + path = partition.getPartitionPath(); + size = partition.getParameters().get("totalSize"); + numFiles = partition.getParameters().get("numFiles"); + } + break; + default: + // nothing to do + break; + } + + // Counting needed + if (table != null) { + if (size == null) { + // If the size is not present in the metastore (old + // legacy tables), get it from hdfs + FileSystem fs = path.getFileSystem(conf); + size = String.valueOf(fs.getContentSummary(path).getLength()); + } + + if (numFiles == null) { + numFiles = String.valueOf(0); + } + + Triple triple = + new Triple( + table.getTableType().toString(), size, numFiles); + + objectLengths.put(object.getName(), triple); + + // If the input/output is a external table or a view, dont add it to + // the total size. 
+  /**
+   * A helper class used to pass info from getInputInfo back to the caller.
+   */
+  public static class InputInfo {
+    long size;
+    long fileCount;
+    long directoryCount;
+    double estimatedNumSplits;
+
+    InputInfo(long size, long fileCount, long directoryCount,
+        double estimatedNumSplits) {
+      this.size = size;
+      this.fileCount = fileCount;
+      this.directoryCount = directoryCount;
+      this.estimatedNumSplits = estimatedNumSplits;
+    }
+
+    long getSize() {
+      return size;
+    }
+
+    long getFileCount() {
+      return fileCount;
+    }
+
+    long getDirectoryCount() {
+      return directoryCount;
+    }
+
+    double getEstimatedNumSplits() {
+      return estimatedNumSplits;
+    }
+  }
+
+  /**
+   * Returns the sizes of the inputs while taking sampling into account.
+   *
+   * @param inputs - entities used as the query's inputs
+   * @param inputToCS - already known mappings from paths to content summaries.
+   *                    If a path is not in this mapping, it will be looked up
+   * @param conf - Hadoop conf used to construct FileSystems
+   * @param isThereSampling - whether the query includes sampling
+   * @param pathToTopPercentage - a mapping from a path to the highest
+   *                              percentage to which it is sampled. If a path
+   *                              is not in the map, it defaults to 100%
+   * @param nonSampledInputs - entities that are not sampled
+   * @param maxSplits - if the number of splits exceeds this number as the
+   *                    splits are incrementally summed, return early
+   * @param maxSize - if the size exceeds this number as the sizes are being
+   *                  incrementally summed, return early
+   * @return an InputInfo object describing the net input
+   * @throws IOException
+   */
+  static public InputInfo getInputInfo(Collection<ReadEntity> inputs,
+      Map<String, ContentSummary> inputToCS, Configuration conf,
+      boolean isThereSampling, Map<String, Double> pathToTopPercentage,
+      Set<ReadEntity> nonSampledInputs,
+      long maxSplits, long maxSize) throws IOException {
+
+    double estimatedNumSplits = 0;
+    long size = 0;
+    long fileCount = 0;
+    long directoryCount = 0;
+
+    // Go over the input paths and calculate the size
+    for (ReadEntity re : inputs) {
+      Path p = null;
+      switch (re.getType()) {
+      case TABLE:
+        p = re.getTable().getPath();
+        break;
+      case PARTITION:
+        p = re.getPartition().getPartitionPath();
+        break;
+      default:
+        break;
+      }
+
+      if (p != null) {
+        String pathStr = p.toString();
+        LOG.debug("Finding from cache Content Summary for " + pathStr);
+        ContentSummary cs = (inputToCS == null) ? null : inputToCS.get(pathStr);
+        if (cs == null) {
+          LOG.debug("Fetch Content Summary for " + pathStr);
+          FileSystem fs = p.getFileSystem(conf);
+          cs = fs.getContentSummary(p);
+          // Only cache the summary if a cache map was supplied
+          if (inputToCS != null) {
+            inputToCS.put(pathStr, cs);
+          }
+        }
+
+        if (isThereSampling) {
+          // If the input is used in a map reduce task, get the highest
+          // percentage to which it is sampled; otherwise, use a sampling
+          // percentage of 100
+          double samplePercentage = 100;
+          if (pathToTopPercentage.containsKey(pathStr) &&
+              !nonSampledInputs.contains(re)) {
+            samplePercentage = pathToTopPercentage.get(pathStr);
+          }
+          size += (long) (cs.getLength() * samplePercentage / 100D);
+          estimatedNumSplits += samplePercentage / 100;
+
+          if (estimatedNumSplits > maxSplits) {
+            break;
+          }
+        } else {
+          size += cs.getLength();
+          fileCount += cs.getFileCount();
+          directoryCount += cs.getDirectoryCount();
+        }
+
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Length for file: " + p.toString() + " = " + cs.getLength());
+        }
+      }
+
+      if (size > maxSize) {
+        break;
+      }
+    }
+
+    return new InputInfo(size, fileCount, directoryCount,
+        estimatedNumSplits);
+  }
+
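+  // Illustrative usage (not part of this patch), continuing the sketch after
+  // checkForSamplingTasks above; "inputs" is a hypothetical
+  // Collection<ReadEntity> taken from the hook context.
+  //
+  //   Map<String, ContentSummary> inputToCS = new HashMap<String, ContentSummary>();
+  //   HookUtils.InputInfo info = HookUtils.getInputInfo(inputs, inputToCS, conf,
+  //       sampled, topPercentages, nonSampledInputs, Long.MAX_VALUE, Long.MAX_VALUE);
+  //   long effectiveInputSize = info.getSize();
+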
+  // Returns true approximately (percentage * 100)% of the time. The argument
+  // is a fraction, so 0.25f means roughly one call in four.
+  public static boolean rollDice(float percentage) throws Exception {
+
+    if (percentage < 0 || percentage > 1) {
+      throw new Exception("Percentage must be >= 0 and <= 1. Got " + percentage);
+    }
+
+    Random randGen = new Random();
+    float randVal = randGen.nextFloat();
+
+    return randVal < percentage;
+  }
+
+  @SuppressWarnings("unchecked")
+  public static <T> T getObject(Configuration conf, String className) {
+    if ((className == null) || (className.isEmpty())) {
+      return null;
+    }
+
+    T instance = null;
+    try {
+      instance = (T) ReflectionUtils.newInstance(conf.getClassByName(className), conf);
+    } catch (ClassNotFoundException e) {
+      return null;
+    }
+    return instance;
+  }
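+
+  // Illustrative usage (not part of this patch): hooks can use rollDice to
+  // sample expensive work probabilistically, and getObject to instantiate a
+  // configured class by name. Both the interface name and the config key
+  // below are made up for the example.
+  //
+  //   if (HookUtils.rollDice(0.01f)) {
+  //     // do the expensive bookkeeping for roughly 1% of queries
+  //   }
+  //   SomeHookInterface hook =
+  //       HookUtils.getObject(conf, conf.get("fbhive.some.hook.class"));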
+}
Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/ReplicationHook.java
===================================================================
--- contrib/src/java/org/apache/hadoop/hive/ql/hooks/ReplicationHook.java (revision 0)
+++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/ReplicationHook.java (working copy)
@@ -0,0 +1,167 @@
+package org.apache.hadoop.hive.ql.hooks;
+
+import java.io.Serializable;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.lang.StringEscapeUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.common.io.CachingPrintStream;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.TaskRunner;
+import org.apache.hadoop.hive.ql.hooks.conf.FBHiveConf;
+import org.apache.hadoop.hive.ql.log.PerfLogger;
+import org.apache.hadoop.hive.ql.session.SessionState;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.json.JSONObject;
+
+/**
+ * Implementation of a post-execute (and on-failure) hook that logs the
+ * completed query, its inputs and outputs, and related metadata to the
+ * command log so the query can be replicated.
+ */
+public class ReplicationHook extends BaseReplicationHook implements ExecuteWithHookContext {
+
+  static final private Log LOG = LogFactory.getLog(ReplicationHook.class.getName());
+
+  private HiveConf conf;
+
+  public ReplicationHook() throws Exception {
+    super();
+    conf = new HiveConf(this.getClass());
+  }
+
+  /**
+   * Set this replication hook's Hive configuration.
+   * Expose this as a public method in case run() cannot get the HiveConf
+   * from the session, e.g., if ReplicationHook is not called after a CLI query.
+   * @param conf the configuration to use
+   */
+  public void setHiveConf(HiveConf conf) {
+    this.conf = conf;
+  }
+
+  public void run(SessionState sess, Set<ReadEntity> inputs,
+      Set<WriteEntity> outputs, LineageInfo lInfo, UserGroupInformation ugi)
+      throws Exception {
+    run(sess, inputs, outputs, lInfo, ugi, null, HookContext.HookType.POST_EXEC_HOOK);
+  }
+
+  public void run(SessionState sess, Set<ReadEntity> inputs,
+      Set<WriteEntity> outputs, LineageInfo lInfo, UserGroupInformation ugi,
+      List<TaskRunner> completedTasks, HookContext.HookType hookType)
+      throws Exception {
+
+    assert(hookType == HookContext.HookType.POST_EXEC_HOOK ||
+        hookType == HookContext.HookType.ON_FAILURE_HOOK);
+
+    String command = "";
+    String commandType = "";
+    String user_info = "";
+    String inputStr = "";
+    String outputStr = "";
+    String queryId = "";
+    String querySrc = "";
+    String startTimeStr = "";
+    String packageName = "";
+
+    if (sess != null) {
+      command = StringEscapeUtils.escapeJava(sess.getCmd());
+      commandType = StringEscapeUtils.escapeJava(sess.getCommandType());
+      setHiveConf(sess.getConf());
+      queryId = conf.getVar(HiveConf.ConfVars.HIVEQUERYID);
+
+      querySrc = conf.get(JobStatsHook.HIVE_QUERY_SOURCE, "");
+      packageName = conf.get(FBHiveConf.FB_CURRENT_CLUSTER);
+    }
+
+    if (ugi != null) {
+      user_info = StringEscapeUtils.escapeJava(ugi.getUserName());
+    }
+
+    if (inputs != null) {
+      inputStr = entitiesToString(inputs);
+    }
+
+    if (outputs != null) {
+      outputStr = entitiesToString(outputs);
+    }
+
+    // Retrieve the time the Driver.run method started from the PerfLogger, as
+    // this corresponds approximately to the time when the query started to be
+    // processed, and format it. If, somehow, this time was not set, it will
+    // default to 0000-00-00 00:00:00 in the db.
+    Long startTimeMillis = PerfLogger.getPerfLogger().getStartTime(PerfLogger.DRIVER_RUN);
+    if (startTimeMillis != null) {
+      Date startTime = new Date(startTimeMillis.longValue());
+      startTimeStr = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(startTime);
+    } else {
+      LOG.error("Start time was null in ReplicationHook");
+    }
+
+    ArrayList sqlParams = new ArrayList();
+    sqlParams.add(StringEscapeUtils.escapeJava(command));
+    sqlParams.add(StringEscapeUtils.escapeJava(commandType));
+    sqlParams.add(StringEscapeUtils.escapeJava(inputStr));
+    sqlParams.add(outputStr);
+    sqlParams.add(StringEscapeUtils.escapeJava(queryId));
+    sqlParams.add(StringEscapeUtils.escapeJava(user_info));
+    sqlParams.add(StringEscapeUtils.escapeJava(querySrc));
+    sqlParams.add(startTimeStr);
+    sqlParams.add(packageName);
+
+    // The assert at the beginning of the method guarantees hookType is one of
+    // the two cases below, so sql will always be assigned a non-empty statement
+    String sql = "";
+    if (hookType == HookContext.HookType.POST_EXEC_HOOK) {
+      sql = "insert into snc1_command_log set command = ?, command_type = ?, inputs = ?, " +
+          "outputs = ?, queryId = ?, user_info = ?, query_src = ?, start_time = ?, " +
+          "package_name = ?";
+    } else if (hookType == HookContext.HookType.ON_FAILURE_HOOK) {
+
+      List<String> errors = ((CachingPrintStream) sess.err).getOutput();
+      String localErrorString = "";
+      if (!errors.isEmpty()) {
+        JSONObject localErrorObj = new JSONObject();
+        localErrorObj.put("localErrors", errors);
+        localErrorString = localErrorObj.toString();
+      }
+
+      sqlParams.add(localErrorString);
+
+      sql = "insert into snc1_failed_command_log set command = ?, command_type = ?, inputs = ?, " +
+          "outputs = ?, queryId = ?, user_info = ?, query_src = ?, start_time = ?, " +
+          "package_name = ?, local_errors = ?";
+    }
+    HookUtils.runInsert(conf, urlFactory, sql, sqlParams,
+        HookUtils.getSqlNumRetry(conf));
+  }
+
+  @Override
+  public void run(HookContext hookContext) throws Exception {
+    SessionState ss = SessionState.get();
+    Set<ReadEntity> inputs = hookContext.getInputs();
+    Set<WriteEntity> outputs = hookContext.getOutputs();
+    LineageInfo linfo = hookContext.getLinfo();
+    UserGroupInformation ugi = hookContext.getUgi();
+    this.run(ss, inputs, outputs, linfo, ugi, hookContext.getCompleteTaskList(),
+        hookContext.getHookType());
+  }
+
+  public static String entitiesToString(Set<? extends Serializable> entities) {
+    StringBuilder stringBuilder = new StringBuilder();
+
+    boolean first = true;
+
+    for (Serializable o : entities) {
+      if (!first) {
+        stringBuilder.append(",");
+      }
+      first = false;
+      stringBuilder.append(o.toString());
+    }
+    return stringBuilder.toString();
+  }
+}
Index: contrib/src/java/org/apache/hadoop/hive/ql/hooks/Pair.java
===================================================================
--- contrib/src/java/org/apache/hadoop/hive/ql/hooks/Pair.java (revision 0)
+++ contrib/src/java/org/apache/hadoop/hive/ql/hooks/Pair.java (working copy)
@@ -0,0 +1,86 @@
+package org.apache.hadoop.hive.ql.hooks;
+
+/**
+ * A generic class for pairs.
+ * @param <T1> the type of the first element
+ * @param <T2> the type of the second element
+ */
+public class Pair<T1, T2>
+{
+  protected T1 first = null;
+  protected T2 second = null;
+
+  /**
+   * Default constructor.
+   */
+  public Pair()
+  {
+  }
+
+  /**
+   * Constructor.
+   * @param a the first element
+   * @param b the second element
+   */
+  public Pair(T1 a, T2 b)
+  {
+    this.first = a;
+    this.second = b;
+  }
+
+  /**
+   * Replace the first element of the pair.
+   * @param a the new first element
+   */
+  public void setFirst(T1 a)
+  {
+    this.first = a;
+  }
+
+  /**
+   * Replace the second element of the pair.
+   * @param b the new second element
+   */
+  public void setSecond(T2 b)
+  {
+    this.second = b;
+  }
+
+  /**
+   * Return the first element stored in the pair.
+   * @return T1
+   */
+  public T1 getFirst()
+  {
+    return first;
+  }
+
+  /**
+   * Return the second element stored in the pair.
+   * @return T2
+   */
+  public T2 getSecond()
+  {
+    return second;
+  }
+
+  private boolean equals(Object x, Object y)
+  {
+    return (x == null && y == null) || (x != null && x.equals(y));
+  }
+
+  @Override
+  @SuppressWarnings("unchecked")
+  public boolean equals(Object other)
+  {
+    return other instanceof Pair && equals(first, ((Pair) other).first) &&
+        equals(second, ((Pair) other).second);
+  }
+
+  /**
+   * Keep hashCode consistent with equals so pairs behave correctly as keys
+   * in hash-based collections.
+   */
+  @Override
+  public int hashCode()
+  {
+    return (first == null ? 0 : first.hashCode()) * 31 +
+        (second == null ? 0 : second.hashCode());
+  }
+
+  @Override
+  public String toString()
+  {
+    return "{" + getFirst() + "," + getSecond() + "}";
+  }
+}
+
Index: contrib/src/java/org/apache/hadoop/hive/ql/stats/HiveStatsMetricsPublisher.java
===================================================================
--- contrib/src/java/org/apache/hadoop/hive/ql/stats/HiveStatsMetricsPublisher.java (revision 0)
+++ contrib/src/java/org/apache/hadoop/hive/ql/stats/HiveStatsMetricsPublisher.java (working copy)
@@ -0,0 +1,29 @@
+package org.apache.hadoop.hive.ql.stats;
+
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.hadoop.hive.ql.hooks.HookContext;
+
+/**
+ * Interface for publishers that tag per-query metrics with attributes of the
+ * query (joins, group bys, scripts, etc.) before publishing them.
+ */
+public interface HiveStatsMetricsPublisher {
+  public enum QueryTag {
+    JOIN,
+    GROUP_BY,
+    MAP_SCRIPT,
+    REDUCE_SCRIPT,
+    MAP_ONLY,
+    MAP_REDUCE,
+    PIPELINE,
+    MAP_JOIN,
+    ORDER_BY,
+    SORT_BY,
+    CLUSTER_BY,
+    DISTRIBUTE_BY,
+    JOIN_FOLLOWED_BY_GROUP_BY,
+  }
+
+  /** Derive this query's attributes from the hook context, replacing any previously stored ones. */
+  public void extractAndOverwriteQueryAttributes(final HookContext hookContext);
+
+  /** Publish the given metrics, tagged with the attributes extracted for the current query. */
+  public void publishMetricsWithQueryTags(Map metrics);
+
+  /** @return the attributes extracted for the current query */
+  public Set<QueryTag> getQueryAttributes();
+}
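For reviewers: the interface above is easiest to evaluate next to a concrete implementation. The sketch below is illustrative only and is not part of this patch; the class name, the logging behavior, and keeping the metrics map raw are assumptions made for the example.

    package org.apache.hadoop.hive.ql.stats;

    import java.util.EnumSet;
    import java.util.Map;
    import java.util.Set;

    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.hadoop.hive.ql.hooks.HookContext;

    // Minimal publisher: derives no attributes from the plan and simply logs
    // whatever metrics it is handed, along with the (empty) attribute set.
    public class LoggingStatsMetricsPublisher implements HiveStatsMetricsPublisher {
      private static final Log LOG =
          LogFactory.getLog(LoggingStatsMetricsPublisher.class);

      private Set<QueryTag> attributes = EnumSet.noneOf(QueryTag.class);

      @Override
      public void extractAndOverwriteQueryAttributes(final HookContext hookContext) {
        // A real implementation would inspect hookContext.getQueryPlan() and add
        // tags such as JOIN or GROUP_BY; this sketch just resets the set.
        attributes = EnumSet.noneOf(QueryTag.class);
      }

      @Override
      public void publishMetricsWithQueryTags(Map metrics) {
        LOG.info("Publishing " + metrics.size() + " metrics with tags " + attributes);
      }

      @Override
      public Set<QueryTag> getQueryAttributes() {
        return attributes;
      }
    }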