diff --git build.xml build.xml index 4b345b5..344f3ab 100644 --- build.xml +++ build.xml @@ -250,11 +250,6 @@ - - - - - diff --git common/build.xml common/build.xml index d9ac07e..f02392d 100755 --- common/build.xml +++ common/build.xml @@ -25,8 +25,29 @@ to call at top-level: ant deploy-contrib compile-core-test + + + + + + + + + + + + + + + diff --git common/resources/hive-default.xml common/resources/hive-default.xml new file mode 100644 index 0000000..1465317 --- /dev/null +++ common/resources/hive-default.xml @@ -0,0 +1,721 @@ + + + + + + + + + + + + + + mapred.reduce.tasks + -1 + The default number of reduce tasks per job. Typically set + to a prime close to the number of available hosts. Ignored when + mapred.job.tracker is "local". Hadoop set this to 1 by default, whereas hive uses -1 as its default value. + By setting this property to -1, Hive will automatically figure out what should be the number of reducers. + + + + + hive.exec.reducers.bytes.per.reducer + 1000000000 + size per reducer.The default is 1G, i.e if the input size is 10G, it will use 10 reducers. + + + + hive.exec.reducers.max + 999 + max number of reducers will be used. If the one + specified in the configuration parameter mapred.reduce.tasks is + negative, hive will use this one as the max number of reducers when + automatically determine number of reducers. + + + + hive.exec.scratchdir + /tmp/hive-${user.name} + Scratch space for Hive jobs + + + + hive.test.mode + false + whether hive is running in test mode. 
If yes, it turns on sampling and prefixes the output tablename + + + + hive.test.mode.prefix + test_ + if hive is running in test mode, prefixes the output table by this string + + + + + + + + + + + hive.test.mode.samplefreq + 32 + if hive is running in test mode and table is not bucketed, sampling frequency + + + + hive.test.mode.nosamplelist + + if hive is running in test mode, don't sample the above comma-separated list of tables + + + + hive.metastore.local + true + controls whether to connect to remote metastore server or open a new metastore server in Hive Client JVM + + + + javax.jdo.option.ConnectionURL + jdbc:derby:;databaseName=metastore_db;create=true + JDBC connect string for a JDBC metastore + + + + javax.jdo.option.ConnectionDriverName + org.apache.derby.jdbc.EmbeddedDriver + Driver class name for a JDBC metastore + + + + javax.jdo.PersistenceManagerFactoryClass + org.datanucleus.jdo.JDOPersistenceManagerFactory + class implementing the jdo persistence + + + + javax.jdo.option.DetachAllOnCommit + true + detaches all objects from session so that they can be used after transaction is committed + + + + javax.jdo.option.NonTransactionalRead + true + reads outside of transactions + + + + javax.jdo.option.ConnectionUserName + APP + username to use against metastore database + + + + javax.jdo.option.ConnectionPassword + mine + password to use against metastore database + + + + datanucleus.connectionPoolingType + DBCP + Uses a DBCP connection pool for JDBC metastore + + + + datanucleus.validateTables + false + validates existing schema against code. turn this on if you want to verify existing schema + + + + datanucleus.validateColumns + false + validates existing schema against code. turn this on if you want to verify existing schema + + + + datanucleus.validateConstraints + false + validates existing schema against code. 
turn this on if you want to verify existing schema + + + + datanucleus.storeManagerType + rdbms + metadata store type + + + + datanucleus.autoCreateSchema + true + creates necessary schema on a startup if one doesn't exist. set this to false, after creating it once + + + + datanucleus.autoStartMechanismMode + checked + throw exception if metadata tables are incorrect + + + + datanucleus.transactionIsolation + read-committed + Default transaction isolation level for identity generation. + + + + datanucleus.cache.level2 + false + Use a level 2 cache. Turn this off if metadata is changed independently of hive metastore server + + + + datanucleus.cache.level2.type + SOFT + SOFT=soft reference based cache, WEAK=weak reference based cache. + + + + datanucleus.identifierFactory + datanucleus + Name of the identifier factory to use when generating table/column names etc. 'datanucleus' is used for backward compatibility + + + + datanucleus.plugin.pluginRegistryBundleCheck + LOG + Defines what happens when plugin bundles are found and are duplicated [EXCEPTION|LOG|NONE] + + + + hive.metastore.warehouse.dir + /user/hive/warehouse + location of default database for the warehouse + + + + hive.metastore.connect.retries + 5 + Number of retries while opening a connection to metastore + + + + hive.metastore.rawstore.impl + org.apache.hadoop.hive.metastore.ObjectStore + Name of the class that implements org.apache.hadoop.hive.metastore.rawstore interface. This class is used to store and retrieval of raw metadata objects such as table, database + + + + hive.default.fileformat + TextFile + Default file format for CREATE TABLE statement. Options are TextFile and SequenceFile. Users can explicitly say CREATE TABLE ... 
STORED AS <TEXTFILE|SEQUENCEFILE> to override + + + + hive.fileformat.check + true + Whether to check file format or not when loading data files + + + + hive.map.aggr + true + Whether to use map-side aggregation in Hive Group By queries + + + + hive.groupby.skewindata + false + Whether there is skew in data to optimize group by queries + + + + hive.groupby.mapaggr.checkinterval + 100000 + Number of rows after which size of the grouping keys/aggregation classes is performed + + + + hive.mapred.local.mem + 0 + For local mode, memory of the mappers/reducers + + + + hive.map.aggr.hash.percentmemory + 0.5 + Portion of total memory to be used by map-side group aggregation hash table + + + + hive.map.aggr.hash.min.reduction + 0.5 + Hash aggregation will be turned off if the ratio between hash + table size and input rows is bigger than this number. Set to 1 to make sure + hash aggregation is never turned off. + + + + hive.optimize.cp + true + Whether to enable column pruner + + + + hive.optimize.ppd + true + Whether to enable predicate pushdown + + + + hive.optimize.ppd.storage + true + Whether to push predicates down into storage handlers. Ignored when hive.optimize.ppd is false. + + + + hive.optimize.pruner + true + Whether to enable the new partition pruner which depends on predicate pushdown. If this is disabled, + the old partition pruner which is based on AST will be enabled. + + + + hive.optimize.groupby + true + Whether to enable the bucketed group by from bucketed partitions/tables. + + + + hive.join.emit.interval + 1000 + How many rows in the right-most join operand Hive should buffer before emitting the join result. + + + + hive.join.cache.size + 25000 + How many rows in the joining tables (except the streaming table) should be cached in memory. + + + + hive.mapjoin.bucket.cache.size + 100 + How many values for each key in the map-joined table should be cached in memory. 
+ + + + hive.mapjoin.maxsize + 100000 + Maximum # of rows of the small table that can be handled by map-side join. If the size is reached and hive.task.progress is set, a fatal error counter is set and the job will be killed. + + + + hive.mapjoin.cache.numrows + 25000 + How many rows should be cached by jdbm for map join. + + + + hive.optimize.skewjoin + false + Whether to enable skew join optimization. + + + + hive.skewjoin.key + 100000 + Determine if we get a skew key in join. If we see more + than the specified number of rows with the same key in join operator, + we think the key as a skew join key. + + + + hive.skewjoin.mapjoin.map.tasks + 10000 + Determine the number of map task used in the follow up map join job + for a skew join. It should be used together with hive.skewjoin.mapjoin.min.split + to perform a fine grained control. + + + + hive.skewjoin.mapjoin.min.split + 33554432 + Determine the number of map task at most used in the follow up map join job + for a skew join by specifying the minimum split size. It should be used together with + hive.skewjoin.mapjoin.map.tasks to perform a fine grained control. + + + + hive.mapred.mode + nonstrict + The mode in which the hive operations are being performed. In strict mode, some risky queries are not allowed to run + + + + hive.exec.script.maxerrsize + 100000 + Maximum number of bytes a script is allowed to emit to standard error (per map-reduce task). This prevents runaway scripts from filling logs partitions to capacity + + + + hive.exec.script.allow.partial.consumption + false + When enabled, this option allows a user script to exit successfully without consuming all the data from the standard input. 
+ + + + + hive.script.operator.id.env.var + HIVE_SCRIPT_OPERATOR_ID + Name of the environment variable that holds the unique script operator ID in the user's transform function (the custom mapper/reducer that the user has specified in the query) + + + + + hive.exec.compress.output + false + This controls whether the final outputs of a query (to a local/hdfs file or a hive table) is compressed. The compression codec and other options are determined from hadoop config variables mapred.output.compress* + + + + hive.exec.compress.intermediate + false + This controls whether intermediate files produced by hive between multiple map-reduce jobs are compressed. The compression codec and other options are determined from hadoop config variables mapred.output.compress* + + + + hive.exec.parallel + false + Whether to execute jobs in parallel + + + + hive.exec.parallel.thread.number + 8 + How many jobs at most can be executed in parallel + + + + hive.task.progress + false + Whether Hive should periodically update task progress counters during execution. Enabling this allows task progress to be monitored more closely in the job tracker, but may impose a performance penalty. This flag is automatically set to true for jobs with hive.exec.dynamic.partition set to true. + + + + hive.hwi.war.file + lib/hive-hwi-@VERSION@.war + This sets the path to the HWI war file, relative to ${HIVE_HOME}. + + + + hive.hwi.listen.host + 0.0.0.0 + This is the host address the Hive Web Interface will listen on + + + + hive.hwi.listen.port + 9999 + This is the port the Hive Web Interface will listen on + + + + hive.exec.pre.hooks + + Comma-separated list of pre-execution hooks to be invoked for each statement. A pre-execution hook is specified as the name of a Java class which implements the org.apache.hadoop.hive.ql.hooks.PreExecute interface. + + + + hive.exec.post.hooks + + Comma-separated list of post-execution hooks to be invoked for each statement. 
A post-execution hook is specified as the name of a Java class which implements the org.apache.hadoop.hive.ql.hooks.PostExecute interface. + + + + hive.merge.mapfiles + true + Merge small files at the end of a map-only job + + + + hive.merge.mapredfiles + false + Merge small files at the end of a map-reduce job + + + + hive.mergejob.maponly + true + Try to generate a map-only job for merging files if CombineHiveInputFormat is supported. + + + + hive.heartbeat.interval + 1000 + Send a heartbeat after this interval - used by mapjoin and filter operators + + + + hive.merge.size.per.task + 256000000 + Size of merged files at the end of the job + + + + hive.merge.smallfiles.avgsize + 16000000 + When the average output file size of a job is less than this number, Hive will start an additional map-reduce job to merge the output files into bigger files. This is only done for map-only jobs if hive.merge.mapfiles is true, and for map-reduce jobs if hive.merge.mapredfiles is true. + + + + hive.script.auto.progress + false + Whether Hive Transform/Map/Reduce Clause should automatically send progress information to TaskTracker to avoid the task getting killed because of inactivity. Hive sends progress information when the script is outputting to stderr. This option removes the need of periodically producing stderr messages, but users should be cautious because this may prevent infinite loops in the scripts from being killed by TaskTracker. + + + + hive.script.serde + org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + The default serde for transmitting input data to and reading output data from the user scripts. + + + + hive.script.recordreader + org.apache.hadoop.hive.ql.exec.TextRecordReader + The default record reader for reading data from the user scripts. + + + + hive.script.recordwriter + org.apache.hadoop.hive.ql.exec.TextRecordWriter + The default record writer for writing data to the user scripts. 
+ + + + hive.input.format + org.apache.hadoop.hive.ql.io.HiveInputFormat + The default input format, if it is not specified, the system assigns it. It is set to HiveInputFormat for hadoop versions 17, 18 and 19, whereas it is set to CombinedHiveInputFormat for hadoop 20. The user can always overwrite it - if there is a bug in CombinedHiveInputFormat, it can always be manually set to HiveInputFormat. + + + + hive.udtf.auto.progress + false + Whether Hive should automatically send progress information to TaskTracker when using UDTF's to prevent the task getting killed because of inactivity. Users should be cautious because this may prevent TaskTracker from killing tasks with infinite loops. + + + + hive.mapred.reduce.tasks.speculative.execution + true + Whether speculative execution for reducers should be turned on. + + + + hive.exec.counters.pull.interval + 1000 + The interval with which to poll the JobTracker for the counters of the running job. The smaller it is the more load there will be on the jobtracker, the higher it is the less granular the collected counters will be. + + + + hive.enforce.bucketing + false + Whether bucketing is enforced. If true, while inserting into the table, bucketing is enforced. + + + + hive.enforce.sorting + false + Whether sorting is enforced. If true, while inserting into the table, sorting is enforced. + + + + hive.metastore.ds.connection.url.hook + + Name of the hook to use for retrieving the JDO connection URL. If empty, the value in javax.jdo.option.ConnectionURL is used + + + + hive.metastore.ds.retry.attempts + 1 + The number of times to retry a metastore call if there was a connection error + + + + hive.metastore.ds.retry.interval + 1000 + The number of milliseconds between metastore retry attempts + + + + hive.metastore.server.min.threads + 200 + Minimum number of worker threads in the Thrift server's pool. + + + + hive.metastore.server.max.threads + 100000 + Maximum number of worker threads in the Thrift server's pool. 
+ + + + hive.metastore.server.tcp.keepalive + true + Whether to enable TCP keepalive for the metastore server. Keepalive will prevent accumulation of half-open connections. + + + + hive.optimize.reducededuplication + true + Remove extra map-reduce jobs if the data is already clustered by the same key which needs to be used again. This should always be set to true. Since it is a new feature, it has been made configurable. + + + + hive.exec.dynamic.partition + false + Whether or not to allow dynamic partitions in DML/DDL. + + + + hive.exec.dynamic.partition.mode + strict + In strict mode, the user must specify at least one static partition in case the user accidentally overwrites all partitions. + + + + hive.exec.max.dynamic.partitions + 1000 + Maximum number of dynamic partitions allowed to be created in total. + + + + hive.exec.max.dynamic.partitions.pernode + 100 + Maximum number of dynamic partitions allowed to be created in each mapper/reducer node. + + + + hive.exec.max.created.files + 100000 + Maximum number of HDFS files created by all mappers/reducers in a MapReduce job. + + + + + hive.exec.default.partition.name + __HIVE_DEFAULT_PARTITION__ + The default partition name in case the dynamic partition column value is null/empty string or any other value that cannot be escaped. This value must not contain any special character used in HDFS URI (e.g., ':', '%', '/' etc). The user has to be aware that the dynamic partition value should not contain this value to avoid confusion. + + + + hive.support.concurrency + false + Whether hive supports concurrency or not. A zookeeper instance must be up and running for the default hive lock manager to support read-write locks. + + + + hive.concurrency.manager + org.apache.hadoop.hive.ql.lockmgr.ZooKeeperLockMgr + The concurrency manager for hive. 
+ + + + hive.lock.numretries + 100 + The number of times you want to try to get all the locks + + + + hive.lock.sleep.between.retries + 60 + The sleep time (in seconds) between various retries + + + + hive.zookeeper.quorum + + The list of zookeeper servers to talk to. This is only needed for read/write locks. + + + + hive.zookeeper.client.port + 2181 + The port of zookeeper servers to talk to. This is only needed for read/write locks. + + + + hive.zookeeper.session.timeout + + Zookeeper client's session timeout. The client is disconnected, and as a result, all locks released, if a heartbeat is not sent in the timeout. + + + + hive.zookeeper.namespace + hive_zookeeper_namespace + The parent node under which all zookeeper nodes are created. + + + + fs.har.impl + org.apache.hadoop.hive.shims.HiveHarFileSystem + The implementation for accessing Hadoop Archives. Note that this won't be applicable to Hadoop vers less than 0.20 + + + + hive.archive.enabled + false + Whether archiving operations are permitted + + + + hive.archive.har.parentdir.settable + false + In new Hadoop versions, the parent directory must be set while + creating a HAR. Because this functionality is hard to detect with just version + numbers, this conf var needs to be set manually. + + + + hive.exec.mode.local.auto + false + Let hive determine whether to run in local mode automatically + + + + hive.exec.show.job.failure.debug.info + false + + If a job fails, whether to provide a link in the CLI to the task with the + most failures, along with debugging hints if applicable. + + + + + hive.auto.progress.timeout + 0 + + How long to run autoprogressor for the script/UDTF operators (in seconds). + Set to 0 for forever. + + + + + + + hive.hbase.wal.enabled + true + Whether writes to HBase should be forced to the write-ahead log. Disabling this improves HBase write performance at the risk of lost writes in case of a crash. 
+ + + + hive.table.parameters.default + + Default property values for newly created tables + + + diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 47b7518..e6071bb 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -487,14 +487,16 @@ public class HiveConf extends Configuration { // let's add the hive configuration URL hconfurl = getClassLoader().getResource("hive-default.xml"); if (hconfurl == null) { - l4j.debug("hive-default.xml not found."); + l4j.info("hive-default.xml not found."); } else { + l4j.info("Loading hive-default.xml configuration from " + hconfurl.toString()); addResource(hconfurl); } URL hsiteurl = getClassLoader().getResource("hive-site.xml"); if (hsiteurl == null) { - l4j.debug("hive-site.xml not found."); + l4j.info("hive-site.xml not found."); } else { + l4j.info("Loading hive-site.xml configuration from " + hsiteurl.toString()); addResource(hsiteurl); } diff --git conf/hive-default.xml conf/hive-default.xml deleted file mode 100644 index 1465317..0000000 --- conf/hive-default.xml +++ /dev/null @@ -1,721 +0,0 @@ - - - - - - - - - - - - - - mapred.reduce.tasks - -1 - The default number of reduce tasks per job. Typically set - to a prime close to the number of available hosts. Ignored when - mapred.job.tracker is "local". Hadoop set this to 1 by default, whereas hive uses -1 as its default value. - By setting this property to -1, Hive will automatically figure out what should be the number of reducers. - - - - - hive.exec.reducers.bytes.per.reducer - 1000000000 - size per reducer.The default is 1G, i.e if the input size is 10G, it will use 10 reducers. - - - - hive.exec.reducers.max - 999 - max number of reducers will be used. 
If the one - specified in the configuration parameter mapred.reduce.tasks is - negative, hive will use this one as the max number of reducers when - automatically determine number of reducers. - - - - hive.exec.scratchdir - /tmp/hive-${user.name} - Scratch space for Hive jobs - - - - hive.test.mode - false - whether hive is running in test mode. If yes, it turns on sampling and prefixes the output tablename - - - - hive.test.mode.prefix - test_ - if hive is running in test mode, prefixes the output table by this string - - - - - - - - - - - hive.test.mode.samplefreq - 32 - if hive is running in test mode and table is not bucketed, sampling frequency - - - - hive.test.mode.nosamplelist - - if hive is running in test mode, dont sample the above comma seperated list of tables - - - - hive.metastore.local - true - controls whether to connect to remove metastore server or open a new metastore server in Hive Client JVM - - - - javax.jdo.option.ConnectionURL - jdbc:derby:;databaseName=metastore_db;create=true - JDBC connect string for a JDBC metastore - - - - javax.jdo.option.ConnectionDriverName - org.apache.derby.jdbc.EmbeddedDriver - Driver class name for a JDBC metastore - - - - javax.jdo.PersistenceManagerFactoryClass - org.datanucleus.jdo.JDOPersistenceManagerFactory - class implementing the jdo persistence - - - - javax.jdo.option.DetachAllOnCommit - true - detaches all objects from session so that they can be used after transaction is committed - - - - javax.jdo.option.NonTransactionalRead - true - reads outside of transactions - - - - javax.jdo.option.ConnectionUserName - APP - username to use against metastore database - - - - javax.jdo.option.ConnectionPassword - mine - password to use against metastore database - - - - datanucleus.connectionPoolingType - DBCP - Uses a DBCP connection pool for JDBC metastore - - - - datanucleus.validateTables - false - validates existing schema against code. 
turn this on if you want to verify existing schema - - - - datanucleus.validateColumns - false - validates existing schema against code. turn this on if you want to verify existing schema - - - - datanucleus.validateConstraints - false - validates existing schema against code. turn this on if you want to verify existing schema - - - - datanucleus.storeManagerType - rdbms - metadata store type - - - - datanucleus.autoCreateSchema - true - creates necessary schema on a startup if one doesn't exist. set this to false, after creating it once - - - - datanucleus.autoStartMechanismMode - checked - throw exception if metadata tables are incorrect - - - - datanucleus.transactionIsolation - read-committed - Default transaction isolation level for identity generation. - - - - datanucleus.cache.level2 - false - Use a level 2 cache. Turn this off if metadata is changed independently of hive metastore server - - - - datanucleus.cache.level2.type - SOFT - SOFT=soft reference based cache, WEAK=weak reference based cache. - - - - datanucleus.identifierFactory - datanucleus - Name of the identifier factory to use when generating table/column names etc. 'datanucleus' is used for backward compatibility - - - - datanucleus.plugin.pluginRegistryBundleCheck - LOG - Defines what happens when plugin bundles are found and are duplicated [EXCEPTION|LOG|NONE] - - - - hive.metastore.warehouse.dir - /user/hive/warehouse - location of default database for the warehouse - - - - hive.metastore.connect.retries - 5 - Number of retries while opening a connection to metastore - - - - hive.metastore.rawstore.impl - org.apache.hadoop.hive.metastore.ObjectStore - Name of the class that implements org.apache.hadoop.hive.metastore.rawstore interface. This class is used to store and retrieval of raw metadata objects such as table, database - - - - hive.default.fileformat - TextFile - Default file format for CREATE TABLE statement. Options are TextFile and SequenceFile. 
Users can explicitly say CREATE TABLE ... STORED AS <TEXTFILE|SEQUENCEFILE> to override - - - - hive.fileformat.check - true - Whether to check file format or not when loading data files - - - - hive.map.aggr - true - Whether to use map-side aggregation in Hive Group By queries - - - - hive.groupby.skewindata - false - Whether there is skew in data to optimize group by queries - - - - hive.groupby.mapaggr.checkinterval - 100000 - Number of rows after which size of the grouping keys/aggregation classes is performed - - - - hive.mapred.local.mem - 0 - For local mode, memory of the mappers/reducers - - - - hive.map.aggr.hash.percentmemory - 0.5 - Portion of total memory to be used by map-side grup aggregation hash table - - - - hive.map.aggr.hash.min.reduction - 0.5 - Hash aggregation will be turned off if the ratio between hash - table size and input rows is bigger than this number. Set to 1 to make sure - hash aggregation is never turned off. - - - - hive.optimize.cp - true - Whether to enable column pruner - - - - hive.optimize.ppd - true - Whether to enable predicate pushdown - - - - hive.optimize.ppd.storage - true - Whether to push predicates down into storage handlers. Ignored when hive.optimize.ppd is false. - - - - hive.optimize.pruner - true - Whether to enable the new partition pruner which depends on predicate pushdown. If this is disabled, - the old partition pruner which is based on AST will be enabled. - - - - hive.optimize.groupby - true - Whether to enable the bucketed group by from bucketed partitions/tables. - - - - hive.join.emit.interval - 1000 - How many rows in the right-most join operand Hive should buffer before emitting the join result. - - - - hive.join.cache.size - 25000 - How many rows in the joining tables (except the streaming table) should be cached in memory. - - - - hive.mapjoin.bucket.cache.size - 100 - How many values in each keys in the map-joined table should be cached in memory. 
- - - - hive.mapjoin.maxsize - 100000 - Maximum # of rows of the small table that can be handled by map-side join. If the size is reached and hive.task.progress is set, a fatal error counter is set and the job will be killed. - - - - hive.mapjoin.cache.numrows - 25000 - How many rows should be cached by jdbm for map join. - - - - hive.optimize.skewjoin - false - Whether to enable skew join optimization. - - - - hive.skewjoin.key - 100000 - Determine if we get a skew key in join. If we see more - than the specified number of rows with the same key in join operator, - we think the key as a skew join key. - - - - hive.skewjoin.mapjoin.map.tasks - 10000 - Determine the number of map task used in the follow up map join job - for a skew join. It should be used together with hive.skewjoin.mapjoin.min.split - to perform a fine grained control. - - - - hive.skewjoin.mapjoin.min.split - 33554432 - Determine the number of map task at most used in the follow up map join job - for a skew join by specifying the minimum split size. It should be used together with - hive.skewjoin.mapjoin.map.tasks to perform a fine grained control. - - - - hive.mapred.mode - nonstrict - The mode in which the hive operations are being performed. In strict mode, some risky queries are not allowed to run - - - - hive.exec.script.maxerrsize - 100000 - Maximum number of bytes a script is allowed to emit to standard error (per map-reduce task). This prevents runaway scripts from filling logs partitions to capacity - - - - hive.exec.script.allow.partial.consumption - false - When enabled, this option allows a user script to exit successfully without consuming all the data from the standard input. 
- - - - - hive.script.operator.id.env.var - HIVE_SCRIPT_OPERATOR_ID - Name of the environment variable that holds the unique script operator ID in the user's transform function (the custom mapper/reducer that the user has specified in the query) - - - - - hive.exec.compress.output - false - This controls whether the final outputs of a query (to a local/hdfs file or a hive table) is compressed. The compression codec and other options are determined from hadoop config variables mapred.output.compress* - - - - hive.exec.compress.intermediate - false - This controls whether intermediate files produced by hive between multiple map-reduce jobs are compressed. The compression codec and other options are determined from hadoop config variables mapred.output.compress* - - - - hive.exec.parallel - false - Whether to execute jobs in parallel - - - - hive.exec.parallel.thread.number - 8 - How many jobs at most can be executed in parallel - - - - hive.task.progress - false - Whether Hive should periodically update task progress counters during execution. Enabling this allows task progress to be monitored more closely in the job tracker, but may impose a performance penalty. This flag is automatically set to true for jobs with hive.exec.dynamic.partition set to true. - - - - hive.hwi.war.file - lib/hive-hwi-@VERSION@.war - This sets the path to the HWI war file, relative to ${HIVE_HOME}. - - - - hive.hwi.listen.host - 0.0.0.0 - This is the host address the Hive Web Interface will listen on - - - - hive.hwi.listen.port - 9999 - This is the port the Hive Web Interface will listen on - - - - hive.exec.pre.hooks - - Comma-separated list of pre-execution hooks to be invoked for each statement. A pre-execution hook is specified as the name of a Java class which implements the org.apache.hadoop.hive.ql.hooks.PreExecute interface. - - - - hive.exec.post.hooks - - Comma-separated list of post-execution hooks to be invoked for each statement. 
A post-execution hook is specified as the name of a Java class which implements the org.apache.hadoop.hive.ql.hooks.PostExecute interface. - - - - hive.merge.mapfiles - true - Merge small files at the end of a map-only job - - - - hive.merge.mapredfiles - false - Merge small files at the end of a map-reduce job - - - - hive.mergejob.maponly - true - Try to generate a map-only job for merging files if CombineHiveInputFormat is supported. - - - - hive.heartbeat.interval - 1000 - Send a heartbeat after this interval - used by mapjoin and filter operators - - - - hive.merge.size.per.task - 256000000 - Size of merged files at the end of the job - - - - hive.merge.smallfiles.avgsize - 16000000 - When the average output file size of a job is less than this number, Hive will start an additional map-reduce job to merge the output files into bigger files. This is only done for map-only jobs if hive.merge.mapfiles is true, and for map-reduce jobs if hive.merge.mapredfiles is true. - - - - hive.script.auto.progress - false - Whether Hive Tranform/Map/Reduce Clause should automatically send progress information to TaskTracker to avoid the task getting killed because of inactivity. Hive sends progress information when the script is outputting to stderr. This option removes the need of periodically producing stderr messages, but users should be cautious because this may prevent infinite loops in the scripts to be killed by TaskTracker. - - - - hive.script.serde - org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - The default serde for trasmitting input data to and reading output data from the user scripts. - - - - hive.script.recordreader - org.apache.hadoop.hive.ql.exec.TextRecordReader - The default record reader for reading data from the user scripts. - - - - hive.script.recordwriter - org.apache.hadoop.hive.ql.exec.TextRecordWriter - The default record writer for writing data to the user scripts. 
- - - - hive.input.format - org.apache.hadoop.hive.ql.io.HiveInputFormat - The default input format, if it is not specified, the system assigns it. It is set to HiveInputFormat for hadoop versions 17, 18 and 19, whereas it is set to CombinedHiveInputFormat for hadoop 20. The user can always overwrite it - if there is a bug in CombinedHiveInputFormat, it can always be manually set to HiveInputFormat. - - - - hive.udtf.auto.progress - false - Whether Hive should automatically send progress information to TaskTracker when using UDTF's to prevent the task getting killed because of inactivity. Users should be cautious because this may prevent TaskTracker from killing tasks with infinte loops. - - - - hive.mapred.reduce.tasks.speculative.execution - true - Whether speculative execution for reducers should be turned on. - - - - hive.exec.counters.pull.interval - 1000 - The interval with which to poll the JobTracker for the counters the running job. The smaller it is the more load there will be on the jobtracker, the higher it is the less granular the caught will be. - - - - hive.enforce.bucketing - false - Whether bucketing is enforced. If true, while inserting into the table, bucketing is enforced. - - - - hive.enforce.sorting - false - Whether sorting is enforced. If true, while inserting into the table, sorting is enforced. - - - - hive.metastore.ds.connection.url.hook - - Name of the hook to use for retriving the JDO connection URL. If empty, the value in javax.jdo.option.ConnectionURL is used - - - - hive.metastore.ds.retry.attempts - 1 - The number of times to retry a metastore call if there were a connection error - - - - hive.metastore.ds.retry.interval - 1000 - The number of miliseconds between metastore retry attempts - - - - hive.metastore.server.min.threads - 200 - Minimum number of worker threads in the Thrift server's pool. - - - - hive.metastore.server.max.threads - 100000 - Maximum number of worker threads in the Thrift server's pool. 
- - - - hive.metastore.server.tcp.keepalive - true - Whether to enable TCP keepalive for the metastore server. Keepalive will prevent accumulation of half-open connections. - - - - hive.optimize.reducededuplication - true - Remove extra map-reduce jobs if the data is already clustered by the same key which needs to be used again. This should always be set to true. Since it is a new feature, it has been made configurable. - - - - hive.exec.dynamic.partition - false - Whether or not to allow dynamic partitions in DML/DDL. - - - - hive.exec.dynamic.partition.mode - strict - In strict mode, the user must specify at least one static partition in case the user accidentally overwrites all partitions. - - - - hive.exec.max.dynamic.partitions - 1000 - Maximum number of dynamic partitions allowed to be created in total. - - - - hive.exec.max.dynamic.partitions.pernode - 100 - Maximum number of dynamic partitions allowed to be created in each mapper/reducer node. - - - - hive.exec.max.created.files - 100000 - Maximum number of HDFS files created by all mappers/reducers in a MapReduce job. - - - - - hive.exec.default.partition.name - __HIVE_DEFAULT_PARTITION__ - The default partition name in case the dynamic partition column value is null/empty string or anyother values that cannot be escaped. This value must not contain any special character used in HDFS URI (e.g., ':', '%', '/' etc). The user has to be aware that the dynamic partition value should not contain this value to avoid confusions. - - - - hive.support.concurrency - false - Whether hive supports concurrency or not. A zookeeper instance must be up and running for the default hive lock manager to support read-write locks. - - - - hive.concurrency.manager - org.apache.hadoop.hive.ql.lockmgr.ZooKeeperLockMgr - The concurrency manager for hive. 
- - - - hive.lock.numretries - 100 - The number of times you want to try to get all the locks - - - - hive.lock.sleep.between.retries - 60 - The sleep time (in seconds) between various retries - - - - hive.zookeeper.quorum - - The list of zookeeper servers to talk to. This is only needed for read/write locks. - - - - hive.zookeeper.client.port - 2181 - The port of zookeeper servers to talk to. This is only needed for read/write locks. - - - - hive.zookeeper.session.timeout - - Zookeeper client's session timeout. The client is disconnected, and as a result, all locks released, if a heartbeat is not sent in the timeout. - - - - hive.zookeeper.namespace - hive_zookeeper_namespace - The parent node under which all zookeeper nodes are created. - - - - fs.har.impl - org.apache.hadoop.hive.shims.HiveHarFileSystem - The implementation for accessing Hadoop Archives. Note that this won't be applicable to Hadoop vers less than 0.20 - - - - hive.archive.enabled - false - Whether archiving operations are permitted - - - - hive.archive.har.parentdir.settable - false - In new Hadoop versions, the parent directory must be set while - creating a HAR. Because this functionality is hard to detect with just version - numbers, this conf var needs to be set manually. - - - - hive.exec.mode.local.auto - false - Let hive determine whether to run in local mode automatically - - - - hive.exec.show.job.failure.debug.info - false - - If a job fails, whether to provide a link in the CLI to the task with the - most failures, along with debugging hints if applicable. - - - - - hive.auto.progress.timeout - 0 - - How long to run autoprogressor for the script/UDTF operators (in seconds). - Set to 0 for forever. - - - - - - - hive.hbase.wal.enabled - true - Whether writes to HBase should be forced to the write-ahead log. Disabling this improves HBase write performance at the risk of lost writes in case of a crash. 
- - - - hive.table.parameters.default - - Default property values for newly created tables - - - diff --git conf/hive-site.xml.template conf/hive-site.xml.template new file mode 100644 index 0000000..1465317 --- /dev/null +++ conf/hive-site.xml.template @@ -0,0 +1,721 @@ + + + + + + + + + + + + + + mapred.reduce.tasks + -1 + The default number of reduce tasks per job. Typically set + to a prime close to the number of available hosts. Ignored when + mapred.job.tracker is "local". Hadoop set this to 1 by default, whereas hive uses -1 as its default value. + By setting this property to -1, Hive will automatically figure out what should be the number of reducers. + + + + + hive.exec.reducers.bytes.per.reducer + 1000000000 + size per reducer.The default is 1G, i.e if the input size is 10G, it will use 10 reducers. + + + + hive.exec.reducers.max + 999 + max number of reducers will be used. If the one + specified in the configuration parameter mapred.reduce.tasks is + negative, hive will use this one as the max number of reducers when + automatically determine number of reducers. + + + + hive.exec.scratchdir + /tmp/hive-${user.name} + Scratch space for Hive jobs + + + + hive.test.mode + false + whether hive is running in test mode. 
If yes, it turns on sampling and prefixes the output tablename + + + + hive.test.mode.prefix + test_ + if hive is running in test mode, prefixes the output table by this string + + + + + + + + + + + hive.test.mode.samplefreq + 32 + if hive is running in test mode and table is not bucketed, sampling frequency + + + + hive.test.mode.nosamplelist + + if hive is running in test mode, don't sample the above comma separated list of tables + + + + hive.metastore.local + true + controls whether to connect to remote metastore server or open a new metastore server in Hive Client JVM + + + + javax.jdo.option.ConnectionURL + jdbc:derby:;databaseName=metastore_db;create=true + JDBC connect string for a JDBC metastore + + + + javax.jdo.option.ConnectionDriverName + org.apache.derby.jdbc.EmbeddedDriver + Driver class name for a JDBC metastore + + + + javax.jdo.PersistenceManagerFactoryClass + org.datanucleus.jdo.JDOPersistenceManagerFactory + class implementing the jdo persistence + + + + javax.jdo.option.DetachAllOnCommit + true + detaches all objects from session so that they can be used after transaction is committed + + + + javax.jdo.option.NonTransactionalRead + true + reads outside of transactions + + + + javax.jdo.option.ConnectionUserName + APP + username to use against metastore database + + + + javax.jdo.option.ConnectionPassword + mine + password to use against metastore database + + + + datanucleus.connectionPoolingType + DBCP + Uses a DBCP connection pool for JDBC metastore + + + + datanucleus.validateTables + false + validates existing schema against code. turn this on if you want to verify existing schema + + + + datanucleus.validateColumns + false + validates existing schema against code. turn this on if you want to verify existing schema + + + + datanucleus.validateConstraints + false + validates existing schema against code. 
turn this on if you want to verify existing schema + + + + datanucleus.storeManagerType + rdbms + metadata store type + + + + datanucleus.autoCreateSchema + true + creates necessary schema on a startup if one doesn't exist. set this to false, after creating it once + + + + datanucleus.autoStartMechanismMode + checked + throw exception if metadata tables are incorrect + + + + datanucleus.transactionIsolation + read-committed + Default transaction isolation level for identity generation. + + + + datanucleus.cache.level2 + false + Use a level 2 cache. Turn this off if metadata is changed independently of hive metastore server + + + + datanucleus.cache.level2.type + SOFT + SOFT=soft reference based cache, WEAK=weak reference based cache. + + + + datanucleus.identifierFactory + datanucleus + Name of the identifier factory to use when generating table/column names etc. 'datanucleus' is used for backward compatibility + + + + datanucleus.plugin.pluginRegistryBundleCheck + LOG + Defines what happens when plugin bundles are found and are duplicated [EXCEPTION|LOG|NONE] + + + + hive.metastore.warehouse.dir + /user/hive/warehouse + location of default database for the warehouse + + + + hive.metastore.connect.retries + 5 + Number of retries while opening a connection to metastore + + + + hive.metastore.rawstore.impl + org.apache.hadoop.hive.metastore.ObjectStore + Name of the class that implements org.apache.hadoop.hive.metastore.rawstore interface. This class is used to store and retrieval of raw metadata objects such as table, database + + + + hive.default.fileformat + TextFile + Default file format for CREATE TABLE statement. Options are TextFile and SequenceFile. Users can explicitly say CREATE TABLE ... 
STORED AS <TEXTFILE|SEQUENCEFILE> to override + + + + hive.fileformat.check + true + Whether to check file format or not when loading data files + + + + hive.map.aggr + true + Whether to use map-side aggregation in Hive Group By queries + + + + hive.groupby.skewindata + false + Whether there is skew in data to optimize group by queries + + + + hive.groupby.mapaggr.checkinterval + 100000 + Number of rows after which the size check of the grouping keys/aggregation classes is performed + + + + hive.mapred.local.mem + 0 + For local mode, memory of the mappers/reducers + + + + hive.map.aggr.hash.percentmemory + 0.5 + Portion of total memory to be used by map-side group aggregation hash table + + + + hive.map.aggr.hash.min.reduction + 0.5 + Hash aggregation will be turned off if the ratio between hash + table size and input rows is bigger than this number. Set to 1 to make sure + hash aggregation is never turned off. + + + + hive.optimize.cp + true + Whether to enable column pruner + + + + hive.optimize.ppd + true + Whether to enable predicate pushdown + + + + hive.optimize.ppd.storage + true + Whether to push predicates down into storage handlers. Ignored when hive.optimize.ppd is false. + + + + hive.optimize.pruner + true + Whether to enable the new partition pruner which depends on predicate pushdown. If this is disabled, + the old partition pruner which is based on AST will be enabled. + + + + hive.optimize.groupby + true + Whether to enable the bucketed group by from bucketed partitions/tables. + + + + hive.join.emit.interval + 1000 + How many rows in the right-most join operand Hive should buffer before emitting the join result. + + + + hive.join.cache.size + 25000 + How many rows in the joining tables (except the streaming table) should be cached in memory. + + + + hive.mapjoin.bucket.cache.size + 100 + How many values in each keys in the map-joined table should be cached in memory. 
+ + + + hive.mapjoin.maxsize + 100000 + Maximum # of rows of the small table that can be handled by map-side join. If the size is reached and hive.task.progress is set, a fatal error counter is set and the job will be killed. + + + + hive.mapjoin.cache.numrows + 25000 + How many rows should be cached by jdbm for map join. + + + + hive.optimize.skewjoin + false + Whether to enable skew join optimization. + + + + hive.skewjoin.key + 100000 + Determine if we get a skew key in join. If we see more + than the specified number of rows with the same key in join operator, + we treat the key as a skew join key. + + + + hive.skewjoin.mapjoin.map.tasks + 10000 + Determine the number of map tasks used in the follow up map join job + for a skew join. It should be used together with hive.skewjoin.mapjoin.min.split + to perform a fine grained control. + + + + hive.skewjoin.mapjoin.min.split + 33554432 + Determine the maximum number of map tasks used in the follow up map join job + for a skew join by specifying the minimum split size. It should be used together with + hive.skewjoin.mapjoin.map.tasks to perform a fine grained control. + + + + hive.mapred.mode + nonstrict + The mode in which the hive operations are being performed. In strict mode, some risky queries are not allowed to run + + + + hive.exec.script.maxerrsize + 100000 + Maximum number of bytes a script is allowed to emit to standard error (per map-reduce task). This prevents runaway scripts from filling log partitions to capacity + + + + hive.exec.script.allow.partial.consumption + false + When enabled, this option allows a user script to exit successfully without consuming all the data from the standard input. 
+ + + + + hive.script.operator.id.env.var + HIVE_SCRIPT_OPERATOR_ID + Name of the environment variable that holds the unique script operator ID in the user's transform function (the custom mapper/reducer that the user has specified in the query) + + + + + hive.exec.compress.output + false + This controls whether the final outputs of a query (to a local/hdfs file or a hive table) is compressed. The compression codec and other options are determined from hadoop config variables mapred.output.compress* + + + + hive.exec.compress.intermediate + false + This controls whether intermediate files produced by hive between multiple map-reduce jobs are compressed. The compression codec and other options are determined from hadoop config variables mapred.output.compress* + + + + hive.exec.parallel + false + Whether to execute jobs in parallel + + + + hive.exec.parallel.thread.number + 8 + How many jobs at most can be executed in parallel + + + + hive.task.progress + false + Whether Hive should periodically update task progress counters during execution. Enabling this allows task progress to be monitored more closely in the job tracker, but may impose a performance penalty. This flag is automatically set to true for jobs with hive.exec.dynamic.partition set to true. + + + + hive.hwi.war.file + lib/hive-hwi-@VERSION@.war + This sets the path to the HWI war file, relative to ${HIVE_HOME}. + + + + hive.hwi.listen.host + 0.0.0.0 + This is the host address the Hive Web Interface will listen on + + + + hive.hwi.listen.port + 9999 + This is the port the Hive Web Interface will listen on + + + + hive.exec.pre.hooks + + Comma-separated list of pre-execution hooks to be invoked for each statement. A pre-execution hook is specified as the name of a Java class which implements the org.apache.hadoop.hive.ql.hooks.PreExecute interface. + + + + hive.exec.post.hooks + + Comma-separated list of post-execution hooks to be invoked for each statement. 
A post-execution hook is specified as the name of a Java class which implements the org.apache.hadoop.hive.ql.hooks.PostExecute interface. + + + + hive.merge.mapfiles + true + Merge small files at the end of a map-only job + + + + hive.merge.mapredfiles + false + Merge small files at the end of a map-reduce job + + + + hive.mergejob.maponly + true + Try to generate a map-only job for merging files if CombineHiveInputFormat is supported. + + + + hive.heartbeat.interval + 1000 + Send a heartbeat after this interval - used by mapjoin and filter operators + + + + hive.merge.size.per.task + 256000000 + Size of merged files at the end of the job + + + + hive.merge.smallfiles.avgsize + 16000000 + When the average output file size of a job is less than this number, Hive will start an additional map-reduce job to merge the output files into bigger files. This is only done for map-only jobs if hive.merge.mapfiles is true, and for map-reduce jobs if hive.merge.mapredfiles is true. + + + + hive.script.auto.progress + false + Whether Hive Transform/Map/Reduce Clause should automatically send progress information to TaskTracker to avoid the task getting killed because of inactivity. Hive sends progress information when the script is outputting to stderr. This option removes the need of periodically producing stderr messages, but users should be cautious because this may prevent infinite loops in the scripts from being killed by TaskTracker. + + + + hive.script.serde + org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + The default serde for transmitting input data to and reading output data from the user scripts. + + + + hive.script.recordreader + org.apache.hadoop.hive.ql.exec.TextRecordReader + The default record reader for reading data from the user scripts. + + + + hive.script.recordwriter + org.apache.hadoop.hive.ql.exec.TextRecordWriter + The default record writer for writing data to the user scripts. 
+ + + + hive.input.format + org.apache.hadoop.hive.ql.io.HiveInputFormat + The default input format, if it is not specified, the system assigns it. It is set to HiveInputFormat for hadoop versions 17, 18 and 19, whereas it is set to CombineHiveInputFormat for hadoop 20. The user can always overwrite it - if there is a bug in CombineHiveInputFormat, it can always be manually set to HiveInputFormat. + + + + hive.udtf.auto.progress + false + Whether Hive should automatically send progress information to TaskTracker when using UDTF's to prevent the task getting killed because of inactivity. Users should be cautious because this may prevent TaskTracker from killing tasks with infinite loops. + + + + hive.mapred.reduce.tasks.speculative.execution + true + Whether speculative execution for reducers should be turned on. + + + + hive.exec.counters.pull.interval + 1000 + The interval with which to poll the JobTracker for the counters of the running job. The smaller it is the more load there will be on the jobtracker, the higher it is the less granular the captured data will be. + + + + hive.enforce.bucketing + false + Whether bucketing is enforced. If true, while inserting into the table, bucketing is enforced. + + + + hive.enforce.sorting + false + Whether sorting is enforced. If true, while inserting into the table, sorting is enforced. + + + + hive.metastore.ds.connection.url.hook + + Name of the hook to use for retrieving the JDO connection URL. If empty, the value in javax.jdo.option.ConnectionURL is used + + + + hive.metastore.ds.retry.attempts + 1 + The number of times to retry a metastore call if there were a connection error + + + + hive.metastore.ds.retry.interval + 1000 + The number of milliseconds between metastore retry attempts + + + + hive.metastore.server.min.threads + 200 + Minimum number of worker threads in the Thrift server's pool. + + + + hive.metastore.server.max.threads + 100000 + Maximum number of worker threads in the Thrift server's pool. 
+ + + + hive.metastore.server.tcp.keepalive + true + Whether to enable TCP keepalive for the metastore server. Keepalive will prevent accumulation of half-open connections. + + + + hive.optimize.reducededuplication + true + Remove extra map-reduce jobs if the data is already clustered by the same key which needs to be used again. This should always be set to true. Since it is a new feature, it has been made configurable. + + + + hive.exec.dynamic.partition + false + Whether or not to allow dynamic partitions in DML/DDL. + + + + hive.exec.dynamic.partition.mode + strict + In strict mode, the user must specify at least one static partition in case the user accidentally overwrites all partitions. + + + + hive.exec.max.dynamic.partitions + 1000 + Maximum number of dynamic partitions allowed to be created in total. + + + + hive.exec.max.dynamic.partitions.pernode + 100 + Maximum number of dynamic partitions allowed to be created in each mapper/reducer node. + + + + hive.exec.max.created.files + 100000 + Maximum number of HDFS files created by all mappers/reducers in a MapReduce job. + + + + + hive.exec.default.partition.name + __HIVE_DEFAULT_PARTITION__ + The default partition name in case the dynamic partition column value is null/empty string or any other value that cannot be escaped. This value must not contain any special character used in HDFS URI (e.g., ':', '%', '/' etc). The user has to be aware that the dynamic partition value should not contain this value to avoid confusion. + + + + hive.support.concurrency + false + Whether hive supports concurrency or not. A zookeeper instance must be up and running for the default hive lock manager to support read-write locks. + + + + hive.concurrency.manager + org.apache.hadoop.hive.ql.lockmgr.ZooKeeperLockMgr + The concurrency manager for hive. 
+ + + + hive.lock.numretries + 100 + The number of times you want to try to get all the locks + + + + hive.lock.sleep.between.retries + 60 + The sleep time (in seconds) between various retries + + + + hive.zookeeper.quorum + + The list of zookeeper servers to talk to. This is only needed for read/write locks. + + + + hive.zookeeper.client.port + 2181 + The port of zookeeper servers to talk to. This is only needed for read/write locks. + + + + hive.zookeeper.session.timeout + + Zookeeper client's session timeout. The client is disconnected, and as a result, all locks released, if a heartbeat is not sent within the timeout. + + + + hive.zookeeper.namespace + hive_zookeeper_namespace + The parent node under which all zookeeper nodes are created. + + + + fs.har.impl + org.apache.hadoop.hive.shims.HiveHarFileSystem + The implementation for accessing Hadoop Archives. Note that this won't be applicable to Hadoop versions earlier than 0.20 + + + + hive.archive.enabled + false + Whether archiving operations are permitted + + + + hive.archive.har.parentdir.settable + false + In new Hadoop versions, the parent directory must be set while + creating a HAR. Because this functionality is hard to detect with just version + numbers, this conf var needs to be set manually. + + + + hive.exec.mode.local.auto + false + Let hive determine whether to run in local mode automatically + + + + hive.exec.show.job.failure.debug.info + false + + If a job fails, whether to provide a link in the CLI to the task with the + most failures, along with debugging hints if applicable. + + + + + hive.auto.progress.timeout + 0 + + How long to run autoprogressor for the script/UDTF operators (in seconds). + Set to 0 for forever. + + + + + + + hive.hbase.wal.enabled + true + Whether writes to HBase should be forced to the write-ahead log. Disabling this improves HBase write performance at the risk of lost writes in case of a crash. 
+ + + + hive.table.parameters.default + + Default property values for newly created tables + + +