diff --git .gitignore .gitignore
index d0c97d1..ae78098 100644
--- .gitignore
+++ .gitignore
@@ -25,3 +25,4 @@ hcatalog/core/target
 hcatalog/webhcat/java-client/target
 hcatalog/storage-handlers/hbase/target
 hcatalog/webhcat/svr/target
+conf/hive-default.xml.template
diff --git common/pom.xml common/pom.xml
index b7f6642..4b5b6ba 100644
--- common/pom.xml
+++ common/pom.xml
@@ -108,6 +108,35 @@
+  <profiles>
+    <profile>
+      <id>dist</id>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.apache.maven.plugins</groupId>
+            <artifactId>maven-antrun-plugin</artifactId>
+            <executions>
+              <execution>
+                <id>generate-template</id>
+                <phase>package</phase>
+                <configuration>
+                  <!-- ... -->
+                </configuration>
+                <goals>
+                  <goal>run</goal>
+                </goals>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+  </profiles>
@@ -149,21 +178,6 @@
               <goal>run</goal>
             </goals>
           </execution>
-          <execution>
-            <id>generate-template</id>
-            <phase>package</phase>
-            <configuration>
-              <!-- ... -->
-            </configuration>
-            <goals>
-              <goal>run</goal>
-            </goals>
-          </execution>
diff --git conf/hive-default.xml.template conf/hive-default.xml.template
deleted file mode 100644
index d6a8e70..0000000
--- conf/hive-default.xml.template
+++ /dev/null
@@ -1,3045 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- ... -->
-<configuration>
-  <property>
-    <name>hive.exec.script.wrapper</name>
-    <value/>
-    <description/>
-  </property>
-  <property>
-    <name>hive.exec.plan</name>
-    <value/>
-    <description/>
-  </property>
-  <property>
-    <name>hive.plan.serialization.format</name>
-    <value>kryo</value>
-    <description>
-      Query plan format serialization between client and task nodes.
-      Two supported values are : kryo and javaXML. Kryo is default.
-    </description>
-  </property>
-  <property>
-    <name>hive.exec.scratchdir</name>
-    <value>/tmp/hive-${system:user.name}</value>
-    <description>Scratch space for Hive jobs</description>
-  </property>
-  <property>
-    <name>hive.exec.local.scratchdir</name>
-    <value>${system:java.io.tmpdir}/${system:user.name}</value>
-    <description>Local scratch space for Hive jobs</description>
-  </property>
-  <property>
-    <name>hive.scratch.dir.permission</name>
-    <value>700</value>
-    <description/>
-  </property>
-  <property>
-    <name>hive.exec.submitviachild</name>
-    <value>false</value>
-    <description/>
-  </property>
-  <property>
-    <name>hive.exec.submit.local.task.via.child</name>
-    <value>true</value>
-    <description>
-      Determines whether local tasks (typically mapjoin hashtable generation phase) runs in
-      separate JVM (true recommended) or not.
-      Avoids the overhead of spawning new JVM, but can lead to out-of-memory issues.
-    </description>
-  </property>
-  <property>
-    <name>hive.exec.script.maxerrsize</name>
-    <value>100000</value>
-    <description>
-      Maximum number of bytes a script is allowed to emit to standard error (per map-reduce task).
-      This prevents runaway scripts from filling logs partitions to capacity
-    </description>
-  </property>
-  <property>
-    <name>hive.exec.script.allow.partial.consumption</name>
-    <value>false</value>
-    <description>
-      When enabled, this option allows a user script to exit successfully without consuming
-      all the data from the standard input.
-    </description>
-  </property>
-  <property>
-    <name>stream.stderr.reporter.prefix</name>
-    <value>reporter:</value>
-    <description>Streaming jobs that log to standard error with this prefix can log counter or status information.</description>
-  </property>
-  <property>
-    <name>stream.stderr.reporter.enabled</name>
-    <value>true</value>
-    <description>Enable consumption of status and counter messages for streaming jobs.</description>
-  </property>
-  <property>
-    <name>hive.exec.compress.output</name>
-    <value>false</value>
-    <description>
-      This controls whether the final outputs of a query (to a local/HDFS file or a Hive table) is compressed.
-      The compression codec and other options are determined from Hadoop config variables mapred.output.compress*
-    </description>
-  </property>
-  <property>
-    <name>hive.exec.compress.intermediate</name>
-    <value>false</value>
-    <description>
-      This controls whether intermediate files produced by Hive between multiple map-reduce jobs are compressed.
-      The compression codec and other options are determined from Hadoop config variables mapred.output.compress*
-    </description>
-  </property>
-  <property>
-    <name>hive.intermediate.compression.codec</name>
-    <value/>
-    <description/>
-  </property>
-  <property>
-    <name>hive.intermediate.compression.type</name>
-    <value/>
-    <description/>
-  </property>
-  <property>
-    <name>hive.exec.reducers.bytes.per.reducer</name>
-    <value>1000000000</value>
-    <description>size per reducer.The default is 1G, i.e if the input size is 10G, it will use 10 reducers.</description>
-  </property>
-  <property>
-    <name>hive.exec.reducers.max</name>
-    <value>999</value>
-    <description>
-      max number of reducers will be used. If the one specified in the configuration parameter mapred.reduce.tasks is
-      negative, Hive will use this one as the max number of reducers when automatically determine number of reducers.
-    </description>
-  </property>
-  <property>
-    <name>hive.exec.pre.hooks</name>
-    <value/>
-    <description>
-      Comma-separated list of pre-execution hooks to be invoked for each statement.
- A pre-execution hook is specified as the name of a Java class which implements the - org.apache.hadoop.hive.ql.hooks.ExecuteWithHookContext interface. - - - - hive.exec.post.hooks - - - Comma-separated list of post-execution hooks to be invoked for each statement. - A post-execution hook is specified as the name of a Java class which implements the - org.apache.hadoop.hive.ql.hooks.ExecuteWithHookContext interface. - - - - hive.exec.failure.hooks - - - Comma-separated list of on-failure hooks to be invoked for each statement. - An on-failure hook is specified as the name of Java class which implements the - org.apache.hadoop.hive.ql.hooks.ExecuteWithHookContext interface. - - - - hive.client.stats.publishers - - - Comma-separated list of statistics publishers to be invoked on counters on each job. - A client stats publisher is specified as the name of a Java class which implements the - org.apache.hadoop.hive.ql.stats.ClientStatsPublisher interface. - - - - hive.exec.parallel - false - Whether to execute jobs in parallel - - - hive.exec.parallel.thread.number - 8 - How many jobs at most can be executed in parallel - - - hive.mapred.reduce.tasks.speculative.execution - true - Whether speculative execution for reducers should be turned on. - - - hive.exec.counters.pull.interval - 1000 - - The interval with which to poll the JobTracker for the counters the running job. - The smaller it is the more load there will be on the jobtracker, the higher it is the less granular the caught will be. - - - - hive.exec.dynamic.partition - true - Whether or not to allow dynamic partitions in DML/DDL. - - - hive.exec.dynamic.partition.mode - strict - - In strict mode, the user must specify at least one static partition - in case the user accidentally overwrites all partitions. - - - - hive.exec.max.dynamic.partitions - 1000 - Maximum number of dynamic partitions allowed to be created in total. - - - hive.exec.max.dynamic.partitions.pernode - 100 - Maximum number of dynamic partitions allowed to be created in each mapper/reducer node. - - - hive.exec.max.created.files - 100000 - Maximum number of HDFS files created by all mappers/reducers in a MapReduce job. - - - hive.downloaded.resources.dir - ${system:java.io.tmpdir}/${hive.session.id}_resources - Temporary local directory for added resources in the remote file system. - - - hive.exec.default.partition.name - __HIVE_DEFAULT_PARTITION__ - - The default partition name in case the dynamic partition column value is null/empty string or any other values that cannot be escaped. - This value must not contain any special character used in HDFS URI (e.g., ':', '%', '/' etc). - The user has to be aware that the dynamic partition value should not contain this value to avoid confusions. - - - - hive.lockmgr.zookeeper.default.partition.name - __HIVE_DEFAULT_ZOOKEEPER_PARTITION__ - - - - hive.exec.show.job.failure.debug.info - true - - If a job fails, whether to provide a link in the CLI to the task with the - most failures, along with debugging hints if applicable. - - - - hive.exec.job.debug.capture.stacktraces - true - - Whether or not stack traces parsed from the task logs of a sampled failed task - for each failed job should be stored in the SessionState - - - - hive.exec.job.debug.timeout - 30000 - - - - hive.exec.tasklog.debug.timeout - 20000 - - - - hive.output.file.extension - - - String used as a file extension for output files. - If not set, defaults to the codec extension for text files (e.g. ".gz"), or no extension otherwise. 
- - - - hive.exec.mode.local.auto - false - Let Hive determine whether to run in local mode automatically - - - hive.exec.mode.local.auto.inputbytes.max - 134217728 - When hive.exec.mode.local.auto is true, input bytes should less than this for local mode. - - - hive.exec.mode.local.auto.input.files.max - 4 - When hive.exec.mode.local.auto is true, the number of tasks should less than this for local mode. - - - hive.exec.drop.ignorenonexistent - true - Do not report an error if DROP TABLE/VIEW specifies a non-existent table/view - - - hive.ignore.mapjoin.hint - true - Ignore the mapjoin hint - - - hive.file.max.footer - 100 - maximum number of lines for footer user can define for a table file - - - hive.resultset.use.unique.column.names - true - - Make column names unique in the result set by qualifying column names with table alias if needed. - Table alias will be added to column names for queries of type "select *" or - if query explicitly uses table alias "select r1.x..". - - - - fs.har.impl - org.apache.hadoop.hive.shims.HiveHarFileSystem - The implementation for accessing Hadoop Archives. Note that this won't be applicable to Hadoop versions less than 0.20 - - - hive.metastore.metadb.dir - - - - - hive.metastore.warehouse.dir - /user/hive/warehouse - location of default database for the warehouse - - - hive.metastore.uris - - Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore. - - - hive.metastore.connect.retries - 3 - Number of retries while opening a connection to metastore - - - hive.metastore.failure.retries - 1 - Number of retries upon failure of Thrift metastore calls - - - hive.metastore.client.connect.retry.delay - 1 - Number of seconds for the client to wait between consecutive connection attempts - - - hive.metastore.client.socket.timeout - 600 - MetaStore Client socket timeout in seconds - - - javax.jdo.option.ConnectionPassword - mine - password to use against metastore database - - - hive.metastore.ds.connection.url.hook - - Name of the hook to use for retrieving the JDO connection URL. If empty, the value in javax.jdo.option.ConnectionURL is used - - - javax.jdo.option.Multithreaded - true - Set this to true if multiple threads access metastore through JDO concurrently. - - - javax.jdo.option.ConnectionURL - jdbc:derby:;databaseName=metastore_db;create=true - JDBC connect string for a JDBC metastore - - - hive.metastore.force.reload.conf - false - - Whether to force reloading of the metastore configuration (including - the connection URL, before the next metastore query that accesses the - datastore. Once reloaded, this value is reset to false. Used for - testing only. - - - - hive.hmshandler.retry.attempts - 1 - The number of times to retry a HMSHandler call if there were a connection error - - - hive.hmshandler.retry.interval - 1000 - The number of milliseconds between HMSHandler retry attempts - - - hive.hmshandler.force.reload.conf - false - - Whether to force reloading of the HMSHandler configuration (including - the connection URL, before the next metastore query that accesses the - datastore. Once reloaded, this value is reset to false. Used for - testing only. - - - - hive.metastore.server.min.threads - 200 - Minimum number of worker threads in the Thrift server's pool. - - - hive.metastore.server.max.threads - 100000 - Maximum number of worker threads in the Thrift server's pool. - - - hive.metastore.server.tcp.keepalive - true - Whether to enable TCP keepalive for the metastore server. 
Keepalive will prevent accumulation of half-open connections. - - - hive.metastore.archive.intermediate.original - _INTERMEDIATE_ORIGINAL - - Intermediate dir suffixes used for archiving. Not important what they - are, as long as collisions are avoided - - - - hive.metastore.archive.intermediate.archived - _INTERMEDIATE_ARCHIVED - - - - hive.metastore.archive.intermediate.extracted - _INTERMEDIATE_EXTRACTED - - - - hive.metastore.kerberos.keytab.file - - The path to the Kerberos Keytab file containing the metastore Thrift server's service principal. - - - hive.metastore.kerberos.principal - hive-metastore/_HOST@EXAMPLE.COM - - The service principal for the metastore Thrift server. - The special string _HOST will be replaced automatically with the correct host name. - - - - hive.metastore.sasl.enabled - false - If true, the metastore Thrift interface will be secured with SASL. Clients must authenticate with Kerberos. - - - hive.metastore.thrift.framed.transport.enabled - false - If true, the metastore Thrift interface will use TFramedTransport. When false (default) a standard TTransport is used. - - - hive.cluster.delegation.token.store.class - org.apache.hadoop.hive.thrift.MemoryTokenStore - The delegation token store implementation. Set to org.apache.hadoop.hive.thrift.ZooKeeperTokenStore for load-balanced cluster. - - - hive.cluster.delegation.token.store.zookeeper.connectString - - The ZooKeeper token store connect string. - - - hive.cluster.delegation.token.store.zookeeper.znode - /hive/cluster/delegation - The root path for token store data. - - - hive.cluster.delegation.token.store.zookeeper.acl - - ACL for token store entries. List comma separated all server principals for the cluster. - - - hive.metastore.cache.pinobjtypes - Table,StorageDescriptor,SerDeInfo,Partition,Database,Type,FieldSchema,Order - List of comma separated metastore object types that should be pinned in the cache - - - datanucleus.connectionPoolingType - BONECP - Specify connection pool library for datanucleus - - - datanucleus.validateTables - false - validates existing schema against code. turn this on if you want to verify existing schema - - - datanucleus.validateColumns - false - validates existing schema against code. turn this on if you want to verify existing schema - - - datanucleus.validateConstraints - false - validates existing schema against code. turn this on if you want to verify existing schema - - - datanucleus.storeManagerType - rdbms - metadata store type - - - datanucleus.autoCreateSchema - true - creates necessary schema on a startup if one doesn't exist. set this to false, after creating it once - - - datanucleus.fixedDatastore - false - - - - hive.metastore.schema.verification - false - - Enforce metastore schema version consistency. - True: Verify that version information stored in metastore matches with one from Hive jars. Also disable automatic - schema migration attempt. Users are required to manually migrate schema after Hive upgrade which ensures - proper metastore schema migration. (Default) - False: Warn if the version information stored in metastore doesn't match with one from in Hive jars. - - - - datanucleus.autoStartMechanismMode - checked - throw exception if metadata tables are incorrect - - - datanucleus.transactionIsolation - read-committed - Default transaction isolation level for identity generation. - - - datanucleus.cache.level2 - false - Use a level 2 cache. 
Turn this off if metadata is changed independently of Hive metastore server - - - datanucleus.cache.level2.type - none - - - - datanucleus.identifierFactory - datanucleus1 - - Name of the identifier factory to use when generating table/column names etc. - 'datanucleus1' is used for backward compatibility with DataNucleus v1 - - - - datanucleus.rdbms.useLegacyNativeValueStrategy - true - - - - datanucleus.plugin.pluginRegistryBundleCheck - LOG - Defines what happens when plugin bundles are found and are duplicated [EXCEPTION|LOG|NONE] - - - hive.metastore.batch.retrieve.max - 300 - - Maximum number of objects (tables/partitions) can be retrieved from metastore in one batch. - The higher the number, the less the number of round trips is needed to the Hive metastore server, - but it may also cause higher memory requirement at the client side. - - - - hive.metastore.batch.retrieve.table.partition.max - 1000 - Maximum number of table partitions that metastore internally retrieves in one batch. - - - hive.metastore.init.hooks - - - A comma separated list of hooks to be invoked at the beginning of HMSHandler initialization. - An init hook is specified as the name of Java class which extends org.apache.hadoop.hive.metastore.MetaStoreInitListener. - - - - hive.metastore.pre.event.listeners - - List of comma separated listeners for metastore events. - - - hive.metastore.event.listeners - - - - - hive.metastore.authorization.storage.checks - false - - Should the metastore do authorization checks against the underlying storage (usually hdfs) - for operations like drop-partition (disallow the drop-partition if the user in - question doesn't have permissions to delete the corresponding directory - on the storage). - - - - hive.metastore.event.clean.freq - 0 - Frequency at which timer task runs to purge expired events in metastore(in seconds). - - - hive.metastore.event.expiry.duration - 0 - Duration after which events expire from events table (in seconds) - - - hive.metastore.execute.setugi - true - - In unsecure mode, setting this property to true will cause the metastore to execute DFS operations using - the client's reported user and group permissions. Note that this property must be set on - both the client and server sides. Further note that its best effort. - If client sets its to true and server sets it to false, client setting will be ignored. - - - - hive.metastore.partition.name.whitelist.pattern - - Partition names will be checked against this regex pattern and rejected if not matched. - - - hive.metastore.integral.jdo.pushdown - false - - Allow JDO query pushdown for integral partition columns in metastore. Off by default. This - improves metastore perf for integral columns, especially if there's a large number of partitions. - However, it doesn't work correctly with integral values that are not normalized (e.g. have - leading zeroes, like 0012). If metastore direct SQL is enabled and works, this optimization - is also irrelevant. - - - - hive.metastore.try.direct.sql - true - - - - hive.metastore.try.direct.sql.ddl - true - - - - hive.metastore.disallow.incompatible.col.type.changes - false - - If true (default is false), ALTER TABLE operations which change the type of - a column (say STRING) to an incompatible type (say MAP<STRING, STRING>) are disallowed. - RCFile default SerDe (ColumnarSerDe) serializes the values in such a way that the - datatypes can be converted from string to any type. The map is also serialized as - a string, which can be read as a string as well. 
However, with any binary - serialization, this is not true. Blocking the ALTER TABLE prevents ClassCastExceptions - when subsequently trying to access old partitions. - - Primitive types like INT, STRING, BIGINT, etc are compatible with each other and are - not blocked. - - See HIVE-4409 for more details. - - - - hive.table.parameters.default - - Default property values for newly created tables - - - hive.ddl.createtablelike.properties.whitelist - - Table Properties to copy over when executing a Create Table Like. - - - hive.metastore.rawstore.impl - org.apache.hadoop.hive.metastore.ObjectStore - - Name of the class that implements org.apache.hadoop.hive.metastore.rawstore interface. - This class is used to store and retrieval of raw metadata objects such as table, database - - - - javax.jdo.option.ConnectionDriverName - org.apache.derby.jdbc.EmbeddedDriver - Driver class name for a JDBC metastore - - - javax.jdo.PersistenceManagerFactoryClass - org.datanucleus.api.jdo.JDOPersistenceManagerFactory - class implementing the jdo persistence - - - hive.metastore.expression.proxy - org.apache.hadoop.hive.ql.optimizer.ppr.PartitionExpressionForMetastore - - - - javax.jdo.option.DetachAllOnCommit - true - Detaches all objects from session so that they can be used after transaction is committed - - - javax.jdo.option.NonTransactionalRead - true - Reads outside of transactions - - - javax.jdo.option.ConnectionUserName - APP - Username to use against metastore database - - - hive.metastore.end.function.listeners - - List of comma separated listeners for the end of metastore functions. - - - hive.metastore.partition.inherit.table.properties - - - List of comma separated keys occurring in table properties which will get inherited to newly created partitions. - * implies all the keys will get inherited. - - - - hive.metadata.export.location - - - When used in conjunction with the org.apache.hadoop.hive.ql.parse.MetaDataExportListener pre event listener, - it is the location to which the metadata will be exported. The default is an empty string, which results in the - metadata being exported to the current user's home directory on HDFS. - - - - hive.metadata.move.exported.metadata.to.trash - true - - When used in conjunction with the org.apache.hadoop.hive.ql.parse.MetaDataExportListener pre event listener, - this setting determines if the metadata that is exported will subsequently be moved to the user's trash directory - alongside the dropped table data. This ensures that the metadata will be cleaned up along with the dropped table data. - - - - hive.cli.errors.ignore - false - - - - hive.cli.print.current.db - false - Whether to include the current database in the Hive prompt. - - - hive.cli.prompt - hive - - Command line prompt configuration value. Other hiveconf can be used in this configuration value. - Variable substitution will only be invoked at the Hive CLI startup. - - - - hive.cli.pretty.output.num.cols - -1 - - The number of columns to use when formatting output generated by the DESCRIBE PRETTY table_name command. - If the value of this property is -1, then Hive will use the auto-detected terminal width. - - - - hive.metastore.fs.handler.class - org.apache.hadoop.hive.metastore.HiveMetaStoreFsImpl - - - - hive.session.id - - - - - hive.session.silent - false - - - - hive.session.history.enabled - false - Whether to log Hive query, query plan, runtime statistics etc. 
- - - hive.query.string - - Query being executed (might be multiple per a session) - - - hive.query.id - - ID for query being executed (might be multiple per a session) - - - hive.jobname.length - 50 - max jobname length - - - hive.jar.path - - - - - hive.aux.jars.path - - - - - hive.added.files.path - - - - - hive.added.jars.path - - - - - hive.added.archives.path - - - - - hive.auto.progress.timeout - 0 - - How long to run autoprogressor for the script/UDTF operators (in seconds). - Set to 0 for forever. - - - - hive.table.name - - - - - hive.partition.name - - - - - hive.script.auto.progress - false - - Whether Hive Transform/Map/Reduce Clause should automatically send progress information to TaskTracker - to avoid the task getting killed because of inactivity. Hive sends progress information when the script is - outputting to stderr. This option removes the need of periodically producing stderr messages, - but users should be cautious because this may prevent infinite loops in the scripts to be killed by TaskTracker. - - - - hive.script.operator.id.env.var - HIVE_SCRIPT_OPERATOR_ID - - Name of the environment variable that holds the unique script operator ID in the user's - transform function (the custom mapper/reducer that the user has specified in the query) - - - - hive.script.operator.truncate.env - false - Truncate each environment variable for external script in scripts operator to 20KB (to fit system limits) - - - hive.mapred.mode - nonstrict - - The mode in which the Hive operations are being performed. - In strict mode, some risky queries are not allowed to run. They include: - Cartesian Product. - No partition being picked up for a query. - Comparing bigints and strings. - Comparing bigints and doubles. - Orderby without limit. - - - - hive.alias - - - - - hive.map.aggr - true - Whether to use map-side aggregation in Hive Group By queries - - - hive.groupby.skewindata - false - Whether there is skew in data to optimize group by queries - - - hive.optimize.multigroupby.common.distincts - true - - Whether to optimize a multi-groupby query with the same distinct. - Consider a query like: - - from src - insert overwrite table dest1 select col1, count(distinct colx) group by col1 - insert overwrite table dest2 select col2, count(distinct colx) group by col2; - - With this parameter set to true, first we spray by the distinct value (colx), and then - perform the 2 groups bys. This makes sense if map-side aggregation is turned off. However, - with maps-side aggregation, it might be useful in some cases to treat the 2 inserts independently, - thereby performing the query above in 2MR jobs instead of 3 (due to spraying by distinct key first). - If this parameter is turned off, we don't consider the fact that the distinct key is the same across - different MR jobs. - - - - hive.join.emit.interval - 1000 - How many rows in the right-most join operand Hive should buffer before emitting the join result. - - - hive.join.cache.size - 25000 - How many rows in the joining tables (except the streaming table) should be cached in memory. - - - hive.mapjoin.bucket.cache.size - 100 - - - - hive.mapjoin.optimized.hashtable - true - - Whether Hive should use memory-optimized hash table for MapJoin. Only works on Tez, - because memory-optimized hashtable cannot be serialized. - - - - hive.mapjoin.optimized.keys - true - - Whether MapJoin hashtable should use optimized (size-wise), keys, allowing the table to take less - memory. Depending on key, the memory savings for entire table can be 5-15% or so. 
- - - - hive.mapjoin.lazy.hashtable - true - - Whether MapJoin hashtable should deserialize values on demand. Depending on how many values in - the table the join will actually touch, it can save a lot of memory by not creating objects for - rows that are not needed. If all rows are needed obviously there's no gain. - - - - hive.mapjoin.optimized.hashtable.wbsize - 10485760 - - Optimized hashtable (see hive.mapjoin.optimized.hashtable) uses a chain of buffers to - store data. This is one buffer size. HT may be slightly faster if this is larger, but for small - joins unnecessary memory will be allocated and then trimmed. - - - - hive.smbjoin.cache.rows - 10000 - How many rows with the same key value should be cached in memory per smb joined table. - - - hive.groupby.mapaggr.checkinterval - 100000 - Number of rows after which size of the grouping keys/aggregation classes is performed - - - hive.map.aggr.hash.percentmemory - 0.5 - Portion of total memory to be used by map-side group aggregation hash table - - - hive.mapjoin.followby.map.aggr.hash.percentmemory - 0.3 - Portion of total memory to be used by map-side group aggregation hash table, when this group by is followed by map join - - - hive.map.aggr.hash.force.flush.memory.threshold - 0.9 - - The max memory to be used by map-side group aggregation hash table. - If the memory usage is higher than this number, force to flush data - - - - hive.map.aggr.hash.min.reduction - 0.5 - - Hash aggregation will be turned off if the ratio between hash table size and input rows is bigger than this number. - Set to 1 to make sure hash aggregation is never turned off. - - - - hive.multigroupby.singlereducer - true - - Whether to optimize multi group by query to generate single M/R job plan. If the multi group by query has - common group by keys, it will be optimized to generate single M/R job. - - - - hive.map.groupby.sorted - false - - If the bucketing/sorting properties of the table exactly match the grouping key, whether to perform - the group by in the mapper by using BucketizedHiveInputFormat. The only downside to this - is that it limits the number of mappers to the number of files. - - - - hive.map.groupby.sorted.testmode - false - - If the bucketing/sorting properties of the table exactly match the grouping key, whether to perform - the group by in the mapper by using BucketizedHiveInputFormat. If the test mode is set, the plan - is not converted, but a query property is set to denote the same. - - - - hive.groupby.orderby.position.alias - false - Whether to enable using Column Position Alias in Group By or Order By - - - hive.new.job.grouping.set.cardinality - 30 - - Whether a new map-reduce job should be launched for grouping sets/rollups/cubes. - For a query like: select a, b, c, count(1) from T group by a, b, c with rollup; - 4 rows are created per row: (a, b, c), (a, b, null), (a, null, null), (null, null, null). - This can lead to explosion across map-reduce boundary if the cardinality of T is very high, - and map-side aggregation does not do a very good job. - - This parameter decides if Hive should add an additional map-reduce job. If the grouping set - cardinality (4 in the example above), is more than this value, a new MR job is added under the - assumption that the original group by will reduce the data size. - - - - hive.udtf.auto.progress - false - - Whether Hive should automatically send progress information to TaskTracker - when using UDTF's to prevent the task getting killed because of inactivity. 
Users should be cautious - because this may prevent TaskTracker from killing tasks with infinite loops. - - - - hive.default.fileformat - TextFile - - Default file format for CREATE TABLE statement. - Options are TextFile, SequenceFile, RCfile and ORC. Users can explicitly override it by CREATE TABLE ... STORED AS [FORMAT] - - - - hive.query.result.fileformat - TextFile - Default file format for storing result of the query. Allows TextFile, SequenceFile and RCfile - - - hive.fileformat.check - true - Whether to check file format or not when loading data files - - - hive.default.rcfile.serde - org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe - The default SerDe Hive will use for the RCFile format - - - hive.default.serde - org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - The default SerDe Hive will use for storage formats that do not specify a SerDe. - - - hive.serdes.using.metastore.for.schema - org.apache.hadoop.hive.ql.io.orc.OrcSerde,org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe,org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe,org.apache.hadoop.hive.serde2.dynamic_type.DynamicSerDe,org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe,org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe,org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe,org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - SerDes retriving schema from metastore. This an internal parameter. Check with the hive dev. team - - - hive.querylog.location - ${system:java.io.tmpdir}/${system:user.name} - Location of Hive run time structured log file - - - hive.querylog.enable.plan.progress - true - - Whether to log the plan's progress every time a job's progress is checked. - These logs are written to the location specified by hive.querylog.location - - - - hive.querylog.plan.progress.interval - 60000 - - The interval to wait between logging the plan's progress in milliseconds. - If there is a whole number percentage change in the progress of the mappers or the reducers, - the progress is logged regardless of this value. - The actual interval will be the ceiling of (this value divided by the value of - hive.exec.counters.pull.interval) multiplied by the value of hive.exec.counters.pull.interval - I.e. if it is not divide evenly by the value of hive.exec.counters.pull.interval it will be - logged less frequently than specified. - This only has an effect if hive.querylog.enable.plan.progress is set to true. - - - - hive.script.serde - org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - The default SerDe for transmitting input data to and reading output data from the user scripts. - - - hive.script.recordreader - org.apache.hadoop.hive.ql.exec.TextRecordReader - The default record reader for reading data from the user scripts. - - - hive.script.recordwriter - org.apache.hadoop.hive.ql.exec.TextRecordWriter - The default record writer for writing data to the user scripts. - - - hive.transform.escape.input - false - - This adds an option to escape special chars (newlines, carriage returns and - tabs) when they are passed to the user script. This is useful if the Hive tables - can contain data that contains special characters. - - - - hive.binary.record.max.length - 1000 - - Read from a binary stream and treat each hive.binary.record.max.length bytes as a record. 
- The last record before the end of stream can have less than hive.binary.record.max.length bytes - - - - hive.hwi.listen.host - 0.0.0.0 - This is the host address the Hive Web Interface will listen on - - - hive.hwi.listen.port - 9999 - This is the port the Hive Web Interface will listen on - - - hive.hwi.war.file - ${system:HWI_WAR_FILE} - This sets the path to the HWI war file, relative to ${HIVE_HOME}. - - - hive.mapred.local.mem - 0 - mapper/reducer memory in local mode - - - hive.mapjoin.smalltable.filesize - 25000000 - - The threshold for the input file size of the small tables; if the file size is smaller - than this threshold, it will try to convert the common join into map join - - - - hive.sample.seednumber - 0 - A number used to percentage sampling. By changing this number, user will change the subsets of data sampled. - - - hive.test.mode - false - Whether Hive is running in test mode. If yes, it turns on sampling and prefixes the output tablename. - - - hive.test.mode.prefix - test_ - In test mode, specfies prefixes for the output table - - - hive.test.mode.samplefreq - 32 - - In test mode, specfies sampling frequency for table, which is not bucketed, - For example, the following query: - INSERT OVERWRITE TABLE dest SELECT col1 from src - would be converted to - INSERT OVERWRITE TABLE test_dest - SELECT col1 from src TABLESAMPLE (BUCKET 1 out of 32 on rand(1)) - - - - hive.test.mode.nosamplelist - - In test mode, specifies comma separated table names which would not apply sampling - - - hive.test.dummystats.aggregator - - internal variable for test - - - hive.test.dummystats.publisher - - internal variable for test - - - hive.merge.mapfiles - true - Merge small files at the end of a map-only job - - - hive.merge.mapredfiles - false - Merge small files at the end of a map-reduce job - - - hive.merge.tezfiles - false - Merge small files at the end of a Tez DAG - - - hive.merge.size.per.task - 256000000 - Size of merged files at the end of the job - - - hive.merge.smallfiles.avgsize - 16000000 - - When the average output file size of a job is less than this number, Hive will start an additional - map-reduce job to merge the output files into bigger files. This is only done for map-only jobs - if hive.merge.mapfiles is true, and for map-reduce jobs if hive.merge.mapredfiles is true. - - - - hive.merge.rcfile.block.level - true - - - - hive.merge.input.format.block.level - org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileBlockMergeInputFormat - - - - hive.merge.current.job.has.dynamic.partitions - false - - - - hive.exec.rcfile.use.explicit.header - true - - If this is set the header for RCFiles will simply be RCF. If this is not - set the header will be that borrowed from sequence files, e.g. SEQ- followed - by the input and output RCFile formats. - - - - hive.exec.rcfile.use.sync.cache - true - - - - hive.io.rcfile.record.interval - 2147483647 - - - - hive.io.rcfile.column.number.conf - 0 - - - - hive.io.rcfile.tolerate.corruptions - false - - - - hive.io.rcfile.record.buffer.size - 4194304 - - - - hive.exec.orc.memory.pool - 0.5 - Maximum fraction of heap that can be used by ORC file writers - - - hive.exec.orc.write.format - - Define the version of the file to write - - - hive.exec.orc.default.stripe.size - 67108864 - Define the default ORC stripe size - - - hive.exec.orc.default.block.size - 268435456 - Define the default file system block size for ORC files. 
- - - hive.exec.orc.dictionary.key.size.threshold - 0.8 - - If the number of keys in a dictionary is greater than this fraction of the total number of - non-null rows, turn off dictionary encoding. Use 1 to always use dictionary encoding. - - - - hive.exec.orc.default.row.index.stride - 10000 - Define the default ORC index stride - - - hive.exec.orc.default.buffer.size - 262144 - Define the default ORC buffer size - - - hive.exec.orc.default.block.padding - true - Define the default block padding - - - hive.exec.orc.block.padding.tolerance - 0.05 - - Define the tolerance for block padding as a percentage of stripe size. - For the defaults of 64Mb ORC stripe and 256Mb HDFS blocks, a maximum of 3.2Mb will be reserved for padding within the 256Mb block. - In that case, if the available size within the block is more than 3.2Mb, a new smaller stripe will be inserted to fit within that space. - This will make sure that no stripe written will cross block boundaries and cause remote reads within a node local task. - - - - hive.exec.orc.default.compress - ZLIB - Define the default compression codec for ORC file - - - hive.exec.orc.encoding.strategy - SPEED - - Define the encoding strategy to use while writing data. Changing this will - only affect the light weight encoding for integers. This flag will not - change the compression level of higher level compression codec (like ZLIB). - Possible options are SPEED and COMPRESSION. - - - - hive.orc.splits.include.file.footer - false - - If turned on splits generated by orc will include metadata about the stripes in the file. This - data is read remotely (from the client or HS2 machine) and sent to all the tasks. - - - - hive.orc.cache.stripe.details.size - 10000 - Cache size for keeping meta info about orc splits cached in the client. - - - hive.orc.compute.splits.num.threads - 10 - How many threads orc should use to create splits in parallel. - - - hive.exec.orc.skip.corrupt.data - false - - If ORC reader encounters corrupt data, this value will be used to determine - whether to skip the corrupt data or throw exception. The default behavior is to throw exception. - - - - hive.exec.orc.zerocopy - false - Use zerocopy reads with ORC. - - - hive.lazysimple.extended_boolean_literal - false - - LazySimpleSerde uses this property to determine if it treats 'T', 't', 'F', 'f', - '1', and '0' as extened, legal boolean literal, in addition to 'TRUE' and 'FALSE'. - The default is false, which means only 'TRUE' and 'FALSE' are treated as legal - boolean literal. - - - - hive.optimize.skewjoin - false - - Whether to enable skew join optimization. - The algorithm is as follows: At runtime, detect the keys with a large skew. Instead of - processing those keys, store them temporarily in an HDFS directory. In a follow-up map-reduce - job, process those skewed keys. The same key need not be skewed for all the tables, and so, - the follow-up map-reduce job (for the skewed keys) would be much faster, since it would be a - map-join. - - - - hive.auto.convert.join - true - Whether Hive enables the optimization about converting common join into mapjoin based on the input file size - - - hive.auto.convert.join.noconditionaltask - true - - Whether Hive enables the optimization about converting common join into mapjoin based on the input file size. - If this parameter is on, and the sum of size for n-1 of the tables/partitions for a n-way join is smaller than the - specified size, the join is directly converted to a mapjoin (there is no conditional task). 
- - - - hive.auto.convert.join.noconditionaltask.size - 10000000 - - If hive.auto.convert.join.noconditionaltask is off, this parameter does not take affect. - However, if it is on, and the sum of size for n-1 of the tables/partitions for a n-way join is smaller than this size, - the join is directly converted to a mapjoin(there is no conditional task). The default is 10MB - - - - hive.auto.convert.join.use.nonstaged - false - - For conditional joins, if input stream from a small alias can be directly applied to join operator without - filtering or projection, the alias need not to be pre-staged in distributed cache via mapred local task. - Currently, this is not working with vectorization or tez execution engine. - - - - hive.skewjoin.key - 100000 - - Determine if we get a skew key in join. If we see more than the specified number of rows with the same key in join operator, - we think the key as a skew join key. - - - - hive.skewjoin.mapjoin.map.tasks - 10000 - - Determine the number of map task used in the follow up map join job for a skew join. - It should be used together with hive.skewjoin.mapjoin.min.split to perform a fine grained control. - - - - hive.skewjoin.mapjoin.min.split - 33554432 - - Determine the number of map task at most used in the follow up map join job for a skew join by specifying - the minimum split size. It should be used together with hive.skewjoin.mapjoin.map.tasks to perform a fine grained control. - - - - hive.heartbeat.interval - 1000 - Send a heartbeat after this interval - used by mapjoin and filter operators - - - hive.limit.row.max.size - 100000 - When trying a smaller subset of data for simple LIMIT, how much size we need to guarantee each row to have at least. - - - hive.limit.optimize.limit.file - 10 - When trying a smaller subset of data for simple LIMIT, maximum number of files we can sample. - - - hive.limit.optimize.enable - false - Whether to enable to optimization to trying a smaller subset of data for simple LIMIT first. - - - hive.limit.optimize.fetch.max - 50000 - - Maximum number of rows allowed for a smaller subset of data for simple LIMIT, if it is a fetch query. - Insert queries are not restricted by this limit. - - - - hive.limit.pushdown.memory.usage - -1.0 - The max memory to be used for hash in RS operator for top K selection. - - - hive.limit.query.max.table.partition - -1 - - This controls how many partitions can be scanned for each partitioned table. - The default value "-1" means no limit. - - - - hive.hashtable.initialCapacity - 100000 - - - - hive.hashtable.loadfactor - 0.75 - - - - hive.mapjoin.followby.gby.localtask.max.memory.usage - 0.55 - - This number means how much memory the local task can take to hold the key/value into an in-memory hash table - when this map join is followed by a group by. If the local task's memory usage is more than this number, - the local task will abort by itself. It means the data of the small table is too large to be held in memory. - - - - hive.mapjoin.localtask.max.memory.usage - 0.9 - - This number means how much memory the local task can take to hold the key/value into an in-memory hash table. - If the local task's memory usage is more than this number, the local task will abort by itself. - It means the data of the small table is too large to be held in memory. 
- - - - hive.mapjoin.check.memory.rows - 100000 - The number means after how many rows processed it needs to check the memory usage - - - hive.debug.localtask - false - - - - hive.input.format - org.apache.hadoop.hive.ql.io.CombineHiveInputFormat - The default input format. Set this to HiveInputFormat if you encounter problems with CombineHiveInputFormat. - - - hive.tez.input.format - org.apache.hadoop.hive.ql.io.HiveInputFormat - The default input format for tez. Tez groups splits in the AM. - - - hive.tez.container.size - -1 - By default Tez will spawn containers of the size of a mapper. This can be used to overwrite. - - - hive.tez.java.opts - - By default Tez will use the Java options from map tasks. This can be used to overwrite. - - - hive.tez.log.level - INFO - - The log level to use for tasks executing as part of the DAG. - Used only if hive.tez.java.opts is used to configure Java options. - - - - hive.enforce.bucketing - false - Whether bucketing is enforced. If true, while inserting into the table, bucketing is enforced. - - - hive.enforce.sorting - false - Whether sorting is enforced. If true, while inserting into the table, sorting is enforced. - - - hive.optimize.bucketingsorting - true - - If hive.enforce.bucketing or hive.enforce.sorting is true, don't create a reducer for enforcing - bucketing/sorting for queries of the form: - insert overwrite table T2 select * from T1; - where T1 and T2 are bucketed/sorted by the same keys into the same number of buckets. - - - - hive.mapred.partitioner - org.apache.hadoop.hive.ql.io.DefaultHivePartitioner - - - - hive.enforce.sortmergebucketmapjoin - false - If the user asked for sort-merge bucketed map-side join, and it cannot be performed, should the query fail or not ? - - - hive.enforce.bucketmapjoin - false - - If the user asked for bucketed map-side join, and it cannot be performed, - should the query fail or not ? For example, if the buckets in the tables being joined are - not a multiple of each other, bucketed map-side join cannot be performed, and the - query will fail if hive.enforce.bucketmapjoin is set to true. - - - - hive.auto.convert.sortmerge.join - false - Will the join be automatically converted to a sort-merge join, if the joined tables pass the criteria for sort-merge join. - - - hive.auto.convert.sortmerge.join.bigtable.selection.policy - org.apache.hadoop.hive.ql.optimizer.AvgPartitionSizeBasedBigTableSelectorForAutoSMJ - - The policy to choose the big table for automatic conversion to sort-merge join. - By default, the table with the largest partitions is assigned the big table. All policies are: - . based on position of the table - the leftmost table is selected - org.apache.hadoop.hive.ql.optimizer.LeftmostBigTableSMJ. - . based on total size (all the partitions selected in the query) of the table - org.apache.hadoop.hive.ql.optimizer.TableSizeBasedBigTableSelectorForAutoSMJ. - . based on average size (all the partitions selected in the query) of the table - org.apache.hadoop.hive.ql.optimizer.AvgPartitionSizeBasedBigTableSelectorForAutoSMJ. - New policies can be added in future. - - - - hive.auto.convert.sortmerge.join.to.mapjoin - false - - If hive.auto.convert.sortmerge.join is set to true, and a join was converted to a sort-merge join, - this parameter decides whether each table should be tried as a big table, and effectively a map-join should be - tried. 
That would create a conditional task with n+1 children for a n-way join (1 child for each table as the - big table), and the backup task will be the sort-merge join. In some cases, a map-join would be faster than a - sort-merge join, if there is no advantage of having the output bucketed and sorted. For example, if a very big sorted - and bucketed table with few files (say 10 files) are being joined with a very small sorter and bucketed table - with few files (10 files), the sort-merge join will only use 10 mappers, and a simple map-only join might be faster - if the complete small table can fit in memory, and a map-join can be performed. - - - - hive.exec.script.trust - false - - - - hive.exec.rowoffset - false - Whether to provide the row offset virtual column - - - hive.hadoop.supports.splittable.combineinputformat - false - - - - hive.optimize.index.filter - false - Whether to enable automatic use of indexes - - - hive.optimize.index.autoupdate - false - Whether to update stale indexes automatically - - - hive.optimize.ppd - true - Whether to enable predicate pushdown - - - hive.ppd.recognizetransivity - true - Whether to transitively replicate predicate filters over equijoin conditions. - - - hive.ppd.remove.duplicatefilters - true - Whether to push predicates down into storage handlers. Ignored when hive.optimize.ppd is false. - - - hive.optimize.metadataonly - true - - - - hive.optimize.null.scan - true - Dont scan relations which are guaranteed to not generate any rows - - - hive.optimize.ppd.storage - true - Whether to push predicates down to storage handlers - - - hive.optimize.groupby - true - Whether to enable the bucketed group by from bucketed partitions/tables. - - - hive.optimize.bucketmapjoin - false - Whether to try bucket mapjoin - - - hive.optimize.bucketmapjoin.sortedmerge - false - Whether to try sorted bucket merge map join - - - hive.optimize.reducededuplication - true - - Remove extra map-reduce jobs if the data is already clustered by the same key which needs to be used again. - This should always be set to true. Since it is a new feature, it has been made configurable. - - - - hive.optimize.reducededuplication.min.reducer - 4 - - Reduce deduplication merges two RSs by moving key/parts/reducer-num of the child RS to parent RS. - That means if reducer-num of the child RS is fixed (order by or forced bucketing) and small, it can make very slow, single MR. - The optimization will be automatically disabled if number of reducers would be less than specified value. - - - - hive.optimize.sort.dynamic.partition - true - - When enabled dynamic partitioning column will be globally sorted. - This way we can keep only one record writer open for each partition value - in the reducer thereby reducing the memory pressure on reducers. - - - - hive.optimize.sampling.orderby - false - - - - hive.optimize.sampling.orderby.number - 1000 - - - - hive.optimize.sampling.orderby.percent - 0.1 - - - - hive.optimize.union.remove - false - - Whether to remove the union and push the operators between union and the filesink above union. - This avoids an extra scan of the output by union. This is independently useful for union - queries, and specially useful when hive.optimize.skewjoin.compiletime is set to true, since an - extra union is inserted. - - The merge is triggered if either of hive.merge.mapfiles or hive.merge.mapredfiles is set to true. 
- If the user has set hive.merge.mapfiles to true and hive.merge.mapredfiles to false, the idea was the - number of reducers are few, so the number of files anyway are small. However, with this optimization, - we are increasing the number of files possibly by a big margin. So, we merge aggressively. - - - - hive.optimize.correlation - false - exploit intra-query correlations. - - - hive.mapred.supports.subdirectories - false - - Whether the version of Hadoop which is running supports sub-directories for tables/partitions. - Many Hive optimizations can be applied if the Hadoop version supports sub-directories for - tables/partitions. It was added by MAPREDUCE-1501 - - - - hive.optimize.skewjoin.compiletime - false - - Whether to create a separate plan for skewed keys for the tables in the join. - This is based on the skewed keys stored in the metadata. At compile time, the plan is broken - into different joins: one for the skewed keys, and the other for the remaining keys. And then, - a union is performed for the 2 joins generated above. So unless the same skewed key is present - in both the joined tables, the join for the skewed key will be performed as a map-side join. - - The main difference between this parameter and hive.optimize.skewjoin is that this parameter - uses the skew information stored in the metastore to optimize the plan at compile time itself. - If there is no skew information in the metadata, this parameter will not have any affect. - Both hive.optimize.skewjoin.compiletime and hive.optimize.skewjoin should be set to true. - Ideally, hive.optimize.skewjoin should be renamed as hive.optimize.skewjoin.runtime, but not doing - so for backward compatibility. - - If the skew information is correctly stored in the metadata, hive.optimize.skewjoin.compiletime - would change the query plan to take care of it, and hive.optimize.skewjoin will be a no-op. - - - - hive.optimize.index.filter.compact.minsize - 5368709120 - Minimum size (in bytes) of the inputs on which a compact index is automatically used. - - - hive.optimize.index.filter.compact.maxsize - -1 - Maximum size (in bytes) of the inputs on which a compact index is automatically used. A negative number is equivalent to infinity. - - - hive.index.compact.query.max.entries - 10000000 - The maximum number of index entries to read during a query that uses the compact index. Negative value is equivalent to infinity. - - - hive.index.compact.query.max.size - 10737418240 - The maximum number of bytes that a query using the compact index can read. Negative value is equivalent to infinity. - - - hive.index.compact.binary.search - true - Whether or not to use a binary search to find the entries in an index table that match the filter, where possible - - - hive.stats.autogather - true - A flag to gather statistics automatically during the INSERT OVERWRITE command. - - - hive.stats.dbclass - fs - The storage that stores temporary Hive statistics. Currently, jdbc, hbase, counter and custom type are supported. - - - hive.stats.jdbcdriver - org.apache.derby.jdbc.EmbeddedDriver - The JDBC driver for the database that stores temporary Hive statistics. - - - hive.stats.dbconnectionstring - jdbc:derby:;databaseName=TempStatsStore;create=true - The default connection string for the database that stores temporary Hive statistics. - - - hive.stats.default.publisher - - The Java class (implementing the StatsPublisher interface) that is used by default if hive.stats.dbclass is custom type. 
- - - hive.stats.default.aggregator - - The Java class (implementing the StatsAggregator interface) that is used by default if hive.stats.dbclass is custom type. - - - hive.stats.jdbc.timeout - 30 - Timeout value (number of seconds) used by JDBC connection and statements. - - - hive.stats.atomic - false - whether to update metastore stats only if all stats are available - - - hive.stats.retries.max - 0 - - Maximum number of retries when stats publisher/aggregator got an exception updating intermediate database. - Default is no tries on failures. - - - - hive.stats.retries.wait - 3000 - The base waiting window (in milliseconds) before the next retry. The actual wait time is calculated by baseWindow * failures baseWindow * (failure 1) * (random number between [0.0,1.0]). - - - hive.stats.collect.rawdatasize - true - should the raw data size be collected when analyzing tables - - - hive.client.stats.counters - - - Subset of counters that should be of interest for hive.client.stats.publishers (when one wants to limit their publishing). - Non-display names should be used - - - - hive.stats.reliable - false - - Whether queries will fail because stats cannot be collected completely accurately. - If this is set to true, reading/writing from/into a partition may fail because the stats - could not be computed accurately. - - - - hive.stats.gather.num.threads - 10 - - Number of threads used by partialscan/noscan analyze command for partitioned tables. - This is applicable only for file formats that implement StatsProvidingRecordReader (like ORC). - - - - hive.stats.collect.tablekeys - false - - Whether join and group by keys on tables are derived and maintained in the QueryPlan. - This is useful to identify how tables are accessed and to determine if they should be bucketed. - - - - hive.stats.collect.scancols - false - - Whether column accesses are tracked in the QueryPlan. - This is useful to identify how tables are accessed and to determine if there are wasted columns that can be trimmed. - - - - hive.stats.ndv.error - 20.0 - - Standard error expressed in percentage. Provides a tradeoff between accuracy and compute cost. - A lower value for error indicates higher accuracy and a higher compute cost. - - - - hive.stats.key.prefix.max.length - 150 - - Determines if when the prefix of the key used for intermediate stats collection - exceeds a certain length, a hash of the key is used instead. If the value < 0 then hashing - - - - hive.stats.key.prefix.reserve.length - 24 - - Reserved length for postfix of stats key. Currently only meaningful for counter type which should - keep length of full stats key smaller than max length configured by hive.stats.key.prefix.max.length. - For counter type, it should be bigger than the length of LB spec if exists. - - - - hive.stats.max.variable.length - 100 - - To estimate the size of data flowing through operators in Hive/Tez(for reducer estimation etc.), - average row size is multiplied with the total number of rows coming out of each operator. - Average row size is computed from average column size of all columns in the row. In the absence - of column statistics, for variable length columns (like string, bytes etc.), this value will be - used. For fixed length columns their corresponding Java equivalent sizes are used - (float - 4 bytes, double - 8 bytes etc.). 
- - - - hive.stats.list.num.entries - 10 - - To estimate the size of data flowing through operators in Hive/Tez(for reducer estimation etc.), - average row size is multiplied with the total number of rows coming out of each operator. - Average row size is computed from average column size of all columns in the row. In the absence - of column statistics and for variable length complex columns like list, the average number of - entries/values can be specified using this config. - - - - hive.stats.map.num.entries - 10 - - To estimate the size of data flowing through operators in Hive/Tez(for reducer estimation etc.), - average row size is multiplied with the total number of rows coming out of each operator. - Average row size is computed from average column size of all columns in the row. In the absence - of column statistics and for variable length complex columns like map, the average number of - entries/values can be specified using this config. - - - - hive.stats.map.parallelism - 1 - - Hive/Tez optimizer estimates the data size flowing through each of the operators. - For GROUPBY operator, to accurately compute the data size map-side parallelism needs to - be known. By default, this value is set to 1 since optimizer is not aware of the number of - mappers during compile-time. This Hive config can be used to specify the number of mappers - to be used for data size computation of GROUPBY operator. - - - - hive.stats.fetch.partition.stats - true - - Annotation of operator tree with statistics information requires partition level basic - statistics like number of rows, data size and file size. Partition statistics are fetched from - metastore. Fetching partition statistics for each needed partition can be expensive when the - number of partitions is high. This flag can be used to disable fetching of partition statistics - from metastore. When this flag is disabled, Hive will make calls to filesystem to get file sizes - and will estimate the number of rows from row schema. - - - - hive.stats.fetch.column.stats - false - - Annotation of operator tree with statistics information requires column statistics. - Column statistics are fetched from metastore. Fetching column statistics for each needed column - can be expensive when the number of columns is high. This flag can be used to disable fetching - of column statistics from metastore. - - - - hive.stats.join.factor - 1.1 - - Hive/Tez optimizer estimates the data size flowing through each of the operators. JOIN operator - uses column statistics to estimate the number of rows flowing out of it and hence the data size. - In the absence of column statistics, this factor determines the amount of rows that flows out - of JOIN operator. - - - - hive.stats.deserialization.factor - 1.0 - - Hive/Tez optimizer estimates the data size flowing through each of the operators. In the absence - of basic statistics like number of rows and data size, file size is used to estimate the number - of rows and data size. Since files in tables/partitions are serialized (and optionally - compressed) the estimates of number of rows and data size cannot be reliably determined. - This factor is multiplied with the file size to account for serialization and compression. - - - - hive.support.concurrency - false - - Whether Hive supports concurrency control or not. 
-  <property>
-    <name>hive.lock.manager</name>
-    <value>org.apache.hadoop.hive.ql.lockmgr.zookeeper.ZooKeeperHiveLockManager</value>
-    <description/>
-  </property>
-  <property>
-    <name>hive.lock.numretries</name>
-    <value>100</value>
-    <description>The number of times you want to try to get all the locks.</description>
-  </property>
-  <property>
-    <name>hive.unlock.numretries</name>
-    <value>10</value>
-    <description>The number of times you want to retry to do one unlock.</description>
-  </property>
-  <property>
-    <name>hive.lock.sleep.between.retries</name>
-    <value>60</value>
-    <description>The sleep time (in seconds) between various retries.</description>
-  </property>
-  <property>
-    <name>hive.lock.mapred.only.operation</name>
-    <value>false</value>
-    <description>This parameter controls whether locks are acquired only for queries that need to execute at least one mapred job.</description>
-  </property>
-  <property>
-    <name>hive.zookeeper.quorum</name>
-    <value/>
-    <description>The list of ZooKeeper servers to talk to. This is only needed for read/write locks.</description>
-  </property>
-  <property>
-    <name>hive.zookeeper.client.port</name>
-    <value>2181</value>
-    <description>The port of the ZooKeeper servers to talk to. This is only needed for read/write locks.</description>
-  </property>
-  <property>
-    <name>hive.zookeeper.session.timeout</name>
-    <value>600000</value>
-    <description>ZooKeeper client's session timeout. The client is disconnected, and as a result all locks are released, if a heartbeat is not sent within the timeout.</description>
-  </property>
-  <property>
-    <name>hive.zookeeper.namespace</name>
-    <value>hive_zookeeper_namespace</value>
-    <description>The parent node under which all ZooKeeper nodes are created.</description>
-  </property>
-  <property>
-    <name>hive.zookeeper.clean.extra.nodes</name>
-    <value>false</value>
-    <description>Clean extra nodes at the end of the session.</description>
-  </property>
-  <property>
-    <name>hive.txn.manager</name>
-    <value>org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager</value>
-    <description/>
-  </property>
-  <property>
-    <name>hive.txn.timeout</name>
-    <value>300</value>
-    <description>Time after which transactions are declared aborted if the client has not sent a heartbeat, in seconds.</description>
-  </property>
-  <property>
-    <name>hive.txn.max.open.batch</name>
-    <value>1000</value>
-    <description>Maximum number of transactions that can be fetched in one call to open_txns(). Increasing this will decrease the number of delta files created when streaming data into Hive. But it will also increase the number of open transactions at any given time, possibly impacting read performance.</description>
-  </property>
-  <property>
-    <name>hive.compactor.initiator.on</name>
-    <value>false</value>
-    <description>Whether to run the compactor's initiator thread in this metastore instance or not.</description>
-  </property>
-  <property>
-    <name>hive.compactor.worker.threads</name>
-    <value>0</value>
-    <description>Number of compactor worker threads to run on this metastore instance.</description>
-  </property>
-  <property>
-    <name>hive.compactor.worker.timeout</name>
-    <value>86400</value>
-    <description>Time in seconds before a given compaction in working state is declared a failure and returned to the initiated state.</description>
-  </property>
-  <property>
-    <name>hive.compactor.check.interval</name>
-    <value>300</value>
-    <description>Time in seconds between checks to see if any partitions need to be compacted. This should be kept high because each check for compaction requires many calls against the NameNode.</description>
-  </property>
-  <property>
-    <name>hive.compactor.delta.num.threshold</name>
-    <value>10</value>
-    <description>Number of delta files that must exist in a directory before the compactor will attempt a minor compaction.</description>
-  </property>
-  <property>
-    <name>hive.compactor.delta.pct.threshold</name>
-    <value>0.1</value>
-    <description>Percentage (by size) of the base that deltas can be before major compaction is initiated.</description>
-  </property>
-  <property>
-    <name>hive.compactor.abortedtxn.threshold</name>
-    <value>1000</value>
-    <description>Number of aborted transactions involving a particular table or partition before major compaction is initiated.</description>
-  </property>
-  <property>
-    <name>hive.hbase.wal.enabled</name>
-    <value>true</value>
-    <description>Whether writes to HBase should be forced to the write-ahead log. Disabling this improves HBase write performance at the risk of lost writes in case of a crash.</description>
-  </property>
-  <property>
-    <name>hive.hbase.generatehfiles</name>
-    <value>false</value>
-    <description>True when HBaseStorageHandler should generate hfiles instead of operating against the online table.</description>
-  </property>
-  <property>
-    <name>hive.archive.enabled</name>
-    <value>false</value>
-    <description>Whether archiving operations are permitted.</description>
-  </property>
-  <property>
-    <name>hive.optimize.index.groupby</name>
-    <value>false</value>
-    <description>Whether to enable optimization of group-by queries using Aggregate indexes.</description>
-  </property>
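A minimal hive-site.xml sketch that turns on the ZooKeeper-backed lock manager described above; the quorum hostnames are placeholders, not values from this template:

  <!-- Enable concurrency control backed by the ZooKeeper lock manager. -->
  <property>
    <name>hive.support.concurrency</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.lock.manager</name>
    <value>org.apache.hadoop.hive.ql.lockmgr.zookeeper.ZooKeeperHiveLockManager</value>
  </property>
  <property>
    <name>hive.zookeeper.quorum</name>
    <!-- Placeholder hosts; list your own ZooKeeper ensemble here. -->
    <value>zk1.example.com,zk2.example.com,zk3.example.com</value>
  </property>
  <property>
    <name>hive.zookeeper.client.port</name>
    <value>2181</value>
  </property>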
-  <property>
-    <name>hive.outerjoin.supports.filters</name>
-    <value>true</value>
-    <description/>
-  </property>
-  <property>
-    <name>hive.fetch.task.conversion</name>
-    <value>more</value>
-    <description>Some SELECT queries can be converted to a single FETCH task, minimizing latency. Currently the query should be single sourced, without any subquery, and should not have any aggregations or distincts (which incur RS), lateral views or joins. 1. minimal : SELECT STAR, FILTER on partition columns, LIMIT only 2. more : SELECT, FILTER, LIMIT only (supports TABLESAMPLE and virtual columns)</description>
-  </property>
-  <property>
-    <name>hive.fetch.task.conversion.threshold</name>
-    <value>1073741824</value>
-    <description>Input threshold for applying hive.fetch.task.conversion. If the target table is native, the input length is calculated by summation of file lengths. If it's not native, the storage handler for the table can optionally implement the org.apache.hadoop.hive.ql.metadata.InputEstimator interface.</description>
-  </property>
-  <property>
-    <name>hive.fetch.task.aggr</name>
-    <value>false</value>
-    <description>Aggregation queries with no group-by clause (for example, select count(*) from src) execute final aggregations in a single reduce task. If this is set to true, Hive delegates the final aggregation stage to a fetch task, possibly decreasing the query time.</description>
-  </property>
-  <property>
-    <name>hive.compute.query.using.stats</name>
-    <value>false</value>
-    <description>When set to true, Hive will answer a few queries like count(1) purely using stats stored in the metastore. For basic stats collection, set the config hive.stats.autogather to true. For more advanced stats collection, analyze table queries need to be run.</description>
-  </property>
-  <property>
-    <name>hive.fetch.output.serde</name>
-    <value>org.apache.hadoop.hive.serde2.DelimitedJSONSerDe</value>
-    <description>The SerDe used by FetchTask to serialize the fetch output.</description>
-  </property>
-  <property>
-    <name>hive.cache.expr.evaluation</name>
-    <value>true</value>
-    <description>If true, the evaluation result of a deterministic expression referenced twice or more will be cached.</description>
-  </property>
-  <property>
-    <name>hive.variable.substitute</name>
-    <value>true</value>
-    <description>This enables substitution using syntax like ${var}, ${system:var} and ${env:var}.</description>
-  </property>
-  <property>
-    <name>hive.variable.substitute.depth</name>
-    <value>40</value>
-    <description>The maximum number of replacements the substitution engine will do.</description>
-  </property>
-  <property>
-    <name>hive.conf.validation</name>
-    <value>true</value>
-    <description>Enables type checking for registered Hive configurations.</description>
-  </property>
-  <property>
-    <name>hive.semantic.analyzer.hook</name>
-    <value/>
-    <description/>
-  </property>
-  <property>
-    <name>hive.security.authorization.enabled</name>
-    <value>false</value>
-    <description>Enable or disable the Hive client authorization.</description>
-  </property>
-  <property>
-    <name>hive.security.authorization.manager</name>
-    <value>org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider</value>
-    <description>The Hive client authorization manager class name. The user-defined authorization class should implement the interface org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider.</description>
-  </property>
-  <property>
-    <name>hive.security.authenticator.manager</name>
-    <value>org.apache.hadoop.hive.ql.security.HadoopDefaultAuthenticator</value>
-    <description>Hive client authenticator manager class name. The user-defined authenticator should implement the interface org.apache.hadoop.hive.ql.security.HiveAuthenticationProvider.</description>
-  </property>
-  <property>
-    <name>hive.security.metastore.authorization.manager</name>
-    <value>org.apache.hadoop.hive.ql.security.authorization.DefaultHiveMetastoreAuthorizationProvider</value>
-    <description>Authorization manager class name to be used in the metastore for authorization. The user-defined authorization class should implement the interface org.apache.hadoop.hive.ql.security.authorization.HiveMetastoreAuthorizationProvider.</description>
-  </property>
-  <property>
-    <name>hive.security.metastore.authenticator.manager</name>
-    <value>org.apache.hadoop.hive.ql.security.HadoopDefaultMetastoreAuthenticator</value>
-    <description>Authenticator manager class name to be used in the metastore for authentication. The user-defined authenticator should implement the interface org.apache.hadoop.hive.ql.security.HiveAuthenticationProvider.</description>
-  </property>
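A minimal sketch enabling client-side authorization with the provider class listed above (the manager value is the default from this template; only the enable flag changes):

  <property>
    <name>hive.security.authorization.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.security.authorization.manager</name>
    <value>org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider</value>
  </property>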
-  <property>
-    <name>hive.security.authorization.createtable.user.grants</name>
-    <value/>
-    <description>The privileges automatically granted to some users whenever a table gets created. An example like "userX,userY:select;userZ:create" will grant select privilege to userX and userY, and grant create privilege to userZ, whenever a new table is created.</description>
-  </property>
-  <property>
-    <name>hive.security.authorization.createtable.group.grants</name>
-    <value/>
-    <description>The privileges automatically granted to some groups whenever a table gets created. An example like "groupX,groupY:select;groupZ:create" will grant select privilege to groupX and groupY, and grant create privilege to groupZ, whenever a new table is created.</description>
-  </property>
-  <property>
-    <name>hive.security.authorization.createtable.role.grants</name>
-    <value/>
-    <description>The privileges automatically granted to some roles whenever a table gets created. An example like "roleX,roleY:select;roleZ:create" will grant select privilege to roleX and roleY, and grant create privilege to roleZ, whenever a new table is created.</description>
-  </property>
-  <property>
-    <name>hive.security.authorization.createtable.owner.grants</name>
-    <value/>
-    <description>The privileges automatically granted to the owner whenever a table gets created. An example like "select,drop" will grant select and drop privilege to the owner of the table.</description>
-  </property>
-  <property>
-    <name>hive.security.authorization.sqlstd.confwhitelist</name>
-    <value/>
-    <description>Internal variable. List of configurations that users are allowed to modify.</description>
-  </property>
-  <property>
-    <name>hive.cli.print.header</name>
-    <value>false</value>
-    <description>Whether to print the names of the columns in query output.</description>
-  </property>
-  <property>
-    <name>hive.error.on.empty.partition</name>
-    <value>false</value>
-    <description>Whether to throw an exception if a dynamic partition insert generates empty results.</description>
-  </property>
-  <property>
-    <name>hive.index.compact.file</name>
-    <value/>
-    <description>Internal variable.</description>
-  </property>
-  <property>
-    <name>hive.index.blockfilter.file</name>
-    <value/>
-    <description>Internal variable.</description>
-  </property>
-  <property>
-    <name>hive.index.compact.file.ignore.hdfs</name>
-    <value>false</value>
-    <description>When true, the HDFS location stored in the index file will be ignored at runtime. If the data got moved or the name of the cluster got changed, the index data should still be usable.</description>
-  </property>
-  <property>
-    <name>hive.exim.uri.scheme.whitelist</name>
-    <value>hdfs,pfile</value>
-    <description>A comma separated list of acceptable URI schemes for import and export.</description>
-  </property>
-  <property>
-    <name>hive.mapper.cannot.span.multiple.partitions</name>
-    <value>false</value>
-    <description/>
-  </property>
-  <property>
-    <name>hive.rework.mapredwork</name>
-    <value>false</value>
-    <description>Whether to rework the mapred work or not. This was first introduced by SymlinkTextInputFormat to replace symlink files with real paths at compile time.</description>
-  </property>
-  <property>
-    <name>hive.exec.concatenate.check.index</name>
-    <value>true</value>
-    <description>If this is set to true, Hive will throw an error when doing 'alter table tbl_name [partSpec] concatenate' on a table/partition that has indexes on it. The reason to set this to true is that it can help users avoid handling all the index drop, recreation and rebuild work. This is very helpful for tables with thousands of partitions.</description>
-  </property>
-  <property>
-    <name>hive.io.exception.handlers</name>
-    <value/>
-    <description>A list of io exception handler class names. This is used to construct a list of exception handlers to handle exceptions thrown by record readers.</description>
-  </property>
-  <property>
-    <name>hive.log4j.file</name>
-    <value/>
-    <description>Hive log4j configuration file. If the property is not set, then logging will be initialized using hive-log4j.properties found on the classpath. If the property is set, the value must be a valid URI (java.net.URI, e.g. "file:///tmp/my-logging.properties"), which you can then extract a URL from and pass to PropertyConfigurator.configure(URL).</description>
-  </property>
-  <property>
-    <name>hive.exec.log4j.file</name>
-    <value/>
-    <description>Hive log4j configuration file for execution mode (sub command). If the property is not set, then logging will be initialized using hive-exec-log4j.properties found on the classpath. If the property is set, the value must be a valid URI (java.net.URI, e.g. "file:///tmp/my-logging.properties"), which you can then extract a URL from and pass to PropertyConfigurator.configure(URL).</description>
-  </property>
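To illustrate the grant syntax documented above, a sketch that gives every new table's owner select/drop and grants privileges to a few example users (the user names are placeholders taken from the description's own example):

  <property>
    <name>hive.security.authorization.createtable.owner.grants</name>
    <value>select,drop</value>
  </property>
  <property>
    <name>hive.security.authorization.createtable.user.grants</name>
    <!-- "users:privileges" groups separated by semicolons. -->
    <value>userX,userY:select;userZ:create</value>
  </property>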
"file:///tmp/my-logging.properties"), - which you can then extract a URL from and pass to PropertyConfigurator.configure(URL). - - - - hive.global.init.file.location - - - The location of HS2 global init file (.hiverc). - If the property is not set, then HS2 will search for the file in $HIVE_CONF_DIR/. - If the property is set, the value must be a valid path where the init file is located. - - - - hive.autogen.columnalias.prefix.label - _c - - String used as a prefix when auto generating column alias. - By default the prefix label will be appended with a column position number to form the column alias. - Auto generation would happen if an aggregate function is used in a select clause without an explicit alias. - - - - hive.autogen.columnalias.prefix.includefuncname - false - Whether to include function name in the column alias auto generated by Hive. - - - hive.exec.perf.logger - org.apache.hadoop.hive.ql.log.PerfLogger - - The class responsible for logging client side performance metrics. - Must be a subclass of org.apache.hadoop.hive.ql.log.PerfLogger - - - - hive.start.cleanup.scratchdir - false - To cleanup the Hive scratchdir when starting the Hive Server - - - hive.insert.into.multilevel.dirs - false - - Where to insert into multilevel directories like - "insert directory '/HIVEFT25686/chinna/' from table" - - - - hive.warehouse.subdir.inherit.perms - false - - Set this to true if the the table directories should inherit the - permission of the warehouse or database directory instead of being created - with the permissions derived from dfs umask - - - - hive.insert.into.external.tables - true - whether insert into external tables is allowed - - - hive.exec.driver.run.hooks - - A comma separated list of hooks which implement HiveDriverRunHook. Will be run at the beginning and end of Driver.run, these will be run in the order specified. - - - hive.ddl.output.format - - - The data format to use for DDL output. One of "text" (for human - readable text) or "json" (for a json object). - - - - hive.entity.separator - @ - Separator used to construct names of tables and partitions. For example, dbname@tablename@partitionname - - - hive.display.partition.cols.separately - true - - In older Hive version (0.10 and earlier) no distinction was made between - partition columns or non-partition columns while displaying columns in describe - table. From 0.12 onwards, they are displayed separately. This flag will let you - get old behavior, if desired. See, test-case in patch for HIVE-6689. - - - - hive.server2.max.start.attempts - 30 - - This number of times HiveServer2 will attempt to start before exiting, sleeping 60 seconds between retries. - The default of 30 will keep trying for 30 minutes. - - - - hive.server2.transport.mode - binary - Server transport mode. "binary" or "http" - - - hive.server2.thrift.http.port - 10001 - Port number when in HTTP mode. - - - hive.server2.thrift.http.path - cliservice - Path component of URL endpoint when in HTTP mode. - - - hive.server2.thrift.http.min.worker.threads - 5 - Minimum number of worker threads when in HTTP mode. - - - hive.server2.thrift.http.max.worker.threads - 500 - Maximum number of worker threads when in HTTP mode. - - - hive.server2.thrift.port - 10000 - - Port number of HiveServer2 Thrift interface. - Can be overridden by setting $HIVE_SERVER2_THRIFT_PORT - - - - hive.server2.thrift.bind.host - - - Bind host on which to run the HiveServer2 Thrift interface. 
-  <property>
-    <name>hive.server2.thrift.sasl.qop</name>
-    <value>auth</value>
-    <description>SASL QOP value; set it to one of the following values to enable higher levels of protection for HiveServer2 communication with clients. "auth" - authentication only (default); "auth-int" - authentication plus integrity protection; "auth-conf" - authentication plus integrity and confidentiality protection. This is applicable only if HiveServer2 is configured to use Kerberos authentication.</description>
-  </property>
-  <property>
-    <name>hive.server2.thrift.min.worker.threads</name>
-    <value>5</value>
-    <description>Minimum number of Thrift worker threads.</description>
-  </property>
-  <property>
-    <name>hive.server2.thrift.max.worker.threads</name>
-    <value>500</value>
-    <description>Maximum number of Thrift worker threads.</description>
-  </property>
-  <property>
-    <name>hive.server2.async.exec.threads</name>
-    <value>100</value>
-    <description>Number of threads in the async thread pool for HiveServer2.</description>
-  </property>
-  <property>
-    <name>hive.server2.async.exec.shutdown.timeout</name>
-    <value>10</value>
-    <description>Time (in seconds) for which HiveServer2 shutdown will wait for async threads to terminate.</description>
-  </property>
-  <property>
-    <name>hive.server2.async.exec.wait.queue.size</name>
-    <value>100</value>
-    <description>Size of the wait queue for the async thread pool in HiveServer2. After hitting this limit, the async thread pool will reject new requests.</description>
-  </property>
-  <property>
-    <name>hive.server2.async.exec.keepalive.time</name>
-    <value>10</value>
-    <description>Time (in seconds) that an idle HiveServer2 async thread (from the thread pool) will wait for a new task to arrive before terminating.</description>
-  </property>
-  <property>
-    <name>hive.server2.long.polling.timeout</name>
-    <value>5000</value>
-    <description>Time in milliseconds that HiveServer2 will wait before responding to asynchronous calls that use long polling.</description>
-  </property>
-  <property>
-    <name>hive.server2.authentication</name>
-    <value>NONE</value>
-    <description>Client authentication types. NONE: no authentication check; LDAP: LDAP/AD based authentication; KERBEROS: Kerberos/GSSAPI authentication; CUSTOM: custom authentication provider (use with the property hive.server2.custom.authentication.class).</description>
-  </property>
-  <property>
-    <name>hive.server2.allow.user.substitution</name>
-    <value>true</value>
-    <description>Allow an alternate user to be specified as part of the HiveServer2 open connection request.</description>
-  </property>
-  <property>
-    <name>hive.server2.authentication.kerberos.keytab</name>
-    <value/>
-    <description>Kerberos keytab file for the server principal.</description>
-  </property>
-  <property>
-    <name>hive.server2.authentication.kerberos.principal</name>
-    <value/>
-    <description>Kerberos server principal.</description>
-  </property>
-  <property>
-    <name>hive.server2.authentication.spnego.keytab</name>
-    <value/>
-    <description>Keytab file for the SPNego principal, optional. A typical value would look like /etc/security/keytabs/spnego.service.keytab. This keytab would be used by HiveServer2 when Kerberos security is enabled and HTTP transport mode is used. This needs to be set only if SPNEGO is to be used in authentication. SPNego authentication would be honored only if valid hive.server2.authentication.spnego.principal and hive.server2.authentication.spnego.keytab are specified.</description>
-  </property>
-  <property>
-    <name>hive.server2.authentication.spnego.principal</name>
-    <value/>
-    <description>SPNego service principal, optional. A typical value would look like HTTP/_HOST@EXAMPLE.COM. The SPNego service principal would be used by HiveServer2 when Kerberos security is enabled and HTTP transport mode is used. This needs to be set only if SPNEGO is to be used in authentication.</description>
-  </property>
-  <property>
-    <name>hive.server2.authentication.ldap.url</name>
-    <value/>
-    <description>LDAP connection URL.</description>
-  </property>
-  <property>
-    <name>hive.server2.authentication.ldap.baseDN</name>
-    <value/>
-    <description>LDAP base DN.</description>
-  </property>
-  <property>
-    <name>hive.server2.authentication.ldap.Domain</name>
-    <value/>
-    <description/>
-  </property>
-  <property>
-    <name>hive.server2.custom.authentication.class</name>
-    <value/>
-    <description>Custom authentication class. Used when the property 'hive.server2.authentication' is set to 'CUSTOM'. The provided class must be a proper implementation of the interface org.apache.hive.service.auth.PasswdAuthenticationProvider. HiveServer2 will call its Authenticate(user, passed) method to authenticate requests. The implementation may optionally extend Hadoop's org.apache.hadoop.conf.Configured class to grab Hive's Configuration object.</description>
-  </property>
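Wiring in a custom password authenticator per the description above; com.example.SimplePasswdAuth is a hypothetical class name standing in for your own PasswdAuthenticationProvider implementation:

  <property>
    <name>hive.server2.authentication</name>
    <value>CUSTOM</value>
  </property>
  <property>
    <name>hive.server2.custom.authentication.class</name>
    <!-- Hypothetical class; must implement org.apache.hive.service.auth.PasswdAuthenticationProvider. -->
    <value>com.example.SimplePasswdAuth</value>
  </property>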
-  <property>
-    <name>hive.server2.authentication.pam.services</name>
-    <value/>
-    <description>List of the underlying PAM services that should be used when the auth type is PAM. A file with the same name must exist in /etc/pam.d.</description>
-  </property>
-  <property>
-    <name>hive.server2.enable.doAs</name>
-    <value>true</value>
-    <description>Setting this property to true will have HiveServer2 execute Hive operations as the user making the calls to it.</description>
-  </property>
-  <property>
-    <name>hive.server2.table.type.mapping</name>
-    <value>CLASSIC</value>
-    <description>This setting reflects how HiveServer2 will report the table types for JDBC and other client implementations that retrieve the available tables and supported table types. HIVE : exposes Hive's native table types like MANAGED_TABLE, EXTERNAL_TABLE, VIRTUAL_VIEW; CLASSIC : more generic types like TABLE and VIEW.</description>
-  </property>
-  <property>
-    <name>hive.server2.session.hook</name>
-    <value/>
-    <description/>
-  </property>
-  <property>
-    <name>hive.server2.use.SSL</name>
-    <value>false</value>
-    <description/>
-  </property>
-  <property>
-    <name>hive.server2.keystore.path</name>
-    <value/>
-    <description/>
-  </property>
-  <property>
-    <name>hive.server2.keystore.password</name>
-    <value/>
-    <description/>
-  </property>
-  <property>
-    <name>hive.security.command.whitelist</name>
-    <value>set,reset,dfs,add,delete,compile</value>
-    <description>Comma separated list of non-SQL Hive commands that users are authorized to execute.</description>
-  </property>
-  <property>
-    <name>hive.conf.restricted.list</name>
-    <value>hive.security.authenticator.manager,hive.security.authorization.manager</value>
-    <description>Comma separated list of configuration options which are immutable at runtime.</description>
-  </property>
-  <property>
-    <name>hive.multi.insert.move.tasks.share.dependencies</name>
-    <value>false</value>
-    <description>If this is set, all move tasks for tables/partitions (not directories) at the end of a multi-insert query will only begin once the dependencies for all these move tasks have been met. Advantages: if concurrency is enabled, the locks will only be released once the query has finished, so with this config enabled, the time when the table/partition is generated will be much closer to when the lock on it is released. Disadvantages: if concurrency is not enabled, with this disabled, the tables/partitions which are produced by this query and finish earlier will be available for querying much earlier. Since the locks are only released once the query finishes, this does not apply if concurrency is enabled.</description>
-  </property>
-  <property>
-    <name>hive.exec.infer.bucket.sort</name>
-    <value>false</value>
-    <description>If this is set, when writing partitions, the metadata will include the bucketing/sorting properties with which the data was written, if any (this will not overwrite the metadata inherited from the table if the table is bucketed/sorted).</description>
-  </property>
-  <property>
-    <name>hive.exec.infer.bucket.sort.num.buckets.power.two</name>
-    <value>false</value>
-    <description>If this is set, when setting the number of reducers for the map reduce task which writes the final output files, it will choose a number which is a power of two, unless the user specifies the number of reducers to use via mapred.reduce.tasks. The number of reducers may be set to a power of two only to be followed by a merge task, preventing anything from being inferred. With hive.exec.infer.bucket.sort set to true: Advantages: if this is not set, the number of buckets for partitions will seem arbitrary, which means that the number of mappers used for optimized joins, for example, will be very low. With this set, since the number of buckets used for any partition is a power of two, the number of mappers used for optimized joins will be the least number of buckets used by any partition being joined. Disadvantages: this may mean a much larger or much smaller number of reducers being used in the final map reduce job, e.g. if a job was originally going to take 257 reducers, it will now take 512 reducers; similarly, if the max number of reducers is 511, and a job was going to use this many, it will now use 256 reducers.</description>
-  </property>
-  <property>
-    <name>hive.merge.current.job.concatenate.list.bucketing</name>
-    <value>true</value>
-    <description/>
-  </property>
-  <property>
-    <name>hive.merge.current.job.concatenate.list.bucketing.depth</name>
-    <value>0</value>
-    <description/>
-  </property>
-  <property>
-    <name>hive.optimize.listbucketing</name>
-    <value>false</value>
-    <description>Enable the list bucketing optimizer. The default value is false, so it is disabled by default.</description>
-  </property>
-  <property>
-    <name>hive.server.read.socket.timeout</name>
-    <value>10</value>
-    <description>Timeout for the HiveServer to close the connection if there is no response from the client in N seconds; defaults to 10 seconds.</description>
-  </property>
-  <property>
-    <name>hive.server.tcp.keepalive</name>
-    <value>true</value>
-    <description>Whether to enable TCP keepalive for the Hive server. Keepalive will prevent accumulation of half-open connections.</description>
-  </property>
-  <property>
-    <name>hive.decode.partition.name</name>
-    <value>false</value>
-    <description>Whether to show the unquoted partition names in query results.</description>
-  </property>
-  <property>
-    <name>hive.execution.engine</name>
-    <value>mr</value>
-    <description>Chooses the execution engine. Options are: mr (MapReduce, default) or tez (Hadoop 2 only).</description>
-  </property>
-  <property>
-    <name>hive.jar.directory</name>
-    <value/>
-    <description>This is the location that Hive in Tez mode will look in to find a site-wide installed Hive instance.</description>
-  </property>
-  <property>
-    <name>hive.user.install.directory</name>
-    <value>hdfs:///user/</value>
-    <description>If Hive (in Tez mode only) cannot find a usable Hive jar in "hive.jar.directory", it will upload the Hive jar to "hive.user.install.directory/user.name" and use it to run queries.</description>
-  </property>
-  <property>
-    <name>hive.vectorized.execution.enabled</name>
-    <value>false</value>
-    <description>This flag should be set to true to enable vectorized mode of query execution. The default value is false.</description>
-  </property>
-  <property>
-    <name>hive.vectorized.groupby.checkinterval</name>
-    <value>100000</value>
-    <description>Number of entries added to the group by aggregation hash before a recomputation of the average entry size is performed.</description>
-  </property>
-  <property>
-    <name>hive.vectorized.groupby.maxentries</name>
-    <value>1000000</value>
-    <description>Max number of entries in the vector group by aggregation hashtables. Exceeding this will trigger a flush regardless of memory pressure.</description>
-  </property>
-  <property>
-    <name>hive.vectorized.groupby.flush.percent</name>
-    <value>0.1</value>
-    <description>Percent of entries in the group by aggregation hash flushed when the memory threshold is exceeded.</description>
-  </property>
-  <property>
-    <name>hive.typecheck.on.insert</name>
-    <value>true</value>
-    <description/>
-  </property>
-  <property>
-    <name>hive.rpc.query.plan</name>
-    <value>false</value>
-    <description>Whether to send the query plan via local resource or RPC.</description>
-  </property>
-  <property>
-    <name>hive.compute.splits.in.am</name>
-    <value>true</value>
-    <description>Whether to generate the splits locally or in the AM (Tez only).</description>
-  </property>
-  <property>
-    <name>hive.prewarm.enabled</name>
-    <value>false</value>
-    <description>Enables container prewarm for Tez (Hadoop 2 only).</description>
-  </property>
-  <property>
-    <name>hive.prewarm.numcontainers</name>
-    <value>10</value>
-    <description>Controls the number of containers to prewarm for Tez (Hadoop 2 only).</description>
-  </property>
-  <property>
-    <name>hive.stageid.rearrange</name>
-    <value>none</value>
-    <description/>
-  </property>
-  <property>
-    <name>hive.explain.dependency.append.tasktype</name>
-    <value>false</value>
-    <description/>
-  </property>
-  <property>
-    <name>hive.counters.group.name</name>
-    <value>HIVE</value>
-    <description>The name of the counter group for internal Hive variables (CREATED_FILE, FATAL_ERROR, etc.).</description>
-  </property>
-  <property>
-    <name>hive.server2.tez.default.queues</name>
-    <value/>
-    <description>A list of comma separated values corresponding to YARN queues of the same name. When HiveServer2 is launched in Tez mode, this configuration needs to be set for multiple Tez sessions to run in parallel on the cluster.</description>
-  </property>
-  <property>
-    <name>hive.server2.tez.sessions.per.default.queue</name>
-    <value>1</value>
-    <description>A positive integer that determines the number of Tez sessions that should be launched on each of the queues specified by "hive.server2.tez.default.queues". Determines the parallelism on each queue.</description>
-  </property>
-  <property>
-    <name>hive.server2.tez.initialize.default.sessions</name>
-    <value>false</value>
-    <description>This flag is used in HiveServer2 to enable a user to use HiveServer2 without turning on Tez for HiveServer2. The user could potentially want to run queries over Tez without the pool of sessions.</description>
-  </property>
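A sketch that opts a deployment into the Tez engine and vectorized execution discussed above (both default to off in this template; prewarm is optional and shown with its default container count):

  <property>
    <name>hive.execution.engine</name>
    <value>tez</value>
  </property>
  <property>
    <name>hive.vectorized.execution.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.prewarm.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.prewarm.numcontainers</name>
    <value>10</value>
  </property>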
-  <property>
-    <name>hive.support.quoted.identifiers</name>
-    <value>column</value>
-    <description>Whether to use quoted identifiers. 'none' or 'column' can be used. none: default (past) behavior, implies only alphanumeric and underscore are valid characters in identifiers. column: implies column names can contain any character.</description>
-  </property>
-  <property>
-    <name>hive.users.in.admin.role</name>
-    <value/>
-    <description>Comma separated list of users who are in admin role for bootstrapping. More users can be added to the ADMIN role later.</description>
-  </property>
-  <property>
-    <name>hive.compat</name>
-    <value>0.12</value>
-    <description>Enable (configurable) deprecated behaviors by setting the desired level of backward compatibility. Setting to 0.12 maintains division behavior: int / int = double.</description>
-  </property>
-  <property>
-    <name>hive.convert.join.bucket.mapjoin.tez</name>
-    <value>false</value>
-    <description>Whether joins can be automatically converted to bucket map joins in Hive when Tez is used as the execution engine.</description>
-  </property>
-  <property>
-    <name>hive.exec.check.crossproducts</name>
-    <value>true</value>
-    <description>Check if a plan contains a cross product. If there is one, output a warning to the session's console.</description>
-  </property>
-  <property>
-    <name>hive.localize.resource.wait.interval</name>
-    <value>5000</value>
-    <description>Time in milliseconds to wait for another thread to localize the same resource for hive-tez.</description>
-  </property>
-  <property>
-    <name>hive.localize.resource.num.wait.attempts</name>
-    <value>5</value>
-    <description>The number of attempts waiting for localizing a resource in hive-tez.</description>
-  </property>
-  <property>
-    <name>hive.tez.auto.reducer.parallelism</name>
-    <value>false</value>
-    <description>Turn on Tez' auto reducer parallelism feature. When enabled, Hive will still estimate data sizes and set parallelism estimates. Tez will sample source vertices' output sizes and adjust the estimates at runtime as necessary.</description>
-  </property>
-  <property>
-    <name>hive.tez.max.partition.factor</name>
-    <value>2.0</value>
-    <description>When auto reducer parallelism is enabled, this factor will be used to over-partition data in shuffle edges.</description>
-  </property>
-  <property>
-    <name>hive.tez.min.partition.factor</name>
-    <value>0.25</value>
-    <description>When auto reducer parallelism is enabled, this factor will be used to put a lower limit on the number of reducers that Tez specifies.</description>
-  </property>
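Finally, a sketch enabling the auto reducer parallelism feature described in the last three properties. Roughly speaking, with these factors and a Hive-side estimate of 100 reducers, Tez may adjust the count anywhere between 25 (0.25 x 100) and 200 (2.0 x 100) at runtime based on sampled output sizes:

  <property>
    <name>hive.tez.auto.reducer.parallelism</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.tez.max.partition.factor</name>
    <value>2.0</value>
  </property>
  <property>
    <name>hive.tez.min.partition.factor</name>
    <value>0.25</value>
  </property>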