Here are the instructions for setting up Tez on your Hadoop 2 cluster:
https://github.com/apache/incubator-tez/blob/branch-0.2.0/INSTALL.txt
Notes:
- I start Hive with "hive -hiveconf hive.execution.engine=tez". This is not strictly necessary, but it starts the AM/containers right away instead of on the first query.
- The hive-exec jar should be copied to
hdfs:///user/hive/ (the location can be changed with hive.jar.directory). This avoids re-localization of the Hive jar.
Hive settings:
// needed because SMB (sort-merge-bucket) joins aren't supported on Tez yet
set hive.optimize.bucketmapjoin=false;
set hive.optimize.bucketmapjoin.sortedmerge=false;
set hive.auto.convert.sortmerge.join=false;
set hive.auto.convert.sortmerge.join.noconditionaltask=false;
set hive.auto.convert.join.noconditionaltask=true;
// sizes depend on your available memory/cluster, but the map and reduce memory.mb values should be set to the same number so containers can be reused
set hive.auto.convert.join.noconditionaltask.size=64000000;
set mapred.map.child.java.opts=-server -Xmx3584m -Djava.net.preferIPv4Stack=true;
set mapred.reduce.child.java.opts=-server -Xmx3584m -Djava.net.preferIPv4Stack=true;
set mapreduce.map.memory.mb=4096;
set mapreduce.reduce.memory.mb=4096;
// generic opts
set hive.optimize.reducededuplication.min.reducer=1;
set hive.optimize.mapjoin.mapreduce=true;
// autogather might require you to raise the maximum number of counters if you run into issues
set hive.stats.autogather=true;
set hive.stats.dbclass=counter;
// tez settings can also go into tez-site if desired
set mapreduce.map.output.compress=true;
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.DefaultCodec;
set tez.runtime.intermediate-output.should-compress=true;
set tez.runtime.intermediate-output.compress.codec=org.apache.hadoop.io.compress.DefaultCodec;
set tez.runtime.intermediate-input.is-compressed=true;
set tez.runtime.intermediate-input.compress.codec=org.apache.hadoop.io.compress.DefaultCodec;
// Tez groups input splits in the AM, so use the plain HiveInputFormat here
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set hive.orc.splits.include.file.footer=true;
set hive.root.logger=ERROR,console;
set hive.execution.engine=tez;
set hive.vectorized.execution.enabled=true;
set hive.exec.local.cache=true;
set hive.compute.query.using.stats=true;
For Tez (these properties go into tez-site):
<property>
<name>tez.am.resource.memory.mb</name>
<value>8192</value>
</property>
<property>
<name>tez.am.java.opts</name>
<value>-server -Xmx7168m -Djava.net.preferIPv4Stack=true</value>
</property>
<property>
<name>tez.am.grouping.min-size</name>
<value>16777216</value>
</property>
<!-- Client Submission timeout value when submitting DAGs to a session -->
<property>
<name>tez.session.client.timeout.secs</name>
<value>-1</value>
</property>
<!-- prewarm stuff -->
<property>
<name>tez.session.pre-warm.enabled</name>
<value>true</value>
</property>
<property>
<name>tez.session.pre-warm.num.containers</name>
<value>10</value>
</property>
<property>
<name>tez.am.grouping.split-waves</name>
<value>0.9</value>
</property>
<property>
<name>tez.am.container.reuse.enabled</name>
<value>true</value>
</property>
<property>
<name>tez.am.container.reuse.rack-fallback.enabled</name>
<value>true</value>
</property>
<property>
<name>tez.am.container.reuse.non-local-fallback.enabled</name>
<value>true</value>
</property>
<property>
<name>tez.am.container.session.delay-allocation-millis</name>
<value>-1</value>
</property>
<property>
<name>tez.am.container.reuse.locality.delay-allocation-millis</name>
<value>250</value>
</property>