diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index fa3e048..ee1ea81 100644
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -223,6 +223,9 @@
// ignore the mapjoin hint
HIVEIGNOREMAPJOINHINT("hive.ignore.mapjoin.hint", true),
+ // maximum number of footer lines a user can set for a table file
+ HIVE_FILE_MAX_FOOTER("hive.file.max.footer", 100),
+
// Hadoop Configuration Properties
// Properties with null values are ignored and exist only for the purpose of giving us
// a symbolic name to reference in the Hive source code. Properties with non-null
diff --git conf/hive-default.xml.template conf/hive-default.xml.template
index c61a0bb..4a35ee4 100644
--- conf/hive-default.xml.template
+++ conf/hive-default.xml.template
@@ -386,6 +386,12 @@
+ hive.file.max.footer
+ 100
+ Maximum number of footer lines a user can define for a table file
+
+
+
hive.map.aggr
true
Whether to use map-side aggregation in Hive Group By queries
diff --git data/files/header_footer_table_1/0001.txt data/files/header_footer_table_1/0001.txt
new file mode 100644
index 0000000..c242b42
--- /dev/null
+++ data/files/header_footer_table_1/0001.txt
@@ -0,0 +1,8 @@
+name message 0
+steven hive 1
+dave oozie 2
+xifa phd 3
+chuan hadoop 4
+shanyu senior 5
+footer1 footer1 0
+footer2 0
\ No newline at end of file
diff --git data/files/header_footer_table_1/0002.txt data/files/header_footer_table_1/0002.txt
new file mode 100644
index 0000000..d5db38d
--- /dev/null
+++ data/files/header_footer_table_1/0002.txt
@@ -0,0 +1,8 @@
+name message 0
+steven2 hive 11
+dave2 oozie 12
+xifa2 phd 13
+chuan2 hadoop 14
+shanyu2 senior 15
+footer1 footer1 0
+footer2 0
\ No newline at end of file
diff --git data/files/header_footer_table_1/0003.txt data/files/header_footer_table_1/0003.txt
new file mode 100644
index 0000000..f7a763d
--- /dev/null
+++ data/files/header_footer_table_1/0003.txt
@@ -0,0 +1,4 @@
+name message 0
+david3 oozie 22
+footer1 footer1 0
+footer2 0
\ No newline at end of file
diff --git data/files/header_footer_table_2/2012/01/01/0001.txt data/files/header_footer_table_2/2012/01/01/0001.txt
new file mode 100644
index 0000000..c242b42
--- /dev/null
+++ data/files/header_footer_table_2/2012/01/01/0001.txt
@@ -0,0 +1,8 @@
+name message 0
+steven hive 1
+dave oozie 2
+xifa phd 3
+chuan hadoop 4
+shanyu senior 5
+footer1 footer1 0
+footer2 0
\ No newline at end of file
diff --git data/files/header_footer_table_2/2012/01/02/0002.txt data/files/header_footer_table_2/2012/01/02/0002.txt
new file mode 100644
index 0000000..d5db38d
--- /dev/null
+++ data/files/header_footer_table_2/2012/01/02/0002.txt
@@ -0,0 +1,8 @@
+name message 0
+steven2 hive 11
+dave2 oozie 12
+xifa2 phd 13
+chuan2 hadoop 14
+shanyu2 senior 15
+footer1 footer1 0
+footer2 0
\ No newline at end of file
diff --git data/files/header_footer_table_2/2012/01/03/0003.txt data/files/header_footer_table_2/2012/01/03/0003.txt
new file mode 100644
index 0000000..f7a763d
--- /dev/null
+++ data/files/header_footer_table_2/2012/01/03/0003.txt
@@ -0,0 +1,4 @@
+name message 0
+david3 oozie 22
+footer1 footer1 0
+footer2 0
\ No newline at end of file
diff --git itests/qtest/pom.xml itests/qtest/pom.xml
index c3cbb89..5dba96d 100644
--- itests/qtest/pom.xml
+++ itests/qtest/pom.xml
@@ -36,8 +36,8 @@
false
false
- stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q
- cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q
+ stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q,file_with_header_footer.q
+ cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q,file_with_header_footer_negative.q
add_part_exist.q,alter1.q,alter2.q,alter4.q,alter5.q,alter_rename_partition.q,alter_rename_partition_authorization.q,archive.q,archive_corrupt.q,archive_multi.q,archive_mr_1806.q,archive_multi_mr_1806.q,authorization_1.q,authorization_2.q,authorization_4.q,authorization_5.q,authorization_6.q,authorization_7.q,ba_table1.q,ba_table2.q,ba_table3.q,ba_table_udfs.q,binary_table_bincolserde.q,binary_table_colserde.q,cluster.q,columnarserde_create_shortcut.q,combine2.q,constant_prop.q,create_nested_type.q,create_or_replace_view.q,create_struct_table.q,create_union_table.q,database.q,database_location.q,database_properties.q,ddltime.q,describe_database_json.q,drop_database_removes_partition_dirs.q,escape1.q,escape2.q,exim_00_nonpart_empty.q,exim_01_nonpart.q,exim_02_00_part_empty.q,exim_02_part.q,exim_03_nonpart_over_compat.q,exim_04_all_part.q,exim_04_evolved_parts.q,exim_05_some_part.q,exim_06_one_part.q,exim_07_all_part_over_nonoverlap.q,exim_08_nonpart_rename.q,exim_09_part_spec_nonoverlap.q,exim_10_external_managed.q,exim_11_managed_external.q,exim_12_external_location.q,exim_13_managed_location.q,exim_14_managed_location_over_existing.q,exim_15_external_part.q,exim_16_part_external.q,exim_17_part_managed.q,exim_18_part_external.q,exim_19_00_part_external_location.q,exim_19_part_external_location.q,exim_20_part_managed_location.q,exim_21_export_authsuccess.q,exim_22_import_exist_authsuccess.q,exim_23_import_part_authsuccess.q,exim_24_import_nonexist_authsuccess.q,global_limit.q,groupby_complex_types.q,groupby_complex_types_multi_single_reducer.q,index_auth.q,index_auto.q,index_auto_empty.q,index_bitmap.q,index_bitmap1.q,index_bitmap2.q,index_bitmap3.q,index_bitmap_auto.q,index_bitmap_rc.q,index_compact.q,index_compact_1.q,index_compact_2.q,index_compact_3.q,index_stale_partitioned.q,init_file.q,input16.q,input16_cc.q,input46.q,input_columnarserde.q,input_dynamicserde.q,input_lazyserde.q,input_testxpath3.q,input_testxpath4.q,insert2_overwrite_partitions.q,insertexternal
1.q,join_thrift.q,lateral_view.q,load_binary_data.q,load_exist_part_authsuccess.q,load_nonpart_authsuccess.q,load_part_authsuccess.q,loadpart_err.q,lock1.q,lock2.q,lock3.q,lock4.q,merge_dynamic_partition.q,multi_insert.q,multi_insert_move_tasks_share_dependencies.q,null_column.q,ppd_clusterby.q,query_with_semi.q,rename_column.q,sample6.q,sample_islocalmode_hook.q,set_processor_namespaces.q,show_tables.q,source.q,split_sample.q,str_to_map.q,transform1.q,udaf_collect_set.q,udaf_context_ngrams.q,udaf_histogram_numeric.q,udaf_ngrams.q,udaf_percentile_approx.q,udf_array.q,udf_bitmap_and.q,udf_bitmap_or.q,udf_explode.q,udf_format_number.q,udf_map.q,udf_map_keys.q,udf_map_values.q,udf_max.q,udf_min.q,udf_named_struct.q,udf_percentile.q,udf_printf.q,udf_sentences.q,udf_sort_array.q,udf_split.q,udf_struct.q,udf_substr.q,udf_translate.q,udf_union.q,udf_xpath.q,udtf_stack.q,view.q,virtual_column.q
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java
index d2b2526..cf66a85 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java
@@ -37,6 +37,7 @@
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
import org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader;
+import org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.KVPair;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.HiveRecordReader;
import org.apache.hadoop.hive.ql.metadata.HiveException;
@@ -46,6 +47,7 @@
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
+import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.DelegatedObjectInspectorFactory;
@@ -80,6 +82,10 @@
private PartitionDesc currPart;
private TableDesc currTbl;
private boolean tblDataDone;
+ private ArrayList footerBuf = null;
+ private int footerCur = 0;
+ private int headerCount = 0;
+ private int footerCount = 0;
private boolean hasVC;
private boolean isPartitioned;
@@ -520,6 +526,37 @@ protected void pushRow(InspectableObject row) throws HiveException {
private transient final InspectableObject inspectable = new InspectableObject();
+  /**
+   * Consumes the leading header rows of the file currently being read.
+   *
+   * @param headerCount number of header rows to discard
+   * @return false if the reader ran out of rows before all header rows were
+   *         consumed, true otherwise
+   * @throws IOException if the underlying record reader fails
+   */
+  private boolean skipHeader(int headerCount) throws IOException {
+    for (int skipped = 0; skipped < headerCount; skipped++) {
+      boolean hasRow = currRecReader.next(key, value);
+      if (!hasRow) {
+        // file ended while still inside the header
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Pre-loads the footer buffer with the next {@code footerCount} rows of the
+   * current file. Nothing is buffered when {@code footerCount} is not positive.
+   *
+   * @return false if the reader reached end of file before the buffer could be
+   *         filled, true otherwise
+   * @throws IOException if the underlying record reader fails
+   */
+  private boolean initializefooterBuf() throws IOException {
+    if (footerCount <= 0) {
+      // table format has no footer rows, so there is nothing to buffer
+      return true;
+    }
+    footerBuf = new ArrayList();
+    for (int filled = 0; filled < footerCount; filled++) {
+      if (!currRecReader.next(key, value)) {
+        // reach the end of the file
+        return false;
+      }
+      // store copies, because key/value are reused by the next read
+      KVPair entry = new KVPair();
+      entry.key = ReflectionUtils.copy(job, key, entry.key);
+      entry.value = ReflectionUtils.copy(job, value, entry.value);
+      footerBuf.add(entry);
+    }
+    footerCur = 0;
+    return true;
+  }
+
/**
* Get the next row. The fetch context is modified appropriately.
*
@@ -527,6 +564,7 @@ protected void pushRow(InspectableObject row) throws HiveException {
public InspectableObject getNextRow() throws IOException {
try {
while (true) {
+ boolean ret = true;
if (context != null) {
context.resetRow();
}
@@ -535,9 +573,71 @@ public InspectableObject getNextRow() throws IOException {
if (currRecReader == null) {
return null;
}
+ // start reading a new file
+ // if file contains header, skip header lines before reading the records
+ // if file contains footer, use a fixed size queue as a buffer when
+ // reading the records so that the footer lines are kept inside the queue
+ // when the file reaches the end
+ headerCount = 0;
+ footerCount = 0;
+ TableDesc table = null;
+ if (currTbl != null) {
+ table = currTbl;
+ } else if (currPart != null) {
+ table = currPart.getTableDesc();
+ }
+ if (table != null) {
+ try {
+ headerCount = Integer.parseInt(table.getProperties()
+ .getProperty(serdeConstants.HEADER_COUNT, "0"));
+ } catch (NumberFormatException nfe) {
+ // throw exception if the header is not set to a number
+ throw new IOException(nfe);
+ }
+ try {
+ footerCount = Integer.parseInt(table.getProperties()
+ .getProperty(serdeConstants.FOOTER_COUNT, "0"));
+ if (footerCount > HiveConf.getIntVar(job, HiveConf.ConfVars.HIVE_FILE_MAX_FOOTER)) {
+ LOG.info("Footer for table " + table.getTableName() +
+ " exceed max footer length, setting footer rows to " +
+ HiveConf.getIntVar(job, HiveConf.ConfVars.HIVE_FILE_MAX_FOOTER));
+ throw new IOException("footer number exceeds the limit defined in hive.file.max.footer");
+ }
+ } catch (NumberFormatException nfe) {
+ // throw exception if the footer is not set to a number
+ throw new IOException(nfe);
+ }
+ }
+ // skip header lines
+ ret = skipHeader(headerCount);
+ // initialize footer buffer
+ if (ret)
+ ret = initializefooterBuf();
+ }
+        if (ret && footerBuf == null) {
+          // when file doesn't end after skipping header line
+          // and there are no footer lines, read normally.
+          // Exactly one record must be read per loop iteration: a second
+          // next(key, value) call here would overwrite the record just read
+          // and silently drop every other row of tables without footers.
+          ret = currRecReader.next(key, value);
+        }
+ if (ret && footerBuf != null) {
+ // when table files have footer rows
+ key = ReflectionUtils.copy(job, (WritableComparable)footerBuf.get(footerCur).getKey(), key);
+ value = ReflectionUtils.copy(job, (Writable)footerBuf.get(footerCur).getValue(), value);
+ ret = currRecReader.next((WritableComparable)footerBuf.get(footerCur).getKey(),
+ (Writable)footerBuf.get(footerCur).getValue());
+ if (ret) {
+ footerCur = (++footerCur) % footerBuf.size();
+ } else {
+ // file reach the end, need to nullify key/value
+ key = ReflectionUtils.copy(job, (WritableComparable)footerBuf.get(footerCur).getKey(), key);
+ value = ReflectionUtils.copy(job, (Writable)footerBuf.get(footerCur).getValue(), value);
+ }
}
-
- boolean ret = currRecReader.next(key, value);
if (ret) {
if (operator != null && context != null && context.inputFileChanged()) {
// The child operators cleanup if input file has changed
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/HiveContextAwareRecordReader.java ql/src/java/org/apache/hadoop/hive/ql/io/HiveContextAwareRecordReader.java
index dd5cb6b..10b7da6 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/HiveContextAwareRecordReader.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveContextAwareRecordReader.java
@@ -20,24 +20,35 @@
import java.io.IOException;
import java.util.ArrayList;
+import java.util.LinkedList;
import java.util.List;
+import java.util.Map;
+import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil;
+import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.IOContext.Comparison;
+import org.apache.hadoop.hive.ql.plan.PartitionDesc;
+import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan;
+import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.util.ReflectionUtils;
/** This class prepares an IOContext, and provides the ability to perform a binary search on the
* data. The binary search can be used by setting the value of inputFormatSorted in the
@@ -217,6 +228,62 @@ public float getProgress() throws IOException {
}
}
+  /**
+   * Mutable holder for one key/value record, used as the element type of the
+   * footer buffer.
+   *
+   * The class declares its own type parameters: being a static nested class,
+   * it cannot refer to the type variables of the enclosing reader.
+   *
+   * @param <K> key type
+   * @param <V> value type
+   */
+  public static class KVPair<K, V> {
+    // fields stay public and non-final: callers assign them directly
+    public K key;
+    public V value;
+
+    /** Creates a pair holding the given key and value. */
+    public KVPair(K key, V value) {
+      this.key = key;
+      this.value = value;
+    }
+
+    /** Creates an empty pair whose key and value are both null. */
+    public KVPair() {
+      this(null, null);
+    }
+
+    /** @return the key currently stored in this pair, possibly null */
+    public K getKey() {
+      return this.key;
+    }
+
+    /** @return the value currently stored in this pair, possibly null */
+    public V getValue() {
+      return this.value;
+    }
+  }
+
+ private ArrayList footerBuf = null;
+ private int footerCur = 0;
+ private int headerCount = 0;
+ private int footerCount = 0;
+
+  /**
+   * Consumes the leading header rows from the wrapped record reader.
+   *
+   * @param headerCount number of header rows to discard
+   * @param key scratch key object the discarded rows are read into
+   * @param value scratch value object the discarded rows are read into
+   * @return false if the reader ran out of rows before all header rows were
+   *         consumed, true otherwise
+   * @throws IOException if the wrapped record reader fails
+   */
+  private boolean skipHeader(int headerCount, K key, V value) throws IOException {
+    for (int skipped = 0; skipped < headerCount; skipped++) {
+      boolean hasRow = recordReader.next(key, value);
+      if (!hasRow) {
+        // file ended while still inside the header
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Pre-loads the footer buffer with the next {@code footerCount} rows of the
+   * wrapped record reader. Nothing is buffered when {@code footerCount} is not
+   * positive.
+   *
+   * @param key scratch key object each row is read into before being copied
+   * @param value scratch value object each row is read into before being copied
+   * @return false if the reader reached end of file before the buffer could be
+   *         filled, true otherwise
+   * @throws IOException if the wrapped record reader fails
+   */
+  private boolean initializefooterBuf(K key, V value) throws IOException {
+    if (footerCount <= 0) {
+      // table format has no footer rows, so there is nothing to buffer
+      return true;
+    }
+    footerBuf = new ArrayList();
+    for (int filled = 0; filled < footerCount; filled++) {
+      if (!recordReader.next(key, value)) {
+        // reach the end of the file
+        return false;
+      }
+      // store copies, because key/value are reused by the next read
+      KVPair entry = new KVPair();
+      entry.key = ReflectionUtils.copy(jobConf, key, entry.key);
+      entry.value = ReflectionUtils.copy(jobConf, value, entry.value);
+      footerBuf.add(entry);
+    }
+    footerCur = 0;
+    return true;
+  }
+
public boolean doNext(K key, V value) throws IOException {
if (this.isSorted) {
if (this.getIOContext().shouldEndBinarySearch() ||
@@ -271,7 +338,71 @@ public boolean doNext(K key, V value) throws IOException {
}
try {
- return recordReader.next(key, value);
+ // when start reading new file, check header, footer rows
+ // if file contains header, skip header lines before reading the records
+ // if file contains footer, use a fixed size queue as a buffer when
+ // reading the records so that the footer lines are kept inside the queue
+ // when the file reaches the end
+ if (this.ioCxtRef.getCurrentBlockStart() == 0) {
+ // check if the table file has header to skip
+ Path filePath = this.ioCxtRef.getInputPath();
+ PartitionDesc part = null;
+ try {
+ Map pathToPartitionInfo = Utilities
+ .getMapWork(jobConf).getPathToPartitionInfo();
+ part = HiveFileFormatUtils
+ .getPartitionDescFromPathRecursively(pathToPartitionInfo,
+ filePath, IOPrepareCache.get().getPartitionDescMap());
+ } catch (Exception e) {
+ LOG.info("Cannot get partition description from " + this.ioCxtRef.getInputPath()
+ + "because " + e.getMessage());
+ part = null;
+ }
+ TableDesc table = (part == null) ? null : part.getTableDesc();
+ if (table != null) {
+ try {
+ headerCount = Integer.parseInt(table.getProperties()
+ .getProperty(serdeConstants.HEADER_COUNT, "0"));
+ } catch (NumberFormatException nfe) {
+ // throw exception if the header is not set to a number
+ throw new IOException(nfe);
+ }
+ try {
+ footerCount = Integer.parseInt(table.getProperties()
+ .getProperty(serdeConstants.FOOTER_COUNT, "0"));
+ if (footerCount > HiveConf.getIntVar(jobConf, HiveConf.ConfVars.HIVE_FILE_MAX_FOOTER)) {
+ LOG.info("Footer for table " + table.getTableName() +
+ " exceed max footer length, setting footer rows to " +
+ HiveConf.getIntVar(jobConf, HiveConf.ConfVars.HIVE_FILE_MAX_FOOTER));
+ throw new IOException("footer number exceeds the limit defined in hive.file.max.footer");
+ }
+ } catch (NumberFormatException nfe) {
+ // throw exception if the footer is not set to a number
+ throw new IOException(nfe);
+ }
+ }
+ // if input contains header, skip header
+ if (!skipHeader(headerCount, key, value))
+ return false;
+ if (!initializefooterBuf(key, value))
+ return false;
+ }
+ if (footerBuf == null) {
+ // table files don't have footer rows
+ return recordReader.next(key, value);
+ } else {
+ // table files have footer rows
+ key = (K)ReflectionUtils.copy(jobConf, footerBuf.get(footerCur).getKey(), key);
+ value = (V)ReflectionUtils.copy(jobConf, footerBuf.get(footerCur).getValue(), value);
+ boolean ret = recordReader.next(footerBuf.get(footerCur).getKey(), footerBuf.get(footerCur).getValue());
+ if (ret) {
+ footerCur = (++footerCur) % footerBuf.size();
+ } else {
+ key = (K)ReflectionUtils.copy(jobConf, footerBuf.get(footerCur).getKey(), key);
+ value = (V)ReflectionUtils.copy(jobConf, footerBuf.get(footerCur).getValue(), value);
+ }
+ return ret;
+ }
} catch (Exception e) {
return HiveIOExceptionHandlerUtil.handleRecordReaderNextException(e, jobConf);
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
index 974a5d6..ca07f62 100755
--- ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
@@ -33,6 +33,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil;
+import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
@@ -42,6 +43,7 @@
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.Writable;
@@ -290,6 +292,29 @@ protected void init(JobConf job) {
FileInputFormat.setInputPaths(newjob, dir);
newjob.setInputFormat(inputFormat.getClass());
+ TableDesc tableDesc = part.getTableDesc();
+ int headerCount = 0;
+ int footerCount = 0;
+ if (tableDesc != null) {
+ try {
+ headerCount = Integer.parseInt(tableDesc.getProperties()
+ .getProperty(serdeConstants.HEADER_COUNT, "0"));
+ } catch (NumberFormatException nfe) {
+ //set header count to 0 if an invalid property is passed
+ headerCount = 0;
+ }
+ try {
+ footerCount = Integer.parseInt(tableDesc.getProperties()
+ .getProperty(serdeConstants.FOOTER_COUNT, "0"));
+ } catch (NumberFormatException nfe) {
+ //set footer count to 0 if an invalid property is passed
+ footerCount = 0;
+ }
+ if (headerCount != 0 || footerCount != 0) {
+ //input file has header or footer, so it cannot be split
+ newjob.setLong("mapred.min.split.size", Long.MAX_VALUE);
+ }
+ }
InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length);
for (InputSplit is : iss) {
result.add(new HiveInputSplit(is, inputFormatClass.getName()));
diff --git ql/src/test/org/apache/hadoop/hive/ql/io/TestHiveBinarySearchRecordReader.java ql/src/test/org/apache/hadoop/hive/ql/io/TestHiveBinarySearchRecordReader.java
index 85dd975..1047a8d 100644
--- ql/src/test/org/apache/hadoop/hive/ql/io/TestHiveBinarySearchRecordReader.java
+++ ql/src/test/org/apache/hadoop/hive/ql/io/TestHiveBinarySearchRecordReader.java
@@ -24,13 +24,19 @@
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.util.LinkedHashMap;
import junit.framework.Assert;
import junit.framework.TestCase;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.HiveInputFormat.HiveInputSplit;
+import org.apache.hadoop.hive.ql.plan.MapredWork;
+import org.apache.hadoop.hive.ql.plan.PartitionDesc;
+import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;
@@ -127,6 +133,15 @@ private void init() throws IOException {
when(rcfReader.getPos()).thenReturn(50L);
conf = new JobConf();
conf.setBoolean("hive.input.format.sorted", true);
+
+ TableDesc tblDesc = Utilities.defaultTd;
+ PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
+ LinkedHashMap pt = new LinkedHashMap();
+ pt.put("/tmp/testfolder", partDesc);
+ MapredWork mrwork = new MapredWork();
+ mrwork.getMapWork().setPathToPartitionInfo(pt);
+ Utilities.setMapRedWork(conf, mrwork,"/tmp/" + System.getProperty("user.name") + "/hive");
+
hiveSplit = new TestHiveInputSplit();
hbsReader = new TestHiveRecordReader(rcfReader, conf);
hbsReader.initIOContext(hiveSplit, conf, Class.class, rcfReader);
diff --git ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java
index 0686d9b..94061af 100644
--- ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java
+++ ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java
@@ -21,6 +21,7 @@
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
+import java.util.LinkedHashMap;
import java.util.List;
import junit.framework.TestCase;
@@ -38,6 +39,9 @@
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.mr.ExecDriver;
import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
+import org.apache.hadoop.hive.ql.plan.MapredWork;
+import org.apache.hadoop.hive.ql.plan.PartitionDesc;
+import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
@@ -70,6 +74,15 @@
protected void setUp() throws IOException {
conf = new Configuration();
job = new JobConf(conf);
+
+ TableDesc tblDesc = Utilities.defaultTd;
+ PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
+ LinkedHashMap pt = new LinkedHashMap();
+ pt.put("/tmp/testfolder", partDesc);
+ MapredWork mrwork = new MapredWork();
+ mrwork.getMapWork().setPathToPartitionInfo(pt);
+ Utilities.setMapRedWork(job, mrwork,"/tmp/" + System.getProperty("user.name") + "/hive");
+
fileSystem = FileSystem.getLocal(conf);
testDir = new Path(System.getProperty("test.tmp.dir", System.getProperty(
"user.dir", new File(".").getAbsolutePath()))
diff --git ql/src/test/queries/clientnegative/file_with_header_footer_negative.q ql/src/test/queries/clientnegative/file_with_header_footer_negative.q
new file mode 100644
index 0000000..44f85de
--- /dev/null
+++ ql/src/test/queries/clientnegative/file_with_header_footer_negative.q
@@ -0,0 +1,13 @@
+dfs -mkdir hdfs:///tmp/test/;
+
+dfs -copyFromLocal ../data/files/header_footer_table_1 hdfs:///tmp/test/header_footer_table_1;
+
+dfs -copyFromLocal ../data/files/header_footer_table_2 hdfs:///tmp/test/header_footer_table_2;
+
+CREATE EXTERNAL TABLE header_footer_table_1 (name string, message string, id int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION 'hdfs:///tmp/test/header_footer_table_1' tblproperties ("skip.header.line.count"="1", "skip.footer.line.count"="200");
+
+SELECT * FROM header_footer_table_1;
+
+DROP TABLE header_footer_table_1;
+
+dfs -rmr hdfs:///tmp/test;
\ No newline at end of file
diff --git ql/src/test/queries/clientpositive/file_with_header_footer.q ql/src/test/queries/clientpositive/file_with_header_footer.q
new file mode 100644
index 0000000..0857550
--- /dev/null
+++ ql/src/test/queries/clientpositive/file_with_header_footer.q
@@ -0,0 +1,29 @@
+dfs -mkdir hdfs:///tmp/test/;
+
+dfs -copyFromLocal ../data/files/header_footer_table_1 hdfs:///tmp/test/header_footer_table_1;
+
+dfs -copyFromLocal ../data/files/header_footer_table_2 hdfs:///tmp/test/header_footer_table_2;
+
+CREATE EXTERNAL TABLE header_footer_table_1 (name string, message string, id int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION 'hdfs:///tmp/test/header_footer_table_1' tblproperties ("skip.header.line.count"="1", "skip.footer.line.count"="2");
+
+SELECT * FROM header_footer_table_1;
+
+SELECT * FROM header_footer_table_1 WHERE id < 50;
+
+CREATE EXTERNAL TABLE header_footer_table_2 (name string, message string, id int) PARTITIONED BY (year int, month int, day int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' tblproperties ("skip.header.line.count"="1", "skip.footer.line.count"="2");
+
+ALTER TABLE header_footer_table_2 ADD PARTITION (year=2012, month=1, day=1) location 'hdfs:///tmp/test/header_footer_table_2/2012/01/01';
+
+ALTER TABLE header_footer_table_2 ADD PARTITION (year=2012, month=1, day=2) location 'hdfs:///tmp/test/header_footer_table_2/2012/01/02';
+
+ALTER TABLE header_footer_table_2 ADD PARTITION (year=2012, month=1, day=3) location 'hdfs:///tmp/test/header_footer_table_2/2012/01/03';
+
+SELECT * FROM header_footer_table_2;
+
+SELECT * FROM header_footer_table_2 WHERE id < 50;
+
+DROP TABLE header_footer_table_1;
+
+DROP TABLE header_footer_table_2;
+
+dfs -rmr hdfs:///tmp/test;
\ No newline at end of file
diff --git ql/src/test/results/clientnegative/file_with_header_footer_negative.q.out ql/src/test/results/clientnegative/file_with_header_footer_negative.q.out
new file mode 100644
index 0000000..0ae8f43
--- /dev/null
+++ ql/src/test/results/clientnegative/file_with_header_footer_negative.q.out
@@ -0,0 +1,14 @@
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@header_footer_table_1
+PREHOOK: query: SELECT * FROM header_footer_table_1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@header_footer_table_1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM header_footer_table_1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@header_footer_table_1
+#### A masked pattern was here ####
+Failed with exception java.io.IOException:java.io.IOException: footer number exceeds the limit defined in hive.file.max.footer
\ No newline at end of file
diff --git ql/src/test/results/clientpositive/file_with_header_footer.q.out ql/src/test/results/clientpositive/file_with_header_footer.q.out
new file mode 100644
index 0000000..ea31c3b
--- /dev/null
+++ ql/src/test/results/clientpositive/file_with_header_footer.q.out
@@ -0,0 +1,136 @@
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@header_footer_table_1
+PREHOOK: query: SELECT * FROM header_footer_table_1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@header_footer_table_1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM header_footer_table_1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@header_footer_table_1
+#### A masked pattern was here ####
+steven hive 1
+dave oozie 2
+xifa phd 3
+chuan hadoop 4
+shanyu senior 5
+steven2 hive 11
+dave2 oozie 12
+xifa2 phd 13
+chuan2 hadoop 14
+shanyu2 senior 15
+david3 oozie 22
+PREHOOK: query: SELECT * FROM header_footer_table_1 WHERE id < 50
+PREHOOK: type: QUERY
+PREHOOK: Input: default@header_footer_table_1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM header_footer_table_1 WHERE id < 50
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@header_footer_table_1
+#### A masked pattern was here ####
+steven hive 1
+dave oozie 2
+xifa phd 3
+chuan hadoop 4
+shanyu senior 5
+steven2 hive 11
+dave2 oozie 12
+xifa2 phd 13
+chuan2 hadoop 14
+shanyu2 senior 15
+david3 oozie 22
+PREHOOK: query: CREATE EXTERNAL TABLE header_footer_table_2 (name string, message string, id int) PARTITIONED BY (year int, month int, day int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' tblproperties ("skip.header.line.count"="1", "skip.footer.line.count"="2")
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE EXTERNAL TABLE header_footer_table_2 (name string, message string, id int) PARTITIONED BY (year int, month int, day int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' tblproperties ("skip.header.line.count"="1", "skip.footer.line.count"="2")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@header_footer_table_2
+#### A masked pattern was here ####
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Input: default@header_footer_table_2
+#### A masked pattern was here ####
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Input: default@header_footer_table_2
+POSTHOOK: Output: default@header_footer_table_2@year=2012/month=1/day=1
+#### A masked pattern was here ####
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Input: default@header_footer_table_2
+#### A masked pattern was here ####
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Input: default@header_footer_table_2
+POSTHOOK: Output: default@header_footer_table_2@year=2012/month=1/day=2
+#### A masked pattern was here ####
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Input: default@header_footer_table_2
+#### A masked pattern was here ####
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Input: default@header_footer_table_2
+POSTHOOK: Output: default@header_footer_table_2@year=2012/month=1/day=3
+PREHOOK: query: SELECT * FROM header_footer_table_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@header_footer_table_2
+PREHOOK: Input: default@header_footer_table_2@year=2012/month=1/day=1
+PREHOOK: Input: default@header_footer_table_2@year=2012/month=1/day=2
+PREHOOK: Input: default@header_footer_table_2@year=2012/month=1/day=3
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM header_footer_table_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@header_footer_table_2
+POSTHOOK: Input: default@header_footer_table_2@year=2012/month=1/day=1
+POSTHOOK: Input: default@header_footer_table_2@year=2012/month=1/day=2
+POSTHOOK: Input: default@header_footer_table_2@year=2012/month=1/day=3
+#### A masked pattern was here ####
+steven hive 1 2012 1 1
+dave oozie 2 2012 1 1
+xifa phd 3 2012 1 1
+chuan hadoop 4 2012 1 1
+shanyu senior 5 2012 1 1
+steven2 hive 11 2012 1 2
+dave2 oozie 12 2012 1 2
+xifa2 phd 13 2012 1 2
+chuan2 hadoop 14 2012 1 2
+shanyu2 senior 15 2012 1 2
+david3 oozie 22 2012 1 3
+PREHOOK: query: SELECT * FROM header_footer_table_2 WHERE id < 50
+PREHOOK: type: QUERY
+PREHOOK: Input: default@header_footer_table_2
+PREHOOK: Input: default@header_footer_table_2@year=2012/month=1/day=1
+PREHOOK: Input: default@header_footer_table_2@year=2012/month=1/day=2
+PREHOOK: Input: default@header_footer_table_2@year=2012/month=1/day=3
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM header_footer_table_2 WHERE id < 50
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@header_footer_table_2
+POSTHOOK: Input: default@header_footer_table_2@year=2012/month=1/day=1
+POSTHOOK: Input: default@header_footer_table_2@year=2012/month=1/day=2
+POSTHOOK: Input: default@header_footer_table_2@year=2012/month=1/day=3
+#### A masked pattern was here ####
+steven hive 1 2012 1 1
+dave oozie 2 2012 1 1
+xifa phd 3 2012 1 1
+chuan hadoop 4 2012 1 1
+shanyu senior 5 2012 1 1
+steven2 hive 11 2012 1 2
+dave2 oozie 12 2012 1 2
+xifa2 phd 13 2012 1 2
+chuan2 hadoop 14 2012 1 2
+shanyu2 senior 15 2012 1 2
+david3 oozie 22 2012 1 3
+PREHOOK: query: DROP TABLE header_footer_table_1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@header_footer_table_1
+PREHOOK: Output: default@header_footer_table_1
+POSTHOOK: query: DROP TABLE header_footer_table_1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@header_footer_table_1
+POSTHOOK: Output: default@header_footer_table_1
+PREHOOK: query: DROP TABLE header_footer_table_2
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@header_footer_table_2
+PREHOOK: Output: default@header_footer_table_2
+POSTHOOK: query: DROP TABLE header_footer_table_2
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@header_footer_table_2
+POSTHOOK: Output: default@header_footer_table_2
+#### A masked pattern was here ####
diff --git serde/if/serde.thrift serde/if/serde.thrift
index 2ceb572..31c87ee 100644
--- serde/if/serde.thrift
+++ serde/if/serde.thrift
@@ -37,6 +37,8 @@ const string LINE_DELIM = "line.delim"
const string MAPKEY_DELIM = "mapkey.delim"
const string QUOTE_CHAR = "quote.delim"
const string ESCAPE_CHAR = "escape.delim"
+const string HEADER_COUNT = "skip.header.line.count" // table property key: number of leading header lines to skip in each data file
+const string FOOTER_COUNT = "skip.footer.line.count" // table property key: number of trailing footer lines to skip in each data file
typedef string PrimitiveType
typedef string CollectionType
diff --git serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java
index 22a6168..515cf25 100644
--- serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java
+++ serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java
@@ -61,6 +61,10 @@
public static final String ESCAPE_CHAR = "escape.delim";
+ public static final String HEADER_COUNT = "skip.header.line.count";
+
+ public static final String FOOTER_COUNT = "skip.footer.line.count";
+
public static final String VOID_TYPE_NAME = "void";
public static final String BOOLEAN_TYPE_NAME = "boolean";