diff --git itests/qtest/pom.xml itests/qtest/pom.xml
index df9e326..0b61c9e 100644
--- itests/qtest/pom.xml
+++ itests/qtest/pom.xml
@@ -36,7 +36,7 @@
false
false
- stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q,auto_sortmerge_join_16.q,quotedid_smb.q,file_with_header_footer.q,external_table_with_space_in_location_path.q,root_dir_external_table.q,index_bitmap3.q,ql_rewrite_gbtoidx.q,index_bitmap_auto.q
+ orc_split_elimination.q,stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q,auto_sortmerge_join_16.q,quotedid_smb.q,file_with_header_footer.q,external_table_with_space_in_location_path.q,root_dir_external_table.q,index_bitmap3.q,ql_rewrite_gbtoidx.q,index_bitmap_auto.q
cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q,file_with_header_footer_negative.q
tez_join_tests.q,tez_joins_explain.q,mrr.q,tez_dml.q,tez_insert_overwrite_local_directory_1.q
join0.q,join1.q,auto_join0.q,auto_join1.q,bucket2.q,bucket3.q,bucket4.q,count.q,create_merge_compressed.q,cross_join.q,ctas.q,custom_input_output_format.q,disable_merge_for_bucketing.q,enforce_order.q,filter_join_breaktask.q,filter_join_breaktask2.q,groupby1.q,groupby2.q,groupby3.q,having.q,insert1.q,insert_into1.q,insert_into2.q,leftsemijoin.q,limit_pushdown.q,load_dyn_part1.q,load_dyn_part2.q,load_dyn_part3.q,mapjoin_mapjoin.q,mapreduce1.q,mapreduce2.q,merge1.q,merge2.q,metadata_only_queries.q,sample1.q,subquery_in.q,subquery_exists.q,vectorization_15.q,ptf.q
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index a5747a6..852fff9 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -20,6 +20,7 @@
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
@@ -69,6 +70,7 @@
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
+import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
/**
* A MapReduce/Hive input format for ORC files.
@@ -532,7 +534,7 @@ private FileInfo verifyCachedFileInfo(FileStatus file) {
private final long blockSize;
private final BlockLocation[] locations;
private final FileInfo fileInfo;
- private Iterable stripes;
+ private List stripes;
private FileMetaInfo fileMetaInfo;
private Metadata metadata;
private List types;
@@ -660,18 +662,22 @@ public void run() {
SearchArgument sarg = createSarg(types, conf);
List stripeStats = null;
int[] filterColumns = null;
+ boolean[] selectedStripes = new boolean[stripes.size()];
+ boolean doSplitElim = false;
+ Arrays.fill(selectedStripes, true);
if (sarg != null) {
List sargLeaves = null;
String[] allColumns = conf.get(serdeConstants.LIST_COLUMNS).split(",");
- String[] neededColumns = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(",");
+ String[] neededColumns = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
+ .split(",");
sargLeaves = sarg.getLeaves();
filterColumns = new int[sargLeaves.size()];
for (int i = 0; i < filterColumns.length; ++i) {
String colName = sargLeaves.get(i).getColumnName();
// if needed columns does not contain the column specified in filter expression then
- // it must be partition column. There will not be columns within ORC file for partitioned
- // column, so we can ignore them
+ // it must be a partition column. There will not be columns within the ORC file for
+ // partitioned columns, so we can ignore them
if (containsColumn(neededColumns, colName)) {
filterColumns[i] = RecordReaderImpl.findColumns(allColumns, colName);
} else {
@@ -679,52 +685,143 @@ public void run() {
}
}
+ doSplitElim = true;
stripeStats = metadata.getStripeStatistics();
+
+ if (stripeStats == null) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Stripe stats is null. Disabling split elimination.");
+ }
+
+ doSplitElim = false;
+ } else {
+ if (stripeStats.isEmpty()) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Stripe stats is empty. Disabling split elimination.");
+ }
+
+ doSplitElim = false;
+ }
+
+ if (stripeStats.size() != stripes.size()) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Stripe stats count does not match stripes count. Disabling split elimination.");
+ }
+
+ doSplitElim = false;
+ }
+ }
+
+ if (doSplitElim) {
+ for (int i = 0; i < stripes.size(); i++) {
+ if (sarg != null
+ && !isStripeSatisfyPredicate(stripeStats.get(i), sarg, filterColumns)) {
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Eliminating ORC stripe-" + (i + 1) + " of file '" + file.getPath()
+ + "' as it did not satisfy predicate condition.");
+ }
+ selectedStripes[i] = false;
+ }
+ }
+ }
}
long currentOffset = -1;
long currentLength = 0;
- int idx = -1;
- for(StripeInformation stripe: stripes) {
- idx++;
+ StripeInformation currStripe = null;
+ StripeInformation nextStripe = null;
+ int i = 0;
- // eliminate stripes that doesn't satisfy the predicate condition
- if (sarg != null && !isStripeSatisfyPredicate(stripeStats.get(idx), sarg, filterColumns)) {
+ // iterate up to (but not including) the last stripe; edge cases handled below
+ for (i = 0; i < stripes.size() - 1; i++) {
+ currStripe = stripes.get(i);
+ nextStripe = stripes.get(i + 1);
- // if a stripe doesn't satisfy predicate condition then skip it
- if (LOG.isDebugEnabled()) {
- LOG.debug("Eliminating ORC stripe-" + idx + " of file '" + file.getPath()
- + "' as it did not satisfy predicate condition.");
+ if (selectedStripes[i]) {
+
+ // if we aren't building a split, start a new one.
+ if (currentOffset == -1) {
+ currentOffset = currStripe.getOffset();
+ currentLength = nextStripe.getOffset() - currentOffset;
+ } else {
+ currentLength = nextStripe.getOffset() - currentOffset;
}
- // create split for the previous unfinished stripe
+ // if we are working on a stripe, over the min stripe size, and
+ // crossed a block boundary, cut the input split here.
+ if (currentOffset != -1 && currentLength > context.minSize
+ && (currentOffset / blockSize != nextStripe.getOffset() / blockSize)) {
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("case 1) creating splits (reached block size: " + blockSize
+ + "). currentOffset: " + currentOffset + " currentLength: " + currentLength);
+ }
+ createSplit(currentOffset, currentLength, fileMetaInfo);
+ currentOffset = -1;
+ continue;
+ }
+
+ // if we are crossing max split size, create a new split
+ if (currentLength >= context.maxSize) {
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("case 2) creating splits (reached max split size: " + context.maxSize
+ + "). currentOffset: " + currentOffset + " currentLength: " + currentLength);
+ }
+ createSplit(currentOffset, currentLength, fileMetaInfo);
+ currentOffset = -1;
+ continue;
+ }
+ } else {
+
+ // Stripe pruned as it did not satisfy predicate condition
+
+ // create split for the previous unfinished stripe(s)
if (currentOffset != -1) {
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("case 3) creating splits (PPD pruning + unfinished stripe(s)). currentOffset: "
+ + currentOffset + " currentLength: " + currentLength);
+ }
createSplit(currentOffset, currentLength, fileMetaInfo);
currentOffset = -1;
}
continue;
}
+ }
+
+ // edge cases: the final stripe was not processed by the loop above
- // if we are working on a stripe, over the min stripe size, and
- // crossed a block boundary, cut the input split here.
- if (currentOffset != -1 && currentLength > context.minSize &&
- (currentOffset / blockSize != stripe.getOffset() / blockSize)) {
- createSplit(currentOffset, currentLength, fileMetaInfo);
- currentOffset = -1;
+ // only last stripe is left and is selected
+ if (currentOffset == -1 && selectedStripes[i]) {
+ currentOffset = stripes.get(i).getOffset();
+ currentLength = stripes.get(i).getLength();
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("case 4) creating splits (only last stripe is left). currentOffset: "
+ + currentOffset + " currentLength: " + currentLength);
}
- // if we aren't building a split, start a new one.
- if (currentOffset == -1) {
- currentOffset = stripe.getOffset();
- currentLength = stripe.getLength();
- } else {
- currentLength += stripe.getLength();
+ createSplit(currentOffset, currentLength, fileMetaInfo);
+ } else if (currentOffset != -1 && selectedStripes[i]) {
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("case 5) creating splits (last few stripes left + last stripe selected). currentOffset: "
+ + currentOffset + " currentLength: " + currentLength);
}
- if (currentLength >= context.maxSize) {
- createSplit(currentOffset, currentLength, fileMetaInfo);
- currentOffset = -1;
+
+ // last few stripes are left and last stripe is also selected
+ currentLength = (stripes.get(i).getOffset() - currentOffset) + stripes.get(i).getLength();
+ createSplit(currentOffset, currentLength, fileMetaInfo);
+ } else if (currentOffset != -1 && !selectedStripes[i]) {
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("case 6) creating splits (last few stripes left + last stripe not selected). currentOffset: "
+ + currentOffset + " currentLength: " + currentLength);
}
- }
- if (currentOffset != -1) {
+
+ // last few stripes are left and last stripe is not selected
+ currentLength = (stripes.get(i).getOffset() - currentOffset);
createSplit(currentOffset, currentLength, fileMetaInfo);
}
} catch (Throwable th) {
@@ -748,7 +845,7 @@ private void populateAndCacheStripeDetails() {
boolean found = false;
if (fileInfo != null) {
found = true;
- stripes = fileInfo.stripeInfos;
+ stripes = Lists.newArrayList(fileInfo.stripeInfos);
fileMetaInfo = fileInfo.fileMetaInfo;
metadata = fileInfo.metadata;
types = fileInfo.types;
@@ -762,7 +859,7 @@ private void populateAndCacheStripeDetails() {
}
if (!found) {
orcReader = OrcFile.createReader(fs, file.getPath());
- stripes = orcReader.getStripes();
+ stripes = Lists.newArrayList(orcReader.getStripes());
metadata = orcReader.getMetadata();
types = orcReader.getTypes();
fileMetaInfo = context.footerInSplits ? orcReader.getFileMetaInfo() : null;