diff --git itests/qtest/pom.xml itests/qtest/pom.xml index 29988a6..e58b7c7 100644 --- itests/qtest/pom.xml +++ itests/qtest/pom.xml @@ -36,7 +36,7 @@ false false - stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q,auto_sortmerge_join_16.q,quotedid_smb.q,file_with_header_footer.q,external_table_with_space_in_location_path.q,root_dir_external_table.q + orc_split_elimination.q,stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q,auto_sortmerge_join_16.q,quotedid_smb.q,file_with_header_footer.q,external_table_with_space_in_location_path.q,root_dir_external_table.q 
cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q,file_with_header_footer_negative.q tez_join_tests.q,tez_joins_explain.q,mrr.q,tez_dml.q,tez_insert_overwrite_local_directory_1.q join0.q,join1.q,auto_join0.q,auto_join1.q,bucket2.q,bucket3.q,bucket4.q,count.q,create_merge_compressed.q,cross_join.q,ctas.q,custom_input_output_format.q,disable_merge_for_bucketing.q,enforce_order.q,filter_join_breaktask.q,filter_join_breaktask2.q,groupby1.q,groupby2.q,groupby3.q,having.q,insert1.q,insert_into1.q,insert_into2.q,leftsemijoin.q,limit_pushdown.q,load_dyn_part1.q,load_dyn_part2.q,load_dyn_part3.q,mapjoin_mapjoin.q,mapreduce1.q,mapreduce2.q,merge1.q,merge2.q,metadata_only_queries.q,sample1.q,subquery_in.q,subquery_exists.q,vectorization_15.q,ptf.q diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index a5747a6..0ebe41d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -69,6 +70,7 @@ import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; +import com.google.common.collect.Lists; import com.google.common.util.concurrent.ThreadFactoryBuilder; /** * A MapReduce/Hive input format for ORC files. 
@@ -532,7 +534,7 @@ private FileInfo verifyCachedFileInfo(FileStatus file) { private final long blockSize; private final BlockLocation[] locations; private final FileInfo fileInfo; - private Iterable stripes; + private List stripes; private FileMetaInfo fileMetaInfo; private Metadata metadata; private List types; @@ -660,17 +662,21 @@ public void run() { SearchArgument sarg = createSarg(types, conf); List stripeStats = null; int[] filterColumns = null; + boolean[] selectedStripes = new boolean[stripes.size()]; + Arrays.fill(selectedStripes, true); if (sarg != null) { List sargLeaves = null; String[] allColumns = conf.get(serdeConstants.LIST_COLUMNS).split(","); - String[] neededColumns = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(","); + String[] neededColumns = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + .split(","); sargLeaves = sarg.getLeaves(); filterColumns = new int[sargLeaves.size()]; for (int i = 0; i < filterColumns.length; ++i) { String colName = sargLeaves.get(i).getColumnName(); // if needed columns does not contain the column specified in filter expression then - // it must be partition column. There will not be columns within ORC file for partitioned + // it must be partition column. There will not be columns within ORC file for + partitioned + // column, so we can ignore them if (containsColumn(neededColumns, colName)) { filterColumns[i] = RecordReaderImpl.findColumns(allColumns, colName); @@ -680,51 +686,114 @@ public void run() { } stripeStats = metadata.getStripeStatistics(); + + for (int i = 0; i < stripes.size(); i++) { + if (sarg != null && !isStripeSatisfyPredicate(stripeStats.get(i), sarg, filterColumns)) { + + if (LOG.isDebugEnabled()) { + LOG.debug("Eliminating ORC stripe-" + (i + 1) + " of file '" + file.getPath() + + "' as it did not satisfy predicate condition."); + } + selectedStripes[i] = false; + } + } } long currentOffset = -1; long currentLength = 0; - int idx = -1; - for(StripeInformation stripe: stripes) { - idx++; + StripeInformation currStripe = null; + StripeInformation nextStripe = null; + int i = 0; - // eliminate stripes that doesn't satisfy the predicate condition - if (sarg != null && !isStripeSatisfyPredicate(stripeStats.get(idx), sarg, filterColumns)) { + // iterate till last but one stripe, edge cases handled below + for (i = 0; i < stripes.size() - 1; i++) { + currStripe = stripes.get(i); + nextStripe = stripes.get(i + 1); - // if a stripe doesn't satisfy predicate condition then skip it - if (LOG.isDebugEnabled()) { - LOG.debug("Eliminating ORC stripe-" + idx + " of file '" + file.getPath() - + "' as it did not satisfy predicate condition."); + if (selectedStripes[i]) { + + // if we aren't building a split, start a new one. + if (currentOffset == -1) { + currentOffset = currStripe.getOffset(); + currentLength = nextStripe.getOffset() - currentOffset; + } else { + currentLength = nextStripe.getOffset() - currentOffset; } - // create split for the previous unfinished stripe + // if we are working on a stripe, over the min stripe size, and + // crossed a block boundary, cut the input split here. 
+ if (currentOffset != -1 && currentLength > context.minSize + && (currentOffset / blockSize != nextStripe.getOffset() / blockSize)) { + + if (LOG.isDebugEnabled()) { + LOG.debug("case 1) creating splits (reached block size: " + blockSize + + "). currentOffset: " + currentOffset + " currentLength: " + currentLength); + } + createSplit(currentOffset, currentLength, fileMetaInfo); + currentOffset = -1; + continue; + } + + // if we are crossing max split size, create a new split + if (currentLength >= context.maxSize) { + + if (LOG.isDebugEnabled()) { + LOG.debug("case 2) creating splits (reached max split size: " + context.maxSize + + "). currentOffset: " + currentOffset + " currentLength: " + currentLength); + } + createSplit(currentOffset, currentLength, fileMetaInfo); + currentOffset = -1; + continue; + } + } else { + + // Stripe pruned as it did not satisfy predicate condition + + // create split for the previous unfinished stripe(s) if (currentOffset != -1) { + + if (LOG.isDebugEnabled()) { + LOG.debug("case 3) creating splits (PPD pruning + unfinished stripe(s)). currentOffset: " + + currentOffset + " currentLength: " + currentLength); + } createSplit(currentOffset, currentLength, fileMetaInfo); currentOffset = -1; } continue; } + } - // if we are working on a stripe, over the min stripe size, and - // crossed a block boundary, cut the input split here. - if (currentOffset != -1 && currentLength > context.minSize && - (currentOffset / blockSize != stripe.getOffset() / blockSize)) { - createSplit(currentOffset, currentLength, fileMetaInfo); - currentOffset = -1; + // edge cases + + // only last stripe is left and is selected + if (currentOffset == -1 && selectedStripes[i]) { + currentOffset = stripes.get(i).getOffset(); + currentLength = stripes.get(i).getLength(); + + if (LOG.isDebugEnabled()) { + LOG.debug("case 4) creating splits (only last stripe is left). currentOffset: " + + currentOffset + " currentLength: " + currentLength); } - // if we aren't building a split, start a new one. - if (currentOffset == -1) { - currentOffset = stripe.getOffset(); - currentLength = stripe.getLength(); - } else { - currentLength += stripe.getLength(); + createSplit(currentOffset, currentLength, fileMetaInfo); + } else if (currentOffset != -1 && selectedStripes[i]) { + + if (LOG.isDebugEnabled()) { + LOG.debug("case 5) creating splits (last few stripes left + last stripe selected). currentOffset: " + + currentOffset + " currentLength: " + currentLength); } - if (currentLength >= context.maxSize) { - createSplit(currentOffset, currentLength, fileMetaInfo); - currentOffset = -1; + + // last few stripes are left and last stripe is also selected + currentLength = (stripes.get(i).getOffset() - currentOffset) + stripes.get(i).getLength(); + createSplit(currentOffset, currentLength, fileMetaInfo); + } else if (currentOffset != -1 && !selectedStripes[i]) { + + if (LOG.isDebugEnabled()) { + LOG.debug("case 6) creating splits (last few stripes left + last stripe not selected). currentOffset: " + + currentOffset + " currentLength: " + currentLength); } - } - if (currentOffset != -1) { + + // last few stripes are left and last stripe is not selected + currentLength = (stripes.get(i).getOffset() - currentOffset); createSplit(currentOffset, currentLength, fileMetaInfo); } } catch (Throwable th) { @@ -748,7 +817,7 @@ private void populateAndCacheStripeDetails() { boolean found = false; if (fileInfo != null) { found = true; - stripes = fileInfo.stripeInfos; + stripes = Lists.newArrayList(fileInfo.stripeInfos); fileMetaInfo = fileInfo.fileMetaInfo; metadata = fileInfo.metadata; types = fileInfo.types; @@ -762,7 +831,7 @@ private void populateAndCacheStripeDetails() { } if (!found) { orcReader = OrcFile.createReader(fs, file.getPath()); - stripes = orcReader.getStripes(); + stripes = Lists.newArrayList(orcReader.getStripes()); metadata = orcReader.getMetadata(); types = orcReader.getTypes(); fileMetaInfo = context.footerInSplits ? orcReader.getFileMetaInfo() : null;