diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 75250e5..5dd6c39 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -18,7 +18,10 @@
 
 package org.apache.hadoop.hive.ql.io.orc;
 
-import com.google.common.collect.Lists;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -28,12 +31,9 @@
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.io.InputFormatChecker;
-import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
 import org.apache.hadoop.hive.ql.plan.TableScanDesc;
 import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
-import org.apache.hadoop.hive.serde2.SerDeException;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.FileSplit;
@@ -42,10 +42,6 @@
 import org.apache.hadoop.mapred.RecordReader;
 import org.apache.hadoop.mapred.Reporter;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
 /**
  * A MapReduce/Hive input format for ORC files.
  */
@@ -80,22 +76,22 @@
         conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "null"));
       LOG.info("included columns names = " +
         conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "null"));
-      boolean[] includeColumn = findIncludedColumns(types, conf);
+      boolean[] includedColumns = findIncludedColumns(types, conf);
       if (serializedPushdown != null && columnNamesString != null) {
         sarg = SearchArgument.FACTORY.create
             (Utilities.deserializeExpression(serializedPushdown, conf));
         LOG.info("ORC pushdown predicate: " + sarg);
         String[] neededColumnNames = columnNamesString.split(",");
         int i = 0;
-        for(int columnId: types.get(0).getSubtypesList()) {
-          if (includeColumn[columnId]) {
+        for (int columnId : types.get(0).getSubtypesList()) {
+          if (includedColumns == null || includedColumns[columnId]) {
             columnNames[columnId] = neededColumnNames[i++];
           }
         }
       } else {
         LOG.info("No ORC pushdown predicate");
       }
-      this.reader = file.rows(offset, length,includeColumn, sarg, columnNames);
+      this.reader = file.rows(offset, length, includedColumns, sarg, columnNames);
       this.offset = offset;
       this.length = length;
     }
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
index 444bfa6..a94e901 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
@@ -18,7 +18,14 @@
 
 package org.apache.hadoop.hive.ql.io.orc;
 
-import com.google.protobuf.CodedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.FSDataInputStream;
@@ -28,12 +35,7 @@
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.io.Text;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
+import com.google.protobuf.CodedInputStream;
 
 final class ReaderImpl implements Reader {
@@ -332,6 +334,13 @@ public RecordReader rows(long offset, long length, boolean[] include
   public RecordReader rows(long offset, long length, boolean[] include,
                            SearchArgument sarg, String[] columnNames
                            ) throws IOException {
+
+    // a null include array means that every column should be read
+    if (include == null && columnNames != null) {
+      include = new boolean[columnNames.length];
+      Arrays.fill(include, true);
+    }
+
     return new RecordReaderImpl(this.getStripes(), fileSystem, path, offset,
         length, footer.getTypesList(), codec, bufferSize, include,
         footer.getRowIndexStride(), sarg, columnNames);
diff --git ql/src/test/queries/clientpositive/orc_predicate_pushdown.q ql/src/test/queries/clientpositive/orc_predicate_pushdown.q
index df89802..a55448b 100644
--- ql/src/test/queries/clientpositive/orc_predicate_pushdown.q
+++ ql/src/test/queries/clientpositive/orc_predicate_pushdown.q
@@ -49,6 +49,10 @@ SET hive.optimize.index.filter=false;
 -- hive.optimize.index.filter is set to true. the explain plan should show filter expression
 -- in table scan operator.
 
+SELECT * FROM orc_pred WHERE t<100 limit 1;
+SET hive.optimize.index.filter=true;
+SELECT * FROM orc_pred WHERE t<100 limit 1;
+
 SELECT SUM(HASH(t)) FROM orc_pred
   WHERE t IS NOT NULL
   AND t < 0
diff --git ql/src/test/results/clientpositive/orc_predicate_pushdown.q.out ql/src/test/results/clientpositive/orc_predicate_pushdown.q.out
index ffd577f..8f4bcb2 100644
--- ql/src/test/results/clientpositive/orc_predicate_pushdown.q.out
+++ ql/src/test/results/clientpositive/orc_predicate_pushdown.q.out
@@ -275,10 +275,7 @@ PREHOOK: query: -- all the following queries have predicates which are pushed do
 -- hive.optimize.index.filter is set to true. the explain plan should show filter expression
 -- in table scan operator.
 
-SELECT SUM(HASH(t)) FROM orc_pred
-  WHERE t IS NOT NULL
-  AND t < 0
-  AND t > -2
+SELECT * FROM orc_pred WHERE t<100 limit 1
 PREHOOK: type: QUERY
 PREHOOK: Input: default@orc_pred
 #### A masked pattern was here ####
@@ -286,7 +283,50 @@ POSTHOOK: query: -- all the following queries have predicates which are pushed d
 -- hive.optimize.index.filter is set to true. the explain plan should show filter expression
 -- in table scan operator.
 
-SELECT SUM(HASH(t)) FROM orc_pred
+SELECT * FROM orc_pred WHERE t<100 limit 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_pred
+#### A masked pattern was here ####
+POSTHOOK: Lineage: orc_pred.b SIMPLE [(staging)staging.FieldSchema(name:b, type:bigint, comment:null), ]
+POSTHOOK: Lineage: orc_pred.bin SIMPLE [(staging)staging.FieldSchema(name:bin, type:binary, comment:null), ]
+POSTHOOK: Lineage: orc_pred.bo SIMPLE [(staging)staging.FieldSchema(name:bo, type:boolean, comment:null), ]
+POSTHOOK: Lineage: orc_pred.d SIMPLE [(staging)staging.FieldSchema(name:d, type:double, comment:null), ]
+POSTHOOK: Lineage: orc_pred.dec SIMPLE [(staging)staging.FieldSchema(name:dec, type:decimal, comment:null), ]
+POSTHOOK: Lineage: orc_pred.f SIMPLE [(staging)staging.FieldSchema(name:f, type:float, comment:null), ]
+POSTHOOK: Lineage: orc_pred.i SIMPLE [(staging)staging.FieldSchema(name:i, type:int, comment:null), ]
+POSTHOOK: Lineage: orc_pred.s SIMPLE [(staging)staging.FieldSchema(name:s, type:string, comment:null), ]
+POSTHOOK: Lineage: orc_pred.si SIMPLE [(staging)staging.FieldSchema(name:si, type:smallint, comment:null), ]
+POSTHOOK: Lineage: orc_pred.t SIMPLE [(staging)staging.FieldSchema(name:t, type:tinyint, comment:null), ]
+POSTHOOK: Lineage: orc_pred.ts SIMPLE [(staging)staging.FieldSchema(name:ts, type:timestamp, comment:null), ]
+19 442 65553 4294967380 26.43 37.77 true alice zipper 2013-03-01 09:11:58.703217 29.62 history
+PREHOOK: query: SELECT * FROM orc_pred WHERE t<100 limit 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_pred
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM orc_pred WHERE t<100 limit 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_pred
+#### A masked pattern was here ####
+POSTHOOK: Lineage: orc_pred.b SIMPLE [(staging)staging.FieldSchema(name:b, type:bigint, comment:null), ]
+POSTHOOK: Lineage: orc_pred.bin SIMPLE [(staging)staging.FieldSchema(name:bin, type:binary, comment:null), ]
+POSTHOOK: Lineage: orc_pred.bo SIMPLE [(staging)staging.FieldSchema(name:bo, type:boolean, comment:null), ]
+POSTHOOK: Lineage: orc_pred.d SIMPLE [(staging)staging.FieldSchema(name:d, type:double, comment:null), ]
+POSTHOOK: Lineage: orc_pred.dec SIMPLE [(staging)staging.FieldSchema(name:dec, type:decimal, comment:null), ]
+POSTHOOK: Lineage: orc_pred.f SIMPLE [(staging)staging.FieldSchema(name:f, type:float, comment:null), ]
+POSTHOOK: Lineage: orc_pred.i SIMPLE [(staging)staging.FieldSchema(name:i, type:int, comment:null), ]
+POSTHOOK: Lineage: orc_pred.s SIMPLE [(staging)staging.FieldSchema(name:s, type:string, comment:null), ]
+POSTHOOK: Lineage: orc_pred.si SIMPLE [(staging)staging.FieldSchema(name:si, type:smallint, comment:null), ]
+POSTHOOK: Lineage: orc_pred.t SIMPLE [(staging)staging.FieldSchema(name:t, type:tinyint, comment:null), ]
+POSTHOOK: Lineage: orc_pred.ts SIMPLE [(staging)staging.FieldSchema(name:ts, type:timestamp, comment:null), ]
+19 442 65553 4294967380 26.43 37.77 true alice zipper 2013-03-01 09:11:58.703217 29.62 history
+PREHOOK: query: SELECT SUM(HASH(t)) FROM orc_pred
+  WHERE t IS NOT NULL
+  AND t < 0
+  AND t > -2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_pred
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT SUM(HASH(t)) FROM orc_pred
   WHERE t IS NOT NULL
   AND t < 0
   AND t > -2
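
Note on the OrcInputFormat hunk: the two Java hunks implement a single convention, namely that a null included-columns array stands for "read every column". findIncludedColumns evidently returns null when no column projection is configured, which is exactly the SELECT * case the new .q test exercises, and the old mapping loop dereferenced the array unconditionally. The standalone sketch below mirrors that null-safe loop; the class name and the hard-coded data are invented for illustration and are not part of the Hive code.

    import java.util.Arrays;

    // Hypothetical stand-in for the OrcInputFormat mapping loop; subtypes
    // plays the role of types.get(0).getSubtypesList().
    public class PushdownColumnMappingSketch {
      public static void main(String[] args) {
        int[] subtypes = {1, 2, 3};            // column ids under the root struct
        boolean[] includedColumns = null;      // null: no projection, read everything
        String[] neededColumnNames = {"t", "si", "i"};
        String[] columnNames = new String[4];  // indexed by column id

        int i = 0;
        for (int columnId : subtypes) {
          // The old code read includedColumns[columnId] unconditionally and
          // threw a NullPointerException here; the patch adds the null check.
          if (includedColumns == null || includedColumns[columnId]) {
            columnNames[columnId] = neededColumnNames[i++];
          }
        }
        System.out.println(Arrays.toString(columnNames));  // [null, t, si, i]
      }
    }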
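
Note on the ReaderImpl hunk: the patch normalizes the null array once, at the reader's entry point, so RecordReaderImpl never has to repeat the check. A minimal sketch of that normalization, using a hypothetical helper method rather than the actual ReaderImpl.rows() signature:

    import java.util.Arrays;

    // Hypothetical helper (not the Hive API) showing the normalization the
    // patch performs inline in ReaderImpl.rows().
    public class IncludeAllColumnsSketch {
      static boolean[] normalizeInclude(boolean[] include, String[] columnNames) {
        if (include == null && columnNames != null) {
          include = new boolean[columnNames.length];
          Arrays.fill(include, true);  // boolean arrays default to false
        }
        return include;
      }

      public static void main(String[] args) {
        String[] columnNames = {"t", "si", "i", "b"};
        System.out.println(Arrays.toString(normalizeInclude(null, columnNames)));
        // prints [true, true, true, true]
      }
    }

Filling the array eagerly keeps the include-all case on the same code path as an explicit projection, which is why the downstream RecordReaderImpl constructor needs no change.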