diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
index 1b11e0e762..346ab5c8e7 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
@@ -57,6 +57,7 @@
 import org.apache.orc.impl.OutStream;
 import org.apache.orc.impl.RecordReaderUtils;
 import org.apache.orc.impl.StreamName;
+import org.apache.orc.impl.RecordReaderImpl.SargApplier;
 import org.apache.orc.impl.StreamName.Area;
 import org.apache.orc.impl.WriterImpl;
 import org.apache.orc.StripeInformation;
@@ -323,15 +324,17 @@ public void readEncodedColumns(int stripeIx, StripeInformation stripe,
       trace.logColumnRead(i, colRgIx, enc.getKind());
     }
     CreateHelper listToRead = new CreateHelper();
-    boolean hasIndexOnlyCols = false;
+    boolean hasIndexOnlyCols = false, hasAnyNonData = false;
     for (OrcProto.Stream stream : streamList) {
       long length = stream.getLength();
       int colIx = stream.getColumn();
       OrcProto.Stream.Kind streamKind = stream.getKind();
-      if (!physicalFileIncludes[colIx] || StreamName.getArea(streamKind) != StreamName.Area.DATA) {
-        // We have a stream for included column, but in future it might have no data streams.
-        // It's more like "has at least one column included that has an index stream".
-        hasIndexOnlyCols = hasIndexOnlyCols || physicalFileIncludes[colIx];
+      boolean isIndexCol = StreamName.getArea(streamKind) != StreamName.Area.DATA;
+      hasAnyNonData = hasAnyNonData || isIndexCol;
+      // We have a stream for an included column, but in the future it might have no data
+      // streams. It's more like "has at least one included column that has an index stream".
+      hasIndexOnlyCols = hasIndexOnlyCols || (isIndexCol && physicalFileIncludes[colIx]);
+      if (!physicalFileIncludes[colIx] || isIndexCol) {
         if (isTracingEnabled) {
           LOG.trace("Skipping stream for column " + colIx + ": " + streamKind + " at "
               + offset + ", " + length);
@@ -367,8 +370,22 @@ public void readEncodedColumns(int stripeIx, StripeInformation stripe,
     boolean hasFileId = this.fileKey != null;
     if (listToRead.get() == null) {
       // No data to read for this stripe. Check if we have some included index-only columns.
-      // TODO: there may be a bug here. Could there be partial RG filtering on index-only column?
-      if (hasIndexOnlyCols && (rgs == null)) {
+      // For example, count(1) would have the root column, which has no data stream, included.
+      // It may also happen that we have a column included with no streams whatsoever. That
+      // should only be possible if the file has no index streams.
+      boolean hasAnyIncludes = false;
+      if (!hasIndexOnlyCols) {
+        for (int i = 0; i < physicalFileIncludes.length; ++i) {
+          if (!physicalFileIncludes[i]) continue;
+          hasAnyIncludes = true;
+          break;
+        }
+      }
+      boolean nonProjectionRead = hasIndexOnlyCols || (!hasAnyNonData && hasAnyIncludes);
+
+      // TODO: Could there be partial RG filtering with no projection?
+      // We should probably just disable filtering for such cases if they exist.
+      if (nonProjectionRead && (rgs == SargApplier.READ_ALL_RGS)) {
         OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
         ecb.init(fileKey, stripeIx, OrcEncodedColumnBatch.ALL_RGS, physicalFileIncludes.length);
         try {
@@ -1004,7 +1021,7 @@ private CacheChunk prepareRangesForCompressedRead(long cOffset, long endCOffset,
       if (current instanceof CacheChunk) {
         // 2a. This is a decoded compression buffer, add as is.
         CacheChunk cc = (CacheChunk)current;
-        if (isTracingEnabled) { // TODO# HERE unaccompanied lock
+        if (isTracingEnabled) {
           LOG.trace("Locking " + cc.getBuffer() + " due to reuse");
         }
         cacheWrapper.reuseBuffer(cc.getBuffer());
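For reference, a minimal sketch (not part of the patch) of the condition the
readEncodedColumns() hunk above builds. The standalone helper isNonProjectionRead() is
hypothetical; its parameters mirror physicalFileIncludes, hasIndexOnlyCols, and
hasAnyNonData from the change.

    // Returns true when the read projects no data streams at all, in which case the
    // reader emits a single "all row groups" batch instead of reading stream data.
    static boolean isNonProjectionRead(
        boolean[] includes, boolean hasIndexOnlyCols, boolean hasAnyNonData) {
      boolean hasAnyIncludes = false;
      for (boolean included : includes) {
        if (included) { hasAnyIncludes = true; break; }
      }
      // Either some included column has only index streams (e.g. count(1), which
      // includes the root struct column that has no data stream), or the file has
      // no index streams at all and a column is included with no streams whatsoever.
      return hasIndexOnlyCols || (!hasAnyNonData && hasAnyIncludes);
    }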
diff --git ql/src/test/queries/clientpositive/vector_acid4.q ql/src/test/queries/clientpositive/vector_acid4.q
new file mode 100644
index 0000000000..628ecb5825
--- /dev/null
+++ ql/src/test/queries/clientpositive/vector_acid4.q
@@ -0,0 +1,43 @@
+--! qt:dataset:src
+
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.exec.dynamic.partition=true;
+set hive.vectorized.execution.enabled=true;
+set hive.compute.query.using.stats=false;
+set hive.fetch.task.conversion=none;
+set hive.llap.io.enabled=true;
+SET hive.exec.orc.default.row.index.stride=1000;
+set hive.mapred.mode=nonstrict;
+
+set hive.exec.orc.delta.streaming.optimizations.enabled=true;
+
+
+drop table cross_numbers;
+create table cross_numbers(i string);
+insert into table cross_numbers select key from src limit 20;
+
+drop table lots_of_rows;
+create table lots_of_rows(key string) stored as orc tblproperties("transactional"="false");
+insert into table lots_of_rows select concat(key, '', i) from src cross join cross_numbers;
+
+drop table testacid1;
+create table testacid1(id string, id2 string) clustered by (id2) into 2 buckets stored as orc tblproperties("transactional"="true");
+insert into table testacid1 select key, key from lots_of_rows;
+
+drop table lots_of_rows;
+
+select * from testacid1 order by id limit 30;
+select sum(hash(*)) from testacid1 limit 10;
+
+select count(id) from testacid1;
+
+select count(1) from testacid1;
+
+select count(1) from testacid1 where id = '0128';
+
+explain update testacid1 set id = '206' where id = '0128';
+update testacid1 set id = '206' where id = '0128';
+
+select * from testacid1 order by id limit 30;
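Context for the golden file that follows: the test drives the non-projection path fixed
above with plain count(1), which includes only the root struct column; count(id) serves
as a contrast that still reads a data stream, and count(1) with a predicate adds SARG
row-group filtering on top. A hedged sketch of the sentinel convention the new
rgs == SargApplier.READ_ALL_RGS check relies on (reference comparison against a shared
constant):

    // Illustration only, not part of the patch; rgs is the per-stripe row-group mask.
    boolean noRgFiltering = (rgs == SargApplier.READ_ALL_RGS); // identity, not content
    // Any other value means SARGs may have excluded row groups, so the ALL_RGS fast
    // path is skipped; the TODO in the reader covers filtered non-projection reads.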
diff --git ql/src/test/results/clientpositive/llap/vector_acid4.q.out ql/src/test/results/clientpositive/llap/vector_acid4.q.out
new file mode 100644
index 0000000000..9ad987a3b1
--- /dev/null
+++ ql/src/test/results/clientpositive/llap/vector_acid4.q.out
@@ -0,0 +1,264 @@
+PREHOOK: query: drop table cross_numbers
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table cross_numbers
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table cross_numbers(i string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@cross_numbers
+POSTHOOK: query: create table cross_numbers(i string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@cross_numbers
+PREHOOK: query: insert into table cross_numbers select key from src limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@cross_numbers
+POSTHOOK: query: insert into table cross_numbers select key from src limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@cross_numbers
+POSTHOOK: Lineage: cross_numbers.i SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: drop table lots_of_rows
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table lots_of_rows
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table lots_of_rows(key string) stored as orc tblproperties("transactional"="false")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@lots_of_rows
+POSTHOOK: query: create table lots_of_rows(key string) stored as orc tblproperties("transactional"="false")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@lots_of_rows
+Warning: Shuffle Join MERGEJOIN[16][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product
+PREHOOK: query: insert into table lots_of_rows select concat(key, '', i) from src cross join cross_numbers
+PREHOOK: type: QUERY
+PREHOOK: Input: default@cross_numbers
+PREHOOK: Input: default@src
+PREHOOK: Output: default@lots_of_rows
+POSTHOOK: query: insert into table lots_of_rows select concat(key, '', i) from src cross join cross_numbers
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@cross_numbers
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@lots_of_rows
+POSTHOOK: Lineage: lots_of_rows.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (cross_numbers)cross_numbers.FieldSchema(name:i, type:string, comment:null), ]
+PREHOOK: query: drop table testacid1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table testacid1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table testacid1(id string, id2 string) clustered by (id2) into 2 buckets stored as orc tblproperties("transactional"="true")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@testacid1
+POSTHOOK: query: create table testacid1(id string, id2 string) clustered by (id2) into 2 buckets stored as orc tblproperties("transactional"="true")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@testacid1
+PREHOOK: query: insert into table testacid1 select key, key from lots_of_rows
+PREHOOK: type: QUERY
+PREHOOK: Input: default@lots_of_rows
+PREHOOK: Output: default@testacid1
+POSTHOOK: query: insert into table testacid1 select key, key from lots_of_rows
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@lots_of_rows
+POSTHOOK: Output: default@testacid1
+POSTHOOK: Lineage: testacid1.id SIMPLE [(lots_of_rows)lots_of_rows.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: testacid1.id2 SIMPLE [(lots_of_rows)lots_of_rows.FieldSchema(name:key, type:string, comment:null), ]
+PREHOOK: query: drop table lots_of_rows
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table lots_of_rows
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: select * from testacid1 order by id limit 30
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testacid1 order by id limit 30
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+0128	0128
+0128	0128
+0128	0128
+0150	0150
+0150	0150
+0150	0150
+0165	0165
+0165	0165
+0165	0165
+0193	0193
+0193	0193
+0193	0193
+0213	0213
+0213	0213
+0213	0213
+0224	0224
+0224	0224
+0224	0224
+0238	0238
+0238	0238
+0238	0238
+0255	0255
+0255	0255
+0255	0255
+0265	0265
+0265	0265
+0265	0265
+027	027
+027	027
+027	027
+PREHOOK: query: select sum(hash(*)) from testacid1 limit 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select sum(hash(*)) from testacid1 limit 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+8838111640064
+PREHOOK: query: select count(id) from testacid1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(id) from testacid1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+10000
+PREHOOK: query: select count(1) from testacid1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(1) from testacid1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+10000
+PREHOOK: query: select count(1) from testacid1 where id = '0128'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(1) from testacid1 where id = '0128'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+3
+PREHOOK: query: explain update testacid1 set id = '206' where id = '0128'
+PREHOOK: type: QUERY
+POSTHOOK: query: explain update testacid1 set id = '206' where id = '0128'
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+  Stage-3 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: testacid1
+                  Statistics: Num rows: 10000 Data size: 1800000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (id = '0128') (type: boolean)
+                    Statistics: Num rows: 2 Data size: 360 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: ROW__ID (type: struct), id2 (type: string)
+                      outputColumnNames: _col0, _col2
+                      Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: struct)
+                        sort order: +
+                        Map-reduce partition columns: UDFToInteger(_col0) (type: int)
+                        Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE Column stats: COMPLETE
+                        value expressions: _col2 (type: string)
+            Execution mode: vectorized, llap
+            LLAP IO: may be used (ACID table)
+        Reducer 2
+            Execution mode: vectorized, llap
+            Reduce Operator Tree:
+              Select Operator
+                expressions: KEY.reducesinkkey0 (type: struct), '206' (type: string), VALUE._col0 (type: string)
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                      serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                      name: default.testacid1
+                  Write Type: UPDATE
+
+  Stage: Stage-2
+    Dependency Collection
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: false
+          table:
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+              output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+              serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              name: default.testacid1
+          Write Type: UPDATE
+
+  Stage: Stage-3
+    Stats Work
+      Basic Stats Work:
+
+PREHOOK: query: update testacid1 set id = '206' where id = '0128'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+PREHOOK: Output: default@testacid1
+POSTHOOK: query: update testacid1 set id = '206' where id = '0128'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+POSTHOOK: Output: default@testacid1
+PREHOOK: query: select * from testacid1 order by id limit 30
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testacid1 order by id limit 30
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+0150	0150
+0150	0150
+0150	0150
+0165	0165
+0165	0165
+0165	0165
+0193	0193
+0193	0193
+0193	0193
+0213	0213
+0213	0213
+0213	0213
+0224	0224
+0224	0224
+0224	0224
+0238	0238
+0238	0238
+0238	0238
+0255	0255
+0255	0255
+0255	0255
+0265	0265
+0265	0265
+0265	0265
+027	027
+027	027
+027	027
+0273	0273
+0273	0273
+0273	0273