diff --git a/pom.xml b/pom.xml index 1f43c416db..7885c4e338 100644 --- a/pom.xml +++ b/pom.xml @@ -189,7 +189,7 @@ 2.0.0-M5 4.1.17.Final 3.10.5.Final - 1.9.0 + 1.10.0 0.16.0 1.5.6 2.5.0 diff --git a/ql/src/java/org/apache/hadoop/hive/llap/LlapCacheAwareFs.java b/ql/src/java/org/apache/hadoop/hive/llap/LlapCacheAwareFs.java index c75dd70b9d..f68ebd7c6d 100644 --- a/ql/src/java/org/apache/hadoop/hive/llap/LlapCacheAwareFs.java +++ b/ql/src/java/org/apache/hadoop/hive/llap/LlapCacheAwareFs.java @@ -140,7 +140,11 @@ public boolean delete(Path arg0, boolean arg1) throws IOException { @Override public FileStatus getFileStatus(Path arg0) throws IOException { LlapCacheAwareFs.CacheAwareInputStream ctx = getCtx(arg0); - return ctx.getFs().getFileStatus(ctx.path); + FileStatus fileStatus = ctx.getFs().getFileStatus(ctx.path); + // We replace the path in the file status by the input path as Parquet + // may use the path in the file status to open the file + fileStatus.setPath(arg0); + return fileStatus; } @Override diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java index 4a17ee494b..9ce1ba4591 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.io.parquet.vector; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.ColumnDescriptor; @@ -168,7 +169,7 @@ public Void visit(DataPageV2 dataPageV2) { }); } - private void initDataReader(Encoding dataEncoding, byte[] bytes, int offset, int valueCount) + private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) throws IOException { this.pageValueCount = valueCount; this.endOfPageValueCount = valuesRead + pageValueCount; @@ -190,7 +191,7 @@ private void initDataReader(Encoding dataEncoding, byte[] bytes, int offset, int } try { - dataColumn.initFromPage(pageValueCount, bytes, offset); + dataColumn.initFromPage(pageValueCount, in); } catch (IOException e) { throw new IOException("could not read page in col " + descriptor, e); } @@ -202,16 +203,15 @@ private void readPageV1(DataPageV1 page) { this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader); this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader); try { - byte[] bytes = page.getBytes().toByteArray(); - LOG.debug("page size " + bytes.length + " bytes and " + pageValueCount + " records"); - LOG.debug("reading repetition levels at 0"); - rlReader.initFromPage(pageValueCount, bytes, 0); - int next = rlReader.getNextOffset(); - LOG.debug("reading definition levels at " + next); - dlReader.initFromPage(pageValueCount, bytes, next); - next = dlReader.getNextOffset(); - LOG.debug("reading data at " + next); - initDataReader(page.getValueEncoding(), bytes, next, page.getValueCount()); + BytesInput bytes = page.getBytes(); + LOG.debug("page size " + bytes.size() + " bytes and " + pageValueCount + " records"); + ByteBufferInputStream in = bytes.toInputStream(); + LOG.debug("reading repetition levels at " + in.position()); + rlReader.initFromPage(pageValueCount, in); + LOG.debug("reading definition levels at " + in.position()); + dlReader.initFromPage(pageValueCount, in); + LOG.debug("reading data at " + in.position()); + initDataReader(page.getValueEncoding(), in, page.getValueCount()); } catch (IOException e) { throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e); } @@ -224,7 +224,7 @@ private void readPageV2(DataPageV2 page) { this.definitionLevelColumn = newRLEIterator(descriptor.getMaxDefinitionLevel(), page.getDefinitionLevels()); try { LOG.debug("page data size " + page.getData().size() + " bytes and " + pageValueCount + " records"); - initDataReader(page.getDataEncoding(), page.getData().toByteArray(), 0, page.getValueCount()); + initDataReader(page.getDataEncoding(), page.getData().toInputStream(), page.getValueCount()); } catch (IOException e) { throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReader.java index 954e29bc05..4fd4cfd031 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReader.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.io.parquet.vector; +import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.column.Dictionary; import java.io.IOException; @@ -31,11 +32,10 @@ /** * Initialize the reader by page data. * @param valueCount value count - * @param page page data - * @param offset current offset + * @param in page data * @throws IOException */ - void initFromPage(int valueCount, byte[] page, int offset) throws IOException; + void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException; /** * @return the next Dictionary ID from the page diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java index 0406308dcc..f5f19e1380 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.column.Dictionary; import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.io.api.Binary; @@ -74,13 +75,9 @@ public DefaultParquetDataColumnReader(Dictionary dict, int length) { this.length = length; } - public void initFromPage(int i, ByteBuffer byteBuffer, int i1) throws IOException { - valuesReader.initFromPage(i, byteBuffer, i1); - } - @Override - public void initFromPage(int valueCount, byte[] page, int offset) throws IOException { - this.initFromPage(valueCount, ByteBuffer.wrap(page), offset); + public void initFromPage(int i, ByteBufferInputStream in) throws IOException { + valuesReader.initFromPage(i, in); } @Override diff --git a/ql/src/test/results/clientpositive/parquet_analyze.q.out b/ql/src/test/results/clientpositive/parquet_analyze.q.out index 7a024f9c9e..e746621afa 100644 --- a/ql/src/test/results/clientpositive/parquet_analyze.q.out +++ b/ql/src/test/results/clientpositive/parquet_analyze.q.out @@ -94,7 +94,7 @@ Table Parameters: numFiles 1 numRows 100 rawDataSize 700 - totalSize 6700 + totalSize 6692 #### A masked pattern was here #### # Storage Information @@ -141,8 +141,8 @@ Table Parameters: bucketing_version 2 numFiles 1 numRows 100 - rawDataSize 5952 - totalSize 6700 + rawDataSize 5936 + totalSize 6692 #### A masked pattern was here #### # Storage Information diff --git a/ql/src/test/results/clientpositive/parquet_vectorization_0.q.out b/ql/src/test/results/clientpositive/parquet_vectorization_0.q.out index 35253a837f..0509c7748e 100644 --- a/ql/src/test/results/clientpositive/parquet_vectorization_0.q.out +++ b/ql/src/test/results/clientpositive/parquet_vectorization_0.q.out @@ -1739,7 +1739,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe @@ -1761,7 +1761,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe name: default.alltypesparquet @@ -30556,7 +30556,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe @@ -30578,7 +30578,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe name: default.alltypesparquet @@ -30668,7 +30668,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe @@ -30690,7 +30690,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe name: default.alltypesparquet @@ -30781,7 +30781,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe @@ -30803,7 +30803,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe name: default.alltypesparquet @@ -30882,7 +30882,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe @@ -30904,7 +30904,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe name: default.alltypesparquet diff --git a/ql/src/test/results/clientpositive/spark/parquet_vectorization_0.q.out b/ql/src/test/results/clientpositive/spark/parquet_vectorization_0.q.out index 2aaf23dbbc..02bac57bf2 100644 --- a/ql/src/test/results/clientpositive/spark/parquet_vectorization_0.q.out +++ b/ql/src/test/results/clientpositive/spark/parquet_vectorization_0.q.out @@ -1830,7 +1830,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe @@ -1852,7 +1852,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe name: default.alltypesparquet @@ -30652,7 +30652,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe @@ -30674,7 +30674,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe name: default.alltypesparquet @@ -30767,7 +30767,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe @@ -30789,7 +30789,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe name: default.alltypesparquet @@ -30883,7 +30883,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe @@ -30905,7 +30905,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe name: default.alltypesparquet @@ -30989,7 +30989,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe @@ -31011,7 +31011,7 @@ STAGE PLANS: serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2} serialization.format 1 serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe - totalSize 594976 + totalSize 595103 #### A masked pattern was here #### serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe name: default.alltypesparquet