diff --git a/pom.xml b/pom.xml
index 1f43c416db..7885c4e338 100644
--- a/pom.xml
+++ b/pom.xml
@@ -189,7 +189,7 @@
2.0.0-M5
4.1.17.Final
3.10.5.Final
- 1.9.0
+ 1.10.0
0.16.0
1.5.6
2.5.0
diff --git a/ql/src/java/org/apache/hadoop/hive/llap/LlapCacheAwareFs.java b/ql/src/java/org/apache/hadoop/hive/llap/LlapCacheAwareFs.java
index c75dd70b9d..f68ebd7c6d 100644
--- a/ql/src/java/org/apache/hadoop/hive/llap/LlapCacheAwareFs.java
+++ b/ql/src/java/org/apache/hadoop/hive/llap/LlapCacheAwareFs.java
@@ -140,7 +140,11 @@ public boolean delete(Path arg0, boolean arg1) throws IOException {
@Override
public FileStatus getFileStatus(Path arg0) throws IOException {
LlapCacheAwareFs.CacheAwareInputStream ctx = getCtx(arg0);
- return ctx.getFs().getFileStatus(ctx.path);
+ FileStatus fileStatus = ctx.getFs().getFileStatus(ctx.path);
+ // Replace the path in the returned FileStatus with the input path, since
+ // Parquet may use the FileStatus's path to open the file.
+ fileStatus.setPath(arg0);
+ return fileStatus;
}
@Override
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java
index 4a17ee494b..9ce1ba4591 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java
@@ -19,6 +19,7 @@
package org.apache.hadoop.hive.ql.io.parquet.vector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.column.ColumnDescriptor;
@@ -168,7 +169,7 @@ public Void visit(DataPageV2 dataPageV2) {
});
}
- private void initDataReader(Encoding dataEncoding, byte[] bytes, int offset, int valueCount)
+ private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount)
throws IOException {
this.pageValueCount = valueCount;
this.endOfPageValueCount = valuesRead + pageValueCount;
@@ -190,7 +191,7 @@ private void initDataReader(Encoding dataEncoding, byte[] bytes, int offset, int
}
try {
- dataColumn.initFromPage(pageValueCount, bytes, offset);
+ dataColumn.initFromPage(pageValueCount, in);
} catch (IOException e) {
throw new IOException("could not read page in col " + descriptor, e);
}
@@ -202,16 +203,15 @@ private void readPageV1(DataPageV1 page) {
this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader);
this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader);
try {
- byte[] bytes = page.getBytes().toByteArray();
- LOG.debug("page size " + bytes.length + " bytes and " + pageValueCount + " records");
- LOG.debug("reading repetition levels at 0");
- rlReader.initFromPage(pageValueCount, bytes, 0);
- int next = rlReader.getNextOffset();
- LOG.debug("reading definition levels at " + next);
- dlReader.initFromPage(pageValueCount, bytes, next);
- next = dlReader.getNextOffset();
- LOG.debug("reading data at " + next);
- initDataReader(page.getValueEncoding(), bytes, next, page.getValueCount());
+ BytesInput bytes = page.getBytes();
+ LOG.debug("page size " + bytes.size() + " bytes and " + pageValueCount + " records");
+ ByteBufferInputStream in = bytes.toInputStream();
+ LOG.debug("reading repetition levels at " + in.position());
+ rlReader.initFromPage(pageValueCount, in);
+ LOG.debug("reading definition levels at " + in.position());
+ dlReader.initFromPage(pageValueCount, in);
+ LOG.debug("reading data at " + in.position());
+ initDataReader(page.getValueEncoding(), in, page.getValueCount());
} catch (IOException e) {
throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
}
@@ -224,7 +224,7 @@ private void readPageV2(DataPageV2 page) {
this.definitionLevelColumn = newRLEIterator(descriptor.getMaxDefinitionLevel(), page.getDefinitionLevels());
try {
LOG.debug("page data size " + page.getData().size() + " bytes and " + pageValueCount + " records");
- initDataReader(page.getDataEncoding(), page.getData().toByteArray(), 0, page.getValueCount());
+ initDataReader(page.getDataEncoding(), page.getData().toInputStream(), page.getValueCount());
} catch (IOException e) {
throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReader.java
index 954e29bc05..4fd4cfd031 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReader.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.hive.ql.io.parquet.vector;
+import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.column.Dictionary;
import java.io.IOException;
@@ -31,11 +32,10 @@
/**
* Initialize the reader by page data.
* @param valueCount value count
- * @param page page data
- * @param offset current offset
+ * @param in page data
* @throws IOException
*/
- void initFromPage(int valueCount, byte[] page, int offset) throws IOException;
+ void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException;
/**
* @return the next Dictionary ID from the page
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java
index 0406308dcc..f5f19e1380 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java
@@ -30,6 +30,7 @@
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.column.Dictionary;
import org.apache.parquet.column.values.ValuesReader;
import org.apache.parquet.io.api.Binary;
@@ -74,13 +75,9 @@ public DefaultParquetDataColumnReader(Dictionary dict, int length) {
this.length = length;
}
- public void initFromPage(int i, ByteBuffer byteBuffer, int i1) throws IOException {
- valuesReader.initFromPage(i, byteBuffer, i1);
- }
-
@Override
- public void initFromPage(int valueCount, byte[] page, int offset) throws IOException {
- this.initFromPage(valueCount, ByteBuffer.wrap(page), offset);
+ public void initFromPage(int i, ByteBufferInputStream in) throws IOException {
+ valuesReader.initFromPage(i, in);
}
@Override
diff --git a/ql/src/test/results/clientpositive/parquet_analyze.q.out b/ql/src/test/results/clientpositive/parquet_analyze.q.out
index 7a024f9c9e..e746621afa 100644
--- a/ql/src/test/results/clientpositive/parquet_analyze.q.out
+++ b/ql/src/test/results/clientpositive/parquet_analyze.q.out
@@ -94,7 +94,7 @@ Table Parameters:
numFiles 1
numRows 100
rawDataSize 700
- totalSize 6700
+ totalSize 6692
#### A masked pattern was here ####
# Storage Information
@@ -141,8 +141,8 @@ Table Parameters:
bucketing_version 2
numFiles 1
numRows 100
- rawDataSize 5952
- totalSize 6700
+ rawDataSize 5936
+ totalSize 6692
#### A masked pattern was here ####
# Storage Information
diff --git a/ql/src/test/results/clientpositive/parquet_vectorization_0.q.out b/ql/src/test/results/clientpositive/parquet_vectorization_0.q.out
index 35253a837f..0509c7748e 100644
--- a/ql/src/test/results/clientpositive/parquet_vectorization_0.q.out
+++ b/ql/src/test/results/clientpositive/parquet_vectorization_0.q.out
@@ -1739,7 +1739,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -1761,7 +1761,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
name: default.alltypesparquet
@@ -30556,7 +30556,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30578,7 +30578,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
name: default.alltypesparquet
@@ -30668,7 +30668,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30690,7 +30690,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
name: default.alltypesparquet
@@ -30781,7 +30781,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30803,7 +30803,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
name: default.alltypesparquet
@@ -30882,7 +30882,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30904,7 +30904,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
name: default.alltypesparquet
diff --git a/ql/src/test/results/clientpositive/spark/parquet_vectorization_0.q.out b/ql/src/test/results/clientpositive/spark/parquet_vectorization_0.q.out
index 2aaf23dbbc..02bac57bf2 100644
--- a/ql/src/test/results/clientpositive/spark/parquet_vectorization_0.q.out
+++ b/ql/src/test/results/clientpositive/spark/parquet_vectorization_0.q.out
@@ -1830,7 +1830,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -1852,7 +1852,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
name: default.alltypesparquet
@@ -30652,7 +30652,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30674,7 +30674,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
name: default.alltypesparquet
@@ -30767,7 +30767,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30789,7 +30789,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
name: default.alltypesparquet
@@ -30883,7 +30883,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30905,7 +30905,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
name: default.alltypesparquet
@@ -30989,7 +30989,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -31011,7 +31011,7 @@ STAGE PLANS:
serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
serialization.format 1
serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- totalSize 594976
+ totalSize 595103
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
name: default.alltypesparquet