diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java index 907a9b8..4a17ee4 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java @@ -1,9 +1,13 @@ /* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -14,10 +18,10 @@ package org.apache.hadoop.hive.ql.io.parquet.vector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.Dictionary; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.page.DataPage; import org.apache.parquet.column.page.DataPageV1; @@ -27,6 +31,7 @@ import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.schema.DecimalMetadata; import org.apache.parquet.schema.Type; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -62,7 +67,7 @@ /** * The dictionary, if this column has dictionary encoding. */ - protected final Dictionary dictionary; + protected final ParquetDataColumnReader dictionary; /** * If true, the current page is dictionary encoded. @@ -82,7 +87,7 @@ */ protected IntIterator repetitionLevelColumn; protected IntIterator definitionLevelColumn; - protected ValuesReader dataColumn; + protected ParquetDataColumnReader dataColumn; /** * Total values in the current page. @@ -92,22 +97,39 @@ protected final PageReader pageReader; protected final ColumnDescriptor descriptor; protected final Type type; + protected final TypeInfo hiveType; + + /** + * Used for VectorizedDummyColumnReader. 
+ */ + public BaseVectorizedColumnReader() { + this.pageReader = null; + this.descriptor = null; + this.type = null; + this.dictionary = null; + this.hiveType = null; + this.maxDefLevel = -1; + } public BaseVectorizedColumnReader( ColumnDescriptor descriptor, PageReader pageReader, boolean skipTimestampConversion, - Type type) throws IOException { + Type parquetType, TypeInfo hiveType) throws IOException { this.descriptor = descriptor; - this.type = type; + this.type = parquetType; this.pageReader = pageReader; this.maxDefLevel = descriptor.getMaxDefinitionLevel(); this.skipTimestampConversion = skipTimestampConversion; + this.hiveType = hiveType; DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); if (dictionaryPage != null) { try { - this.dictionary = dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage); + this.dictionary = ParquetDataColumnReaderFactory + .getDataColumnReaderByTypeOnDictionary(parquetType.asPrimitiveType(), hiveType, + dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage), + skipTimestampConversion); this.isCurrentPageDictionaryEncoded = true; } catch (IOException e) { throw new IOException("could not decode the dictionary for " + descriptor, e); @@ -130,7 +152,7 @@ protected void readPage() throws IOException { if (page == null) { return; } - // TODO: Why is this a visitor? + page.accept(new DataPage.Visitor() { @Override public Void visit(DataPageV1 dataPageV1) { @@ -146,7 +168,8 @@ public Void visit(DataPageV2 dataPageV2) { }); } - private void initDataReader(Encoding dataEncoding, byte[] bytes, int offset, int valueCount) throws IOException { + private void initDataReader(Encoding dataEncoding, byte[] bytes, int offset, int valueCount) + throws IOException { this.pageValueCount = valueCount; this.endOfPageValueCount = valuesRead + pageValueCount; if (dataEncoding.usesDictionary()) { @@ -156,10 +179,13 @@ private void initDataReader(Encoding dataEncoding, byte[] bytes, int offset, int "could not read page in col " + descriptor + " as the dictionary was missing for encoding " + dataEncoding); } - dataColumn = dataEncoding.getDictionaryBasedValuesReader(descriptor, VALUES, dictionary); + dataColumn = ParquetDataColumnReaderFactory.getDataColumnReaderByType(type.asPrimitiveType(), hiveType, + dataEncoding.getDictionaryBasedValuesReader(descriptor, VALUES, dictionary + .getDictionary()), skipTimestampConversion); this.isCurrentPageDictionaryEncoded = true; } else { - dataColumn = ParquetDataColumnReaderFactory.getDataColumnReaderByType(type.asPrimitiveType(), hiveType, + dataEncoding.getValuesReader(descriptor, VALUES), skipTimestampConversion); this.isCurrentPageDictionaryEncoded = false; } @@ -219,8 +245,20 @@ private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) { } /** + * Check that the underlying Parquet type can be read as a Hive decimal type. + * + * @param type the Parquet type to check + */ + protected void decimalTypeCheck(Type type) { + DecimalMetadata decimalMetadata = type.asPrimitiveType().getDecimalMetadata(); + if (decimalMetadata == null) { + throw new UnsupportedOperationException("The underlying Parquet type cannot be " + + "converted to Hive Decimal type: " + type); + } + } + + /** * Utility classes to abstract over different ways to read ints with different encodings. - TODO: remove this layer of abstraction?
*/ abstract static class IntIterator { abstract int nextInt(); @@ -258,6 +296,8 @@ int nextInt() { protected static final class NullIntIterator extends IntIterator { @Override - int nextInt() { return 0; } + int nextInt() { + return 0; + } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReader.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReader.java new file mode 100644 index 0000000..6bfa95a --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReader.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.parquet.vector; + +import org.apache.parquet.column.Dictionary; + +import java.io.IOException; +import java.sql.Timestamp; + +/** + * The interface to wrap the underlying Parquet dictionary and non dictionary encoded page reader. + */ +public interface ParquetDataColumnReader { + + /** + * Initialize the reader by page data. + * @param valueCount value count + * @param page page data + * @param offset current offset + * @throws IOException + */ + void initFromPage(int valueCount, byte[] page, int offset) throws IOException; + + /** + * @return the next Dictionary ID from the page + */ + int readValueDictionaryId(); + + /** + * @return the next Long from the page + */ + long readLong(); + + /** + * @return the next Integer from the page + */ + int readInteger(); + + /** + * @return the next Float from the page + */ + float readFloat(); + + /** + * @return the next Boolean from the page + */ + boolean readBoolean(); + + /** + * @return the next String from the page + */ + byte[] readString(); + + /** + * @return the next Varchar from the page + */ + byte[] readVarchar(); + + /** + * @return the next Char from the page + */ + byte[] readChar(); + + /** + * @return the next Bytes from the page + */ + byte[] readBytes(); + + /** + * @return the next Decimal from the page + */ + byte[] readDecimal(); + + /** + * @return the next Double from the page + */ + double readDouble(); + + /** + * @return the next Timestamp from the page + */ + Timestamp readTimestamp(); + + /** + * @return the underlying dictionary if current reader is dictionary encoded + */ + Dictionary getDictionary(); + + /** + * @param id in dictionary + * @return the Bytes from the dictionary by id + */ + byte[] readBytes(int id); + + /** + * @param id in dictionary + * @return the Float from the dictionary by id + */ + float readFloat(int id); + + /** + * @param id in dictionary + * @return the Double from the dictionary by id + */ + double readDouble(int id); + + /** + * @param id in dictionary + * @return the Integer from the dictionary by id + */ + int readInteger(int id); + + /** + * @param id in dictionary 
+ * @return the Long from the dictionary by id + */ + long readLong(int id); + + /** + * @param id in dictionary + * @return the Boolean from the dictionary by id + */ + boolean readBoolean(int id); + + /** + * @param id in dictionary + * @return the Decimal from the dictionary by id + */ + byte[] readDecimal(int id); + + /** + * @param id in dictionary + * @return the Timestamp from the dictionary by id + */ + Timestamp readTimestamp(int id); + + /** + * @param id in dictionary + * @return the String from the dictionary by id + */ + byte[] readString(int id); + + /** + * @param id in dictionary + * @return the Varchar from the dictionary by id + */ + byte[] readVarchar(int id); + + /** + * @param id in dictionary + * @return the Char from the dictionary by id + */ + byte[] readChar(int id); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java new file mode 100644 index 0000000..898a2c6 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java @@ -0,0 +1,908 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.parquet.vector; + +import org.apache.hadoop.hive.common.type.HiveBaseChar; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime; +import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.sql.Timestamp; +import java.util.Arrays; + +/** + * Parquet file has self-describing schema which may differ from the user required schema (e.g. + * schema evolution). This factory is used to retrieve user required typed data via corresponding + * reader which reads the underlying data. 
+ */ +public final class ParquetDataColumnReaderFactory { + + private ParquetDataColumnReaderFactory() { + } + + /** + * The default data column reader for existing Parquet page reader which works for both + * dictionary or non dictionary types, Mirror from dictionary encoding path. + */ + public static class DefaultParquetDataColumnReader implements ParquetDataColumnReader { + protected ValuesReader valuesReader; + protected Dictionary dict; + + // Varchar or char length + protected int length = -1; + + public DefaultParquetDataColumnReader(ValuesReader valuesReader, int length) { + this.valuesReader = valuesReader; + this.length = length; + } + + public DefaultParquetDataColumnReader(Dictionary dict, int length) { + this.dict = dict; + this.length = length; + } + + public void initFromPage(int i, ByteBuffer byteBuffer, int i1) throws IOException { + valuesReader.initFromPage(i, byteBuffer, i1); + } + + @Override + public void initFromPage(int valueCount, byte[] page, int offset) throws IOException { + this.initFromPage(valueCount, ByteBuffer.wrap(page), offset); + } + + @Override + public boolean readBoolean() { + return valuesReader.readBoolean(); + } + + @Override + public boolean readBoolean(int id) { + return dict.decodeToBoolean(id); + } + + @Override + public byte[] readString(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readString() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readVarchar() { + // we need to enforce the size here even the types are the same + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readVarchar(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readChar() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readChar(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readBytes() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readBytes(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readDecimal() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readDecimal(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public float readFloat() { + return valuesReader.readFloat(); + } + + @Override + public float readFloat(int id) { + return dict.decodeToFloat(id); + } + + @Override + public double readDouble() { + return valuesReader.readDouble(); + } + + @Override + public double readDouble(int id) { + return dict.decodeToDouble(id); + } + + @Override + public Timestamp readTimestamp() { + throw new RuntimeException("Unsupported operation"); + } + + @Override + public Timestamp readTimestamp(int id) { + throw new RuntimeException("Unsupported operation"); + } + + @Override + public int readInteger() { + return valuesReader.readInteger(); + } + + @Override + public int readInteger(int id) { + return dict.decodeToInt(id); + } + + @Override + public long readLong(int id) { + return dict.decodeToLong(id); + } + + @Override + public long readLong() { + return valuesReader.readLong(); + } + + @Override + public int readValueDictionaryId() { + return valuesReader.readValueDictionaryId(); + } + + public void skip() { + valuesReader.skip(); + } + + @Override + public Dictionary getDictionary() { + return dict; + } + + /** + * Enforce the max legnth of varchar or char. 
+ */ + protected String enforceMaxLength(String value) { + return HiveBaseChar.enforceMaxLength(value, length); + } + + /** + * Pad the value to the declared char length. + */ + protected String getPaddedString(String value) { + return HiveBaseChar.getPaddedValue(value, length); + } + + /** + * Method to convert string to UTF-8 bytes. + */ + protected static byte[] convertToBytes(String value) { + try { + // encode the string as UTF-8 bytes + return value.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException("Failed to encode string in UTF-8", e); + } + } + } + + /** + * The reader which reads the underlying int32 values. The implementation is consistent with + * ETypeConverter EINT32_CONVERTER. + */ + public static class TypesFromInt32PageReader extends DefaultParquetDataColumnReader { + + public TypesFromInt32PageReader(ValuesReader realReader, int length) { + super(realReader, length); + } + + public TypesFromInt32PageReader(Dictionary dict, int length) { + super(dict, length); + } + + @Override + public long readLong() { + return valuesReader.readInteger(); + } + + @Override + public long readLong(int id) { + return dict.decodeToInt(id); + } + + @Override + public float readFloat() { + return valuesReader.readInteger(); + } + + @Override + public float readFloat(int id) { + return dict.decodeToInt(id); + } + + @Override + public double readDouble() { + return valuesReader.readInteger(); + } + + @Override + public double readDouble(int id) { + return dict.decodeToInt(id); + } + + @Override + public byte[] readString() { + return convertToBytes(valuesReader.readInteger()); + } + + @Override + public byte[] readString(int id) { + return convertToBytes(dict.decodeToInt(id)); + } + + @Override + public byte[] readVarchar() { + String value = enforceMaxLength( + convertToString(valuesReader.readInteger())); + return convertToBytes(value); + } + + @Override + public byte[] readVarchar(int id) { + String value = enforceMaxLength( + convertToString(dict.decodeToInt(id))); + return convertToBytes(value); + } + + @Override + public byte[] readChar() { + String value = enforceMaxLength( + convertToString(valuesReader.readInteger())); + return convertToBytes(value); + } + + @Override + public byte[] readChar(int id) { + String value = enforceMaxLength( + convertToString(dict.decodeToInt(id))); + return convertToBytes(value); + } + + private static String convertToString(int value) { + return Integer.toString(value); + } + + private static byte[] convertToBytes(int value) { + return convertToBytes(convertToString(value)); + } + } + + /** + * The reader which reads the underlying int64 values.
Implementation is in consist with + * ETypeConverter EINT64_CONVERTER + */ + public static class TypesFromInt64PageReader extends DefaultParquetDataColumnReader { + + public TypesFromInt64PageReader(ValuesReader realReader, int length) { + super(realReader, length); + } + + public TypesFromInt64PageReader(Dictionary dict, int length) { + super(dict, length); + } + + @Override + public float readFloat() { + return valuesReader.readLong(); + } + + @Override + public float readFloat(int id) { + return dict.decodeToLong(id); + } + + @Override + public double readDouble() { + return valuesReader.readLong(); + } + + @Override + public double readDouble(int id) { + return dict.decodeToLong(id); + } + + @Override + public byte[] readString() { + return convertToBytes(valuesReader.readLong()); + } + + @Override + public byte[] readString(int id) { + return convertToBytes(dict.decodeToLong(id)); + } + + @Override + public byte[] readVarchar() { + String value = enforceMaxLength( + convertToString(valuesReader.readLong())); + return convertToBytes(value); + } + + @Override + public byte[] readVarchar(int id) { + String value = enforceMaxLength( + convertToString(dict.decodeToLong(id))); + return convertToBytes(value); + } + + @Override + public byte[] readChar() { + String value = enforceMaxLength( + convertToString(valuesReader.readLong())); + return convertToBytes(value); + } + + @Override + public byte[] readChar(int id) { + String value = enforceMaxLength( + convertToString(dict.decodeToLong(id))); + return convertToBytes(value); + } + + private static String convertToString(long value) { + return Long.toString(value); + } + + private static byte[] convertToBytes(long value) { + return convertToBytes(convertToString(value)); + } + } + + /** + * The reader who reads from the underlying float value value. Implementation is in consist with + * ETypeConverter EFLOAT_CONVERTER + */ + public static class TypesFromFloatPageReader extends DefaultParquetDataColumnReader { + + public TypesFromFloatPageReader(ValuesReader realReader, int length) { + super(realReader, length); + } + + public TypesFromFloatPageReader(Dictionary realReader, int length) { + super(realReader, length); + } + + @Override + public double readDouble() { + return valuesReader.readFloat(); + } + + @Override + public double readDouble(int id) { + return dict.decodeToFloat(id); + } + + @Override + public byte[] readString() { + return convertToBytes(valuesReader.readFloat()); + } + + @Override + public byte[] readString(int id) { + return convertToBytes(dict.decodeToFloat(id)); + } + + @Override + public byte[] readVarchar() { + String value = enforceMaxLength( + convertToString(valuesReader.readFloat())); + return convertToBytes(value); + } + + @Override + public byte[] readVarchar(int id) { + String value = enforceMaxLength( + convertToString(dict.decodeToFloat(id))); + return convertToBytes(value); + } + + @Override + public byte[] readChar() { + String value = enforceMaxLength( + convertToString(valuesReader.readFloat())); + return convertToBytes(value); + } + + @Override + public byte[] readChar(int id) { + String value = enforceMaxLength( + convertToString(dict.decodeToFloat(id))); + return convertToBytes(value); + } + + private static String convertToString(float value) { + return Float.toString(value); + } + + private static byte[] convertToBytes(float value) { + return convertToBytes(convertToString(value)); + } + } + + /** + * The reader who reads from the underlying double value value. 
+ */ + public static class TypesFromDoublePageReader extends DefaultParquetDataColumnReader { + + public TypesFromDoublePageReader(ValuesReader realReader, int length) { + super(realReader, length); + } + + public TypesFromDoublePageReader(Dictionary dict, int length) { + super(dict, length); + } + + @Override + public byte[] readString() { + return convertToBytes(valuesReader.readDouble()); + } + + @Override + public byte[] readString(int id) { + return convertToBytes(dict.decodeToDouble(id)); + } + + @Override + public byte[] readVarchar() { + String value = enforceMaxLength( + convertToString(valuesReader.readDouble())); + return convertToBytes(value); + } + + @Override + public byte[] readVarchar(int id) { + String value = enforceMaxLength( + convertToString(dict.decodeToDouble(id))); + return convertToBytes(value); + } + + @Override + public byte[] readChar() { + String value = enforceMaxLength( + convertToString(valuesReader.readDouble())); + return convertToBytes(value); + } + + @Override + public byte[] readChar(int id) { + String value = enforceMaxLength( + convertToString(dict.decodeToDouble(id))); + return convertToBytes(value); + } + + private static String convertToString(double value) { + return Double.toString(value); + } + + private static byte[] convertToBytes(double value) { + return convertToBytes(convertToString(value)); + } + } + + /** + * The reader who reads from the underlying boolean value value. + */ + public static class TypesFromBooleanPageReader extends DefaultParquetDataColumnReader { + + public TypesFromBooleanPageReader(ValuesReader valuesReader, int length) { + super(valuesReader, length); + } + + public TypesFromBooleanPageReader(Dictionary dict, int length) { + super(dict, length); + } + + @Override + public byte[] readString() { + return convertToBytes(valuesReader.readBoolean()); + } + + @Override + public byte[] readString(int id) { + return convertToBytes(dict.decodeToBoolean(id)); + } + + @Override + public byte[] readVarchar() { + String value = enforceMaxLength( + convertToString(valuesReader.readBoolean())); + return convertToBytes(value); + } + + @Override + public byte[] readVarchar(int id) { + String value = enforceMaxLength( + convertToString(dict.decodeToBoolean(id))); + return convertToBytes(value); + } + + @Override + public byte[] readChar() { + String value = enforceMaxLength( + convertToString(valuesReader.readBoolean())); + return convertToBytes(value); + } + + @Override + public byte[] readChar(int id) { + String value = enforceMaxLength( + convertToString(dict.decodeToBoolean(id))); + return convertToBytes(value); + } + + private static String convertToString(boolean value) { + return Boolean.toString(value); + } + + private static byte[] convertToBytes(boolean value) { + return convertToBytes(convertToString(value)); + } + } + + /** + * The reader who reads from the underlying Timestamp value value. 
+ */ + public static class TypesFromInt96PageReader extends DefaultParquetDataColumnReader { + private boolean skipTimestampConversion = false; + + public TypesFromInt96PageReader(ValuesReader realReader, int length, + boolean skipTimestampConversion) { + super(realReader, length); + this.skipTimestampConversion = skipTimestampConversion; + } + + public TypesFromInt96PageReader(Dictionary dict, int length, boolean skipTimestampConversion) { + super(dict, length); + this.skipTimestampConversion = skipTimestampConversion; + } + + private Timestamp convert(Binary binary) { + ByteBuffer buf = binary.toByteBuffer(); + buf.order(ByteOrder.LITTLE_ENDIAN); + long timeOfDayNanos = buf.getLong(); + int julianDay = buf.getInt(); + NanoTime nt = new NanoTime(julianDay, timeOfDayNanos); + return NanoTimeUtils.getTimestamp(nt, skipTimestampConversion); + } + + @Override + public Timestamp readTimestamp(int id) { + return convert(dict.decodeToBinary(id)); + } + + @Override + public Timestamp readTimestamp() { + return convert(valuesReader.readBytes()); + } + + @Override + public byte[] readString() { + return convertToBytes(readTimestamp()); + } + + @Override + public byte[] readString(int id) { + return convertToBytes(readTimestamp(id)); + } + + @Override + public byte[] readVarchar() { + String value = enforceMaxLength( + convertToString(readTimestamp())); + return convertToBytes(value); + } + + @Override + public byte[] readVarchar(int id) { + String value = enforceMaxLength( + convertToString(readTimestamp(id))); + return convertToBytes(value); + } + + @Override + public byte[] readChar() { + String value = enforceMaxLength( + convertToString(readTimestamp())); + return convertToBytes(value); + } + + @Override + public byte[] readChar(int id) { + String value = enforceMaxLength( + convertToString(readTimestamp(id))); + return convertToBytes(value); + } + + private static String convertToString(Timestamp value) { + return value.toString(); + } + + private static byte[] convertToBytes(Timestamp value) { + return convertToBytes(convertToString(value)); + } + } + + /** + * The reader who reads from the underlying decimal value value. 
+ */ + public static class TypesFromDecimalPageReader extends DefaultParquetDataColumnReader { + private HiveDecimalWritable tempDecimal = new HiveDecimalWritable(); + private short scale; + + public TypesFromDecimalPageReader(ValuesReader realReader, int length, short scale) { + super(realReader, length); + this.scale = scale; + } + + public TypesFromDecimalPageReader(Dictionary dict, int length, short scale) { + super(dict, length); + this.scale = scale; + } + + @Override + public byte[] readString() { + return convertToBytes(valuesReader.readBytes()); + } + + @Override + public byte[] readString(int id) { + return convertToBytes(dict.decodeToBinary(id)); + } + + @Override + public byte[] readVarchar() { + String value = enforceMaxLength( + convertToString(valuesReader.readBytes())); + return convertToBytes(value); + } + + @Override + public byte[] readVarchar(int id) { + String value = enforceMaxLength( + convertToString(dict.decodeToBinary(id))); + return convertToBytes(value); + } + + @Override + public byte[] readChar() { + String value = enforceMaxLength( + convertToString(valuesReader.readBytes())); + return convertToBytes(value); + } + + @Override + public byte[] readChar(int id) { + String value = enforceMaxLength( + convertToString(dict.decodeToBinary(id))); + return convertToBytes(value); + } + + private String convertToString(Binary value) { + tempDecimal.set(value.getBytesUnsafe(), scale); + return tempDecimal.toString(); + } + + private byte[] convertToBytes(Binary value) { + return convertToBytes(convertToString(value)); + } + } + + /** + * The reader who reads from the underlying UTF8 string. + */ + public static class TypesFromStringPageReader extends DefaultParquetDataColumnReader { + + public TypesFromStringPageReader(ValuesReader realReader, int length) { + super(realReader, length); + } + + public TypesFromStringPageReader(Dictionary dict, int length) { + super(dict, length); + } + + @Override + public byte[] readVarchar() { + // check the character numbers with the length + final byte[] value = valuesReader.readBytes().getBytesUnsafe(); + return truncateIfNecesssary(value); + } + + @Override + public byte[] readVarchar(int id) { + // check the character numbers with the length + final byte[] value = dict.decodeToBinary(id).getBytesUnsafe(); + return truncateIfNecesssary(value); + } + + @Override + public byte[] readChar() { + // check the character numbers with the length + final byte[] value = valuesReader.readBytes().getBytesUnsafe(); + return truncateIfNecesssary(value); + } + + @Override + public byte[] readChar(int id) { + // check the character numbers with the length + final byte[] value = dict.decodeToBinary(id).getBytesUnsafe(); + return truncateIfNecesssary(value); + } + + private byte[] truncateIfNecesssary(byte[] bytes) { + if (length <= 0 || bytes == null) { + return bytes; + } + + int len = bytes.length; + int truncatedLength = StringExpr.truncate(bytes, 0, len, length); + if (truncatedLength >= len) { + return bytes; + } + + return Arrays.copyOf(bytes, truncatedLength); + } + } + + private static ParquetDataColumnReader getDataColumnReaderByTypeHelper(boolean isDictionary, + PrimitiveType parquetType, + TypeInfo hiveType, + Dictionary dictionary, + ValuesReader valuesReader, + boolean + skipTimestampConversion) + throws IOException { + // max length for varchar and char cases + int length = getVarcharLength(hiveType); + + switch (parquetType.getPrimitiveTypeName()) { + case INT32: + return isDictionary ? 
new TypesFromInt32PageReader(dictionary, length) : new + TypesFromInt32PageReader(valuesReader, length); + case INT64: + return isDictionary ? new TypesFromInt64PageReader(dictionary, length) : new + TypesFromInt64PageReader(valuesReader, length); + case FLOAT: + return isDictionary ? new TypesFromFloatPageReader(dictionary, length) : new + TypesFromFloatPageReader(valuesReader, length); + case INT96: + return isDictionary ? new TypesFromInt96PageReader(dictionary, length, + skipTimestampConversion) : new + TypesFromInt96PageReader(valuesReader, length, skipTimestampConversion); + case BOOLEAN: + return isDictionary ? new TypesFromBooleanPageReader(dictionary, length) : new + TypesFromBooleanPageReader(valuesReader, length); + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return getConvertorFromBinary(isDictionary, parquetType, hiveType, valuesReader, dictionary); + case DOUBLE: + return isDictionary ? new TypesFromDoublePageReader(dictionary, length) : new + TypesFromDoublePageReader(valuesReader, length); + default: + return isDictionary ? new DefaultParquetDataColumnReader(dictionary, length) : new + DefaultParquetDataColumnReader(valuesReader, length); + } + } + + private static ParquetDataColumnReader getConvertorFromBinary(boolean isDict, + PrimitiveType parquetType, + TypeInfo hiveType, + ValuesReader valuesReader, + Dictionary dictionary) { + OriginalType originalType = parquetType.getOriginalType(); + + // max length for varchar and char cases + int length = getVarcharLength(hiveType); + + if (originalType == null) { + return isDict ? new DefaultParquetDataColumnReader(dictionary, length) : new + DefaultParquetDataColumnReader(valuesReader, length); + } + switch (originalType) { + case DECIMAL: + final short scale = (short) parquetType.asPrimitiveType().getDecimalMetadata().getScale(); + return isDict ? new TypesFromDecimalPageReader(dictionary, length, scale) : new + TypesFromDecimalPageReader(valuesReader, length, scale); + case UTF8: + return isDict ? new TypesFromStringPageReader(dictionary, length) : new + TypesFromStringPageReader(valuesReader, length); + default: + return isDict ? 
new DefaultParquetDataColumnReader(dictionary, length) : new + DefaultParquetDataColumnReader(valuesReader, length); + } + } + + public static ParquetDataColumnReader getDataColumnReaderByTypeOnDictionary( + PrimitiveType parquetType, + TypeInfo hiveType, + Dictionary realReader, boolean skipTimestampConversion) + throws IOException { + return getDataColumnReaderByTypeHelper(true, parquetType, hiveType, realReader, null, + skipTimestampConversion); + } + + public static ParquetDataColumnReader getDataColumnReaderByType(PrimitiveType parquetType, + TypeInfo hiveType, + ValuesReader realReader, + boolean skipTimestampConversion) + throws IOException { + return getDataColumnReaderByTypeHelper(false, parquetType, hiveType, null, realReader, + skipTimestampConversion); + } + + + // For Varchar or char type, return the max length of the type + private static int getVarcharLength(TypeInfo hiveType) { + int length = -1; + if (hiveType instanceof PrimitiveTypeInfo) { + PrimitiveTypeInfo hivePrimitiveType = (PrimitiveTypeInfo) hiveType; + switch (hivePrimitiveType.getPrimitiveCategory()) { + case CHAR: + length = ((CharTypeInfo) hivePrimitiveType).getLength(); + break; + case VARCHAR: + length = ((VarcharTypeInfo) hivePrimitiveType).getLength(); + break; + default: + break; + } + } + + return length; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java new file mode 100644 index 0000000..ee1d692 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.parquet.vector; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +import java.io.IOException; +import java.util.Arrays; + +/** + * A dummy vectorized parquet reader to support schema evolution. 
+ */ +public class VectorizedDummyColumnReader extends BaseVectorizedColumnReader { + + public VectorizedDummyColumnReader() { + super(); + } + + @Override + public void readBatch(int total, ColumnVector column, TypeInfo columnType) throws IOException { + Arrays.fill(column.isNull, true); + column.isRepeating = true; + column.noNulls = false; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java index c36640d..a8e8599 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java @@ -33,6 +33,7 @@ /** * It's column level Parquet reader which is used to read a batch of records for a list column. + * TODO Currently the List type only supports the non-nested case. */ public class VectorizedListColumnReader extends BaseVectorizedColumnReader { @@ -46,8 +47,9 @@ boolean isFirstRow = true; public VectorizedListColumnReader(ColumnDescriptor descriptor, PageReader pageReader, - boolean skipTimestampConversion, Type type) throws IOException { - super(descriptor, pageReader, skipTimestampConversion, type); + boolean skipTimestampConversion, Type type, TypeInfo hiveType) + throws IOException { + super(descriptor, pageReader, skipTimestampConversion, type, hiveType); } @Override @@ -81,7 +83,7 @@ public void readBatch(int total, ColumnVector column, TypeInfo columnType) throw // Decode the value if necessary if (isCurrentPageDictionaryEncoded) { - valueList = decodeDictionaryIds(valueList); + valueList = decodeDictionaryIds(category, valueList); } // Convert valueList to array for the ListColumnVector.child convertValueListToListColumnVector(category, lcv, valueList, index); @@ -142,75 +144,112 @@ private void addElement(ListColumnVector lcv, List elements, PrimitiveOb lcv.lengths[index] = elements.size() - lcv.offsets[index]; } + // Needs to be consistent with VectorizedPrimitiveColumnReader#readBatchHelper + // TODO Reduce the duplicated code private Object readPrimitiveTypedRow(PrimitiveObjectInspector.PrimitiveCategory category) { switch (category) { - case INT: - case BYTE: - case SHORT: - return dataColumn.readInteger(); - case DATE: - case INTERVAL_YEAR_MONTH: - case LONG: - return dataColumn.readLong(); - case BOOLEAN: - return dataColumn.readBoolean() ? 1 : 0; - case DOUBLE: - return dataColumn.readDouble(); - case BINARY: - case STRING: - case CHAR: - case VARCHAR: - return dataColumn.readBytes().getBytesUnsafe(); - case FLOAT: - return dataColumn.readFloat(); - case DECIMAL: - return dataColumn.readBytes().getBytesUnsafe(); - case INTERVAL_DAY_TIME: - case TIMESTAMP: - default: - throw new RuntimeException("Unsupported type in the list: " + type); + case INT: + case BYTE: + case SHORT: + return dataColumn.readInteger(); + case DATE: + case INTERVAL_YEAR_MONTH: + case LONG: + return dataColumn.readLong(); + case BOOLEAN: + return dataColumn.readBoolean() ?
1 : 0; + case DOUBLE: + return dataColumn.readDouble(); + case BINARY: + return dataColumn.readBytes(); + case STRING: + case CHAR: + case VARCHAR: + return dataColumn.readString(); + case FLOAT: + return dataColumn.readFloat(); + case DECIMAL: + return dataColumn.readDecimal(); + case TIMESTAMP: + return dataColumn.readTimestamp(); + case INTERVAL_DAY_TIME: + default: + throw new RuntimeException("Unsupported type in the list: " + type); } } - private List decodeDictionaryIds(List valueList) { + private List decodeDictionaryIds(PrimitiveObjectInspector.PrimitiveCategory category, List + valueList) { int total = valueList.size(); List resultList; List intList = (List) valueList; - switch (descriptor.getType()) { - case INT32: - resultList = new ArrayList(total); - for (int i = 0; i < total; ++i) { - resultList.add(dictionary.decodeToInt(intList.get(i))); - } - break; - case INT64: - resultList = new ArrayList(total); - for (int i = 0; i < total; ++i) { - resultList.add(dictionary.decodeToLong(intList.get(i))); - } - break; - case FLOAT: - resultList = new ArrayList(total); - for (int i = 0; i < total; ++i) { - resultList.add(dictionary.decodeToFloat(intList.get(i))); - } - break; - case DOUBLE: - resultList = new ArrayList(total); - for (int i = 0; i < total; ++i) { - resultList.add(dictionary.decodeToDouble(intList.get(i))); - } - break; - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - resultList = new ArrayList(total); - for (int i = 0; i < total; ++i) { - resultList.add(dictionary.decodeToBinary(intList.get(i)).getBytesUnsafe()); - } - break; - default: - throw new UnsupportedOperationException("Unsupported type: " + descriptor.getType()); + + switch (category) { + case INT: + case BYTE: + case SHORT: + resultList = new ArrayList(total); + for (int i = 0; i < total; ++i) { + resultList.add(dictionary.readInteger(intList.get(i))); + } + break; + case DATE: + case INTERVAL_YEAR_MONTH: + case LONG: + resultList = new ArrayList(total); + for (int i = 0; i < total; ++i) { + resultList.add(dictionary.readLong(intList.get(i))); + } + break; + case BOOLEAN: + resultList = new ArrayList(total); + for (int i = 0; i < total; ++i) { + resultList.add(dictionary.readBoolean(intList.get(i)) ? 
1 : 0); + } + break; + case DOUBLE: + resultList = new ArrayList(total); + for (int i = 0; i < total; ++i) { + resultList.add(dictionary.readDouble(intList.get(i))); + } + break; + case BINARY: + resultList = new ArrayList(total); + for (int i = 0; i < total; ++i) { + resultList.add(dictionary.readBytes(intList.get(i))); + } + break; + case STRING: + case CHAR: + case VARCHAR: + resultList = new ArrayList(total); + for (int i = 0; i < total; ++i) { + resultList.add(dictionary.readString(intList.get(i))); + } + break; + case FLOAT: + resultList = new ArrayList(total); + for (int i = 0; i < total; ++i) { + resultList.add(dictionary.readFloat(intList.get(i))); + } + break; + case DECIMAL: + resultList = new ArrayList(total); + for (int i = 0; i < total; ++i) { + resultList.add(dictionary.readDecimal(intList.get(i))); + } + break; + case TIMESTAMP: + resultList = new ArrayList(total); + for (int i = 0; i < total; ++i) { + resultList.add(dictionary.readTimestamp(intList.get(i))); + } + break; + case INTERVAL_DAY_TIME: + default: + throw new RuntimeException("Unsupported type in the list: " + type); } + return resultList; } @@ -228,71 +267,79 @@ private void setChildrenInfo(ListColumnVector lcv, int itemNum, int elementNum) lcv.offsets = lcvOffset; } - private void fillColumnVector(PrimitiveObjectInspector.PrimitiveCategory category, ListColumnVector lcv, - List valueList, int elementNum) { + private void fillColumnVector(PrimitiveObjectInspector.PrimitiveCategory category, + ListColumnVector lcv, + List valueList, int elementNum) { int total = valueList.size(); setChildrenInfo(lcv, total, elementNum); switch (category) { - case INT: - case BYTE: - case SHORT: - case BOOLEAN: - lcv.child = new LongColumnVector(total); - for (int i = 0; i < valueList.size(); i++) { - ((LongColumnVector)lcv.child).vector[i] = ((List)valueList).get(i); - } - break; - case DATE: - case INTERVAL_YEAR_MONTH: - case LONG: - lcv.child = new LongColumnVector(total); - for (int i = 0; i < valueList.size(); i++) { - ((LongColumnVector)lcv.child).vector[i] = ((List)valueList).get(i); - } - break; - case DOUBLE: - lcv.child = new DoubleColumnVector(total); - for (int i = 0; i < valueList.size(); i++) { - ((DoubleColumnVector)lcv.child).vector[i] = ((List)valueList).get(i); - } - break; - case BINARY: - case STRING: - case CHAR: - case VARCHAR: - lcv.child = new BytesColumnVector(total); - lcv.child.init(); - for (int i = 0; i < valueList.size(); i++) { - byte[] src = ((List)valueList).get(i); - ((BytesColumnVector)lcv.child).setRef(i, src, 0, src.length); - } - break; - case FLOAT: - lcv.child = new DoubleColumnVector(total); - for (int i = 0; i < valueList.size(); i++) { - ((DoubleColumnVector)lcv.child).vector[i] = ((List)valueList).get(i); - } - break; - case DECIMAL: - int precision = type.asPrimitiveType().getDecimalMetadata().getPrecision(); - int scale = type.asPrimitiveType().getDecimalMetadata().getScale(); - lcv.child = new DecimalColumnVector(total, precision, scale); - for (int i = 0; i < valueList.size(); i++) { - ((DecimalColumnVector)lcv.child).vector[i].set(((List)valueList).get(i), scale); - } - break; - case INTERVAL_DAY_TIME: - case TIMESTAMP: - default: - throw new RuntimeException("Unsupported type in the list: " + type); + case INT: + case BYTE: + case SHORT: + lcv.child = new LongColumnVector(total); + for (int i = 0; i < valueList.size(); i++) { + ((LongColumnVector) lcv.child).vector[i] = ((List) valueList).get(i); + } + break; + case BOOLEAN: + lcv.child = new LongColumnVector(total); + for (int i = 
0; i < valueList.size(); i++) { + ((LongColumnVector) lcv.child).vector[i] = ((List) valueList).get(i) ? 1 : 0; + } + break; + case DATE: + case INTERVAL_YEAR_MONTH: + case LONG: + lcv.child = new LongColumnVector(total); + for (int i = 0; i < valueList.size(); i++) { + ((LongColumnVector) lcv.child).vector[i] = ((List) valueList).get(i); + } + break; + case DOUBLE: + lcv.child = new DoubleColumnVector(total); + for (int i = 0; i < valueList.size(); i++) { + ((DoubleColumnVector) lcv.child).vector[i] = ((List) valueList).get(i); + } + break; + case BINARY: + case STRING: + case CHAR: + case VARCHAR: + lcv.child = new BytesColumnVector(total); + lcv.child.init(); + for (int i = 0; i < valueList.size(); i++) { + byte[] src = ((List) valueList).get(i); + ((BytesColumnVector) lcv.child).setRef(i, src, 0, src.length); + } + break; + case FLOAT: + lcv.child = new DoubleColumnVector(total); + for (int i = 0; i < valueList.size(); i++) { + ((DoubleColumnVector) lcv.child).vector[i] = ((List) valueList).get(i); + } + break; + case DECIMAL: + decimalTypeCheck(type); + int precision = type.asPrimitiveType().getDecimalMetadata().getPrecision(); + int scale = type.asPrimitiveType().getDecimalMetadata().getScale(); + lcv.child = new DecimalColumnVector(total, precision, scale); + for (int i = 0; i < valueList.size(); i++) { + ((DecimalColumnVector) lcv.child).vector[i].set(((List) valueList).get(i), scale); + } + break; + case INTERVAL_DAY_TIME: + case TIMESTAMP: + default: + throw new RuntimeException("Unsupported type in the list: " + type); } } /** * Finish the result ListColumnVector with all collected information. */ - private void convertValueListToListColumnVector(PrimitiveObjectInspector.PrimitiveCategory category, - ListColumnVector lcv, List valueList, int elementNum) { + private void convertValueListToListColumnVector( + PrimitiveObjectInspector.PrimitiveCategory category, ListColumnVector lcv, List valueList, + int elementNum) { // Fill the child of ListColumnVector with valueList fillColumnVector(category, lcv, valueList, elementNum); setIsRepeating(lcv); @@ -371,8 +418,9 @@ private boolean compareColumnVector(ColumnVector cv1, ColumnVector cv2) { if (cv1 instanceof DecimalColumnVector && cv2 instanceof DecimalColumnVector) { return compareDecimalColumnVector((DecimalColumnVector) cv1, (DecimalColumnVector) cv2); } - throw new RuntimeException("Unsupported ColumnVector comparision between " + cv1.getClass().getName() - + " and " + cv2.getClass().getName()); + throw new RuntimeException( + "Unsupported ColumnVector comparision between " + cv1.getClass().getName() + + " and " + cv2.getClass().getName()); } else { return false; } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java index 08ac57b..7b77eee 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java @@ -1,9 +1,13 @@ /* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -11,9 +15,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.hive.ql.io.parquet.vector; -import com.google.common.annotations.VisibleForTesting; +package org.apache.hadoop.hive.ql.io.parquet.vector; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; @@ -64,6 +67,7 @@ import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.InvalidSchemaException; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -459,6 +463,19 @@ private void checkEndOfRowGroup() throws IOException { return res; } + // TODO support only non nested case + private PrimitiveType getElementType(Type type) { + if (type.isPrimitive()) { + return type.asPrimitiveType(); + } + if (type.asGroupType().getFields().size() > 1) { + throw new RuntimeException( + "Current Parquet Vectorization reader doesn't support nested type"); + } + return type.asGroupType().getFields().get(0).asGroupType().getFields().get(0) + .asPrimitiveType(); + } + // Build VectorizedParquetColumnReader via Hive typeInfo and Parquet schema private VectorizedColumnReader buildVectorizedParquetReader( TypeInfo typeInfo, @@ -474,9 +491,13 @@ private VectorizedColumnReader buildVectorizedParquetReader( if (columnDescriptors == null || columnDescriptors.isEmpty()) { throw new RuntimeException( "Failed to find related Parquet column descriptor with type " + type); - } else { + } + if (fileSchema.getColumns().contains(descriptors.get(0))) { return new VectorizedPrimitiveColumnReader(descriptors.get(0), - pages.getPageReader(descriptors.get(0)), skipTimestampConversion, type); + pages.getPageReader(descriptors.get(0)), skipTimestampConversion, type, typeInfo); + } else { + // Support for schema evolution + return new VectorizedDummyColumnReader(); } case STRUCT: StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; @@ -502,8 +523,10 @@ private VectorizedColumnReader buildVectorizedParquetReader( throw new RuntimeException( "Failed to find related Parquet column descriptor with type " + type); } + return new VectorizedListColumnReader(descriptors.get(0), - pages.getPageReader(descriptors.get(0)), skipTimestampConversion, type); + pages.getPageReader(descriptors.get(0)), skipTimestampConversion, getElementType(type), + typeInfo); case MAP: if (columnDescriptors == null || columnDescriptors.isEmpty()) { throw new RuntimeException( @@ -535,10 +558,10 @@ private VectorizedColumnReader buildVectorizedParquetReader( List kvTypes = groupType.getFields(); VectorizedListColumnReader keyListColumnReader = new VectorizedListColumnReader( descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion, - kvTypes.get(0)); + kvTypes.get(0), typeInfo); VectorizedListColumnReader valueListColumnReader = new VectorizedListColumnReader( descriptors.get(1), pages.getPageReader(descriptors.get(1)), skipTimestampConversion, - kvTypes.get(1)); + 
kvTypes.get(1), typeInfo); return new VectorizedMapColumnReader(keyListColumnReader, valueListColumnReader); case UNION: default: diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java index 39689f1..c2811d5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java @@ -19,17 +19,13 @@ import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; -import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime; -import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.page.PageReader; import org.apache.parquet.schema.Type; + import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.sql.Timestamp; /** * It's column level Parquet reader which is used to read a batch of records for a column, @@ -38,18 +34,18 @@ public class VectorizedPrimitiveColumnReader extends BaseVectorizedColumnReader { public VectorizedPrimitiveColumnReader( - ColumnDescriptor descriptor, - PageReader pageReader, - boolean skipTimestampConversion, - Type type) throws IOException { - super(descriptor, pageReader, skipTimestampConversion, type); + ColumnDescriptor descriptor, + PageReader pageReader, + boolean skipTimestampConversion, + Type type, TypeInfo hiveType) throws IOException { + super(descriptor, pageReader, skipTimestampConversion, type, hiveType); } @Override public void readBatch( - int total, - ColumnVector column, - TypeInfo columnType) throws IOException { + int total, + ColumnVector column, + TypeInfo columnType) throws IOException { int rowId = 0; while (total > 0) { // Compute the number of values we want to read in this page. @@ -64,7 +60,7 @@ public void readBatch( LongColumnVector dictionaryIds = new LongColumnVector(); // Read and decode dictionary ids. 
readDictionaryIDs(num, dictionaryIds, rowId); - decodeDictionaryIds(rowId, num, column, dictionaryIds); + decodeDictionaryIds(rowId, num, column, columnType, dictionaryIds); } else { // assign values in vector readBatchHelper(num, column, columnType, rowId); @@ -75,10 +71,10 @@ public void readBatch( } private void readBatchHelper( - int num, - ColumnVector column, - TypeInfo columnType, - int rowId) throws IOException { + int num, + ColumnVector column, + TypeInfo columnType, + int rowId) throws IOException { PrimitiveTypeInfo primitiveColumnType = (PrimitiveTypeInfo) columnType; switch (primitiveColumnType.getPrimitiveCategory()) { @@ -90,6 +86,7 @@ private void readBatchHelper( case DATE: case INTERVAL_YEAR_MONTH: case LONG: + case INTERVAL_DAY_TIME: readLongs(num, (LongColumnVector) column, rowId); break; case BOOLEAN: @@ -99,10 +96,16 @@ private void readBatchHelper( readDoubles(num, (DoubleColumnVector) column, rowId); break; case BINARY: + readBinaries(num, (BytesColumnVector) column, rowId); + break; case STRING: - case CHAR: + readString(num, (BytesColumnVector) column, rowId); + break; case VARCHAR: - readBinaries(num, (BytesColumnVector) column, rowId); + readVarchar(num, (BytesColumnVector) column, rowId); + break; + case CHAR: + readChar(num, (BytesColumnVector) column, rowId); break; case FLOAT: readFloats(num, (DoubleColumnVector) column, rowId); @@ -113,16 +116,15 @@ private void readBatchHelper( case TIMESTAMP: readTimestamp(num, (TimestampColumnVector) column, rowId); break; - case INTERVAL_DAY_TIME: default: throw new IOException("Unsupported type: " + type); } } private void readDictionaryIDs( - int total, - LongColumnVector c, - int rowId) throws IOException { + int total, + LongColumnVector c, + int rowId) throws IOException { int left = total; while (left > 0) { readRepetitionAndDefinitionLevels(); @@ -141,9 +143,9 @@ private void readDictionaryIDs( } private void readIntegers( - int total, - LongColumnVector c, - int rowId) throws IOException { + int total, + LongColumnVector c, + int rowId) throws IOException { int left = total; while (left > 0) { readRepetitionAndDefinitionLevels(); @@ -162,9 +164,9 @@ private void readIntegers( } private void readDoubles( - int total, - DoubleColumnVector c, - int rowId) throws IOException { + int total, + DoubleColumnVector c, + int rowId) throws IOException { int left = total; while (left > 0) { readRepetitionAndDefinitionLevels(); @@ -183,9 +185,9 @@ private void readDoubles( } private void readBooleans( - int total, - LongColumnVector c, - int rowId) throws IOException { + int total, + LongColumnVector c, + int rowId) throws IOException { int left = total; while (left > 0) { readRepetitionAndDefinitionLevels(); @@ -204,9 +206,9 @@ private void readBooleans( } private void readLongs( - int total, - LongColumnVector c, - int rowId) throws IOException { + int total, + LongColumnVector c, + int rowId) throws IOException { int left = total; while (left > 0) { readRepetitionAndDefinitionLevels(); @@ -225,9 +227,9 @@ private void readLongs( } private void readFloats( - int total, - DoubleColumnVector c, - int rowId) throws IOException { + int total, + DoubleColumnVector c, + int rowId) throws IOException { int left = total; while (left > 0) { readRepetitionAndDefinitionLevels(); @@ -246,16 +248,17 @@ private void readFloats( } private void readDecimal( - int total, - DecimalColumnVector c, - int rowId) throws IOException { + int total, + DecimalColumnVector c, + int rowId) throws IOException { + decimalTypeCheck(type); int left = 
total; c.precision = (short) type.asPrimitiveType().getDecimalMetadata().getPrecision(); c.scale = (short) type.asPrimitiveType().getDecimalMetadata().getScale(); while (left > 0) { readRepetitionAndDefinitionLevels(); if (definitionLevel >= maxDefLevel) { - c.vector[rowId].set(dataColumn.readBytes().getBytesUnsafe(), c.scale); + c.vector[rowId].set(dataColumn.readDecimal(), c.scale); c.isNull[rowId] = false; c.isRepeating = c.isRepeating && (c.vector[0] == c.vector[rowId]); } else { @@ -268,15 +271,81 @@ private void readDecimal( } } + private void readString( + int total, + BytesColumnVector c, + int rowId) throws IOException { + int left = total; + while (left > 0) { + readRepetitionAndDefinitionLevels(); + if (definitionLevel >= maxDefLevel) { + c.setVal(rowId, dataColumn.readString()); + c.isNull[rowId] = false; + // TODO figure out a better way to set repeat for Binary type + c.isRepeating = false; + } else { + c.isNull[rowId] = true; + c.isRepeating = false; + c.noNulls = false; + } + rowId++; + left--; + } + } + + private void readChar( + int total, + BytesColumnVector c, + int rowId) throws IOException { + int left = total; + while (left > 0) { + readRepetitionAndDefinitionLevels(); + if (definitionLevel >= maxDefLevel) { + c.setVal(rowId, dataColumn.readChar()); + c.isNull[rowId] = false; + // TODO figure out a better way to set repeat for Binary type + c.isRepeating = false; + } else { + c.isNull[rowId] = true; + c.isRepeating = false; + c.noNulls = false; + } + rowId++; + left--; + } + } + + private void readVarchar( + int total, + BytesColumnVector c, + int rowId) throws IOException { + int left = total; + while (left > 0) { + readRepetitionAndDefinitionLevels(); + if (definitionLevel >= maxDefLevel) { + c.setVal(rowId, dataColumn.readVarchar()); + c.isNull[rowId] = false; + // TODO figure out a better way to set repeat for Binary type + c.isRepeating = false; + } else { + c.isNull[rowId] = true; + c.isRepeating = false; + c.noNulls = false; + } + rowId++; + left--; + } + } + private void readBinaries( - int total, - BytesColumnVector c, - int rowId) throws IOException { + int total, + BytesColumnVector c, + int rowId) throws IOException { int left = total; while (left > 0) { readRepetitionAndDefinitionLevels(); if (definitionLevel >= maxDefLevel) { - c.setVal(rowId, dataColumn.readBytes().getBytesUnsafe()); + c.setVal(rowId, dataColumn.readBytes()); c.isNull[rowId] = false; // TODO figure out a better way to set repeat for Binary type c.isRepeating = false; @@ -296,11 +365,9 @@ private void readTimestamp(int total, TimestampColumnVector c, int rowId) throws readRepetitionAndDefinitionLevels(); if (definitionLevel >= maxDefLevel) { switch (descriptor.getType()) { - //INT64 is not yet supported + //INT64 is not yet supported case INT96: - NanoTime nt = NanoTime.fromBinary(dataColumn.readBytes()); - Timestamp ts = NanoTimeUtils.getTimestamp(nt, skipTimestampConversion); - c.set(rowId, ts); + c.set(rowId, dataColumn.readTimestamp()); break; default: throw new IOException( @@ -323,73 +390,99 @@ private void readTimestamp(int total, TimestampColumnVector c, int rowId) throws * Reads `num` values into column, decoding the values from `dictionaryIds` and `dictionary`. 
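 * <p>
 * The dispatch below is keyed on the Hive primitive category of {@code columnType} rather than
 * on the Parquet type in the column descriptor, so a single physical encoding (INT96, for
 * example) can be decoded into different Hive vectors such as timestamp or string depending on
 * the declared table schema.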
*/ private void decodeDictionaryIds( - int rowId, - int num, - ColumnVector column, - LongColumnVector dictionaryIds) { + int rowId, + int num, + ColumnVector column, + TypeInfo columnType, + LongColumnVector dictionaryIds) { System.arraycopy(dictionaryIds.isNull, rowId, column.isNull, rowId, num); if (column.noNulls) { column.noNulls = dictionaryIds.noNulls; } column.isRepeating = column.isRepeating && dictionaryIds.isRepeating; - switch (descriptor.getType()) { - case INT32: + + PrimitiveTypeInfo primitiveColumnType = (PrimitiveTypeInfo) columnType; + + switch (primitiveColumnType.getPrimitiveCategory()) { + case INT: + case BYTE: + case SHORT: for (int i = rowId; i < rowId + num; ++i) { ((LongColumnVector) column).vector[i] = - dictionary.decodeToInt((int) dictionaryIds.vector[i]); + dictionary.readInteger((int) dictionaryIds.vector[i]); } break; - case INT64: + case DATE: + case INTERVAL_YEAR_MONTH: + case LONG: for (int i = rowId; i < rowId + num; ++i) { ((LongColumnVector) column).vector[i] = - dictionary.decodeToLong((int) dictionaryIds.vector[i]); + dictionary.readLong((int) dictionaryIds.vector[i]); } break; - case FLOAT: + case BOOLEAN: for (int i = rowId; i < rowId + num; ++i) { - ((DoubleColumnVector) column).vector[i] = - dictionary.decodeToFloat((int) dictionaryIds.vector[i]); + ((LongColumnVector) column).vector[i] = + dictionary.readBoolean((int) dictionaryIds.vector[i]) ? 1 : 0; } break; case DOUBLE: for (int i = rowId; i < rowId + num; ++i) { ((DoubleColumnVector) column).vector[i] = - dictionary.decodeToDouble((int) dictionaryIds.vector[i]); + dictionary.readDouble((int) dictionaryIds.vector[i]); } break; - case INT96: + case BINARY: for (int i = rowId; i < rowId + num; ++i) { - ByteBuffer buf = dictionary.decodeToBinary((int) dictionaryIds.vector[i]).toByteBuffer(); - buf.order(ByteOrder.LITTLE_ENDIAN); - long timeOfDayNanos = buf.getLong(); - int julianDay = buf.getInt(); - NanoTime nt = new NanoTime(julianDay, timeOfDayNanos); - Timestamp ts = NanoTimeUtils.getTimestamp(nt, skipTimestampConversion); - ((TimestampColumnVector) column).set(i, ts); + ((BytesColumnVector) column) + .setVal(i, dictionary.readBytes((int) dictionaryIds.vector[i])); } break; - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - if (column instanceof BytesColumnVector) { - for (int i = rowId; i < rowId + num; ++i) { - ((BytesColumnVector) column) - .setVal(i, dictionary.decodeToBinary((int) dictionaryIds.vector[i]).getBytesUnsafe()); - } - } else { - DecimalColumnVector decimalColumnVector = ((DecimalColumnVector) column); - decimalColumnVector.precision = - (short) type.asPrimitiveType().getDecimalMetadata().getPrecision(); - decimalColumnVector.scale = (short) type.asPrimitiveType().getDecimalMetadata().getScale(); - for (int i = rowId; i < rowId + num; ++i) { - decimalColumnVector.vector[i] - .set(dictionary.decodeToBinary((int) dictionaryIds.vector[i]).getBytesUnsafe(), - decimalColumnVector.scale); - } + case STRING: + for (int i = rowId; i < rowId + num; ++i) { + ((BytesColumnVector) column) + .setVal(i, dictionary.readString((int) dictionaryIds.vector[i])); + } + break; + case VARCHAR: + for (int i = rowId; i < rowId + num; ++i) { + ((BytesColumnVector) column) + .setVal(i, dictionary.readVarchar((int) dictionaryIds.vector[i])); + } + break; + case CHAR: + for (int i = rowId; i < rowId + num; ++i) { + ((BytesColumnVector) column) + .setVal(i, dictionary.readChar((int) dictionaryIds.vector[i])); + } + break; + case FLOAT: + for (int i = rowId; i < rowId + num; ++i) { + ((DoubleColumnVector) 
column).vector[i] = + dictionary.readFloat((int) dictionaryIds.vector[i]); } break; + case DECIMAL: + decimalTypeCheck(type); + DecimalColumnVector decimalColumnVector = ((DecimalColumnVector) column); + decimalColumnVector.precision = (short) type.asPrimitiveType().getDecimalMetadata().getPrecision(); + decimalColumnVector.scale = (short) type.asPrimitiveType().getDecimalMetadata().getScale(); + for (int i = rowId; i < rowId + num; ++i) { + decimalColumnVector.vector[i] + .set(dictionary.readDecimal((int) dictionaryIds.vector[i]), + decimalColumnVector.scale); + } + break; + case TIMESTAMP: + for (int i = rowId; i < rowId + num; ++i) { + ((TimestampColumnVector) column) + .set(i, dictionary.readTimestamp((int) dictionaryIds.vector[i])); + } + break; + case INTERVAL_DAY_TIME: default: - throw new UnsupportedOperationException("Unsupported type: " + descriptor.getType()); + throw new UnsupportedOperationException("Unsupported type: " + type); } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/package-info.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/package-info.java new file mode 100644 index 0000000..b695974 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Hive Parquet Vectorized Reader related. 
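+ * It contains the vectorized column readers together with the ParquetDataColumnReader
+ * wrappers that adapt Parquet values to the Hive types declared in the table schema.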
+ */ +package org.apache.hadoop.hive.ql.io.parquet.vector; diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedColumnReader.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedColumnReader.java index 9e414dc..52e6045 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedColumnReader.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedColumnReader.java @@ -35,7 +35,6 @@ import java.io.IOException; -import static junit.framework.TestCase.assertFalse; import static org.apache.parquet.hadoop.api.ReadSupport.PARQUET_READ_SCHEMA; public class TestVectorizedColumnReader extends VectorizedColumnReaderTestBase { @@ -55,26 +54,40 @@ public static void cleanup() throws IOException { @Test public void testIntRead() throws Exception { intRead(isDictionaryEncoding); + longReadInt(isDictionaryEncoding); + floatReadInt(isDictionaryEncoding); + doubleReadInt(isDictionaryEncoding); } @Test public void testLongRead() throws Exception { longRead(isDictionaryEncoding); + floatReadLong(isDictionaryEncoding); + doubleReadLong(isDictionaryEncoding); + } + + @Test + public void testTimestamp() throws Exception { + timestampRead(isDictionaryEncoding); + stringReadTimestamp(isDictionaryEncoding); } @Test public void testDoubleRead() throws Exception { doubleRead(isDictionaryEncoding); + stringReadDouble(isDictionaryEncoding); } @Test public void testFloatRead() throws Exception { floatRead(isDictionaryEncoding); + doubleReadFloat(isDictionaryEncoding); } @Test public void testBooleanRead() throws Exception { booleanRead(); + stringReadBoolean(); } @Test @@ -101,6 +114,7 @@ public void structReadSomeNull() throws Exception { @Test public void decimalRead() throws Exception { decimalRead(isDictionaryEncoding); + stringReadDecimal(isDictionaryEncoding); } private class TestVectorizedParquetRecordReader extends VectorizedParquetRecordReader { diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedDictionaryEncodingColumnReader.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedDictionaryEncodingColumnReader.java index 3e5d831..32d27d9 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedDictionaryEncodingColumnReader.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedDictionaryEncodingColumnReader.java @@ -41,21 +41,34 @@ public static void cleanup() throws IOException { @Test public void testIntRead() throws Exception { intRead(isDictionaryEncoding); + longReadInt(isDictionaryEncoding); + floatReadInt(isDictionaryEncoding); + doubleReadInt(isDictionaryEncoding); } @Test public void testLongRead() throws Exception { longRead(isDictionaryEncoding); + floatReadLong(isDictionaryEncoding); + doubleReadLong(isDictionaryEncoding); + } + + @Test + public void testTimestamp() throws Exception { + timestampRead(isDictionaryEncoding); + stringReadTimestamp(isDictionaryEncoding); } @Test public void testDoubleRead() throws Exception { doubleRead(isDictionaryEncoding); + stringReadDouble(isDictionaryEncoding); } @Test public void testFloatRead() throws Exception { floatRead(isDictionaryEncoding); + doubleReadFloat(isDictionaryEncoding); } @Test @@ -81,5 +94,6 @@ public void structReadSomeNull() throws Exception { @Test public void decimalRead() throws Exception { decimalRead(isDictionaryEncoding); + stringReadDecimal(isDictionaryEncoding); } } diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/VectorizedColumnReaderTestBase.java 
ql/src/test/org/apache/hadoop/hive/ql/io/parquet/VectorizedColumnReaderTestBase.java index 5d3ebd6..a230441 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/VectorizedColumnReaderTestBase.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/VectorizedColumnReaderTestBase.java @@ -1,9 +1,13 @@ /* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -26,12 +30,14 @@ import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; import org.apache.hadoop.hive.ql.io.IOConstants; import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport; import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector; +import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime; +import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils; import org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.MapWork; @@ -43,8 +49,8 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.parquet.example.data.Group; import org.apache.parquet.example.data.simple.SimpleGroupFactory; @@ -54,9 +60,10 @@ import org.apache.parquet.hadoop.example.GroupWriteSupport; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.MessageType; + import java.io.IOException; -import java.math.BigDecimal; -import java.math.BigInteger; +import java.sql.Timestamp; +import java.util.Arrays; import java.util.List; import static junit.framework.Assert.assertTrue; @@ -77,84 +84,84 @@ protected final static Path file = new Path("target/test/TestParquetVectorReader/testParquetFile"); protected static final MessageType schema = parseMessageType( - "message hive_schema { " - + "required int32 int32_field; " - + "required int64 int64_field; " - + "required int96 int96_field; " - + "required double double_field; " - + "required float float_field; " - + "required boolean boolean_field; " - + "required fixed_len_byte_array(3) flba_field; " - + "optional fixed_len_byte_array(1) some_null_field; " - + "optional 
fixed_len_byte_array(1) all_null_field; " - + "required binary binary_field; " - + "optional binary binary_field_some_null; " - + "required binary value (DECIMAL(5,2)); " - + "required group struct_field {" - + " required int32 a;\n" - + " required double b;\n" - + "}\n" - + "optional group nested_struct_field {" - + " optional group nsf {" - + " optional int32 c;\n" - + " optional int32 d;\n" - + " }\n" - + " optional double e;\n" - + "}\n" - + "optional group struct_field_some_null {" - + " optional int32 f;\n" - + " optional double g;\n" - + "}\n" - + "optional group map_field (MAP) {\n" - + " repeated group map (MAP_KEY_VALUE) {\n" - + " required binary key;\n" - + " optional binary value;\n" - + " }\n" - + "}\n" - + "optional group array_list (LIST) {\n" - + " repeated group bag {\n" - + " optional int32 array_element;\n" - + " }\n" - + "}\n" - + "repeated int32 list_int32_field;" - + "repeated int64 list_int64_field;" - + "repeated double list_double_field;" - + "repeated float list_float_field;" - + "repeated boolean list_boolean_field;" - + "repeated fixed_len_byte_array(3) list_byte_array_field;" - + "repeated binary list_binary_field;" - + "repeated binary list_decimal_field (DECIMAL(5,2));" - + "repeated binary list_binary_field_for_repeat_test;" - + "repeated int32 list_int32_field_for_repeat_test;" - + "repeated group map_int32 (MAP_KEY_VALUE) {\n" - + " required int32 key;\n" - + " optional int32 value;\n" - + "}\n" - + "repeated group map_int64 (MAP_KEY_VALUE) {\n" - + " required int64 key;\n" - + " optional int64 value;\n" - + "}\n" - + "repeated group map_double (MAP_KEY_VALUE) {\n" - + " required double key;\n" - + " optional double value;\n" - + "}\n" - + "repeated group map_float (MAP_KEY_VALUE) {\n" - + " required float key;\n" - + " optional float value;\n" - + "}\n" - + "repeated group map_binary (MAP_KEY_VALUE) {\n" - + " required binary key;\n" - + " optional binary value;\n" - + "}\n" - + "repeated group map_decimal (MAP_KEY_VALUE) {\n" - + " required binary key (DECIMAL(5,2));\n" - + " optional binary value (DECIMAL(5,2));\n" - + "}\n" - + "repeated group map_int32_for_repeat_test (MAP_KEY_VALUE) {\n" - + " required int32 key;\n" - + " optional int32 value;\n" - + "}\n" - + "} "); + "message hive_schema { " + + "required int32 int32_field; " + + "required int64 int64_field; " + + "required int96 int96_field; " + + "required double double_field; " + + "required float float_field; " + + "required boolean boolean_field; " + + "required fixed_len_byte_array(3) flba_field; " + + "optional fixed_len_byte_array(1) some_null_field; " + + "optional fixed_len_byte_array(1) all_null_field; " + + "required binary binary_field; " + + "optional binary binary_field_some_null; " + + "required binary value (DECIMAL(5,2)); " + + "required group struct_field {" + + " required int32 a;\n" + + " required double b;\n" + + "}\n" + + "optional group nested_struct_field {" + + " optional group nsf {" + + " optional int32 c;\n" + + " optional int32 d;\n" + + " }\n" + + " optional double e;\n" + + "}\n" + + "optional group struct_field_some_null {" + + " optional int32 f;\n" + + " optional double g;\n" + + "}\n" + + "optional group map_field (MAP) {\n" + + " repeated group map (MAP_KEY_VALUE) {\n" + + " required binary key;\n" + + " optional binary value;\n" + + " }\n" + + "}\n" + + "optional group array_list (LIST) {\n" + + " repeated group bag {\n" + + " optional int32 array_element;\n" + + " }\n" + + "}\n" + + "repeated int32 list_int32_field;" + + "repeated int64 list_int64_field;" + + 
"repeated double list_double_field;" + + "repeated float list_float_field;" + + "repeated boolean list_boolean_field;" + + "repeated fixed_len_byte_array(3) list_byte_array_field;" + + "repeated binary list_binary_field;" + + "repeated binary list_decimal_field (DECIMAL(5,2));" + + "repeated binary list_binary_field_for_repeat_test;" + + "repeated int32 list_int32_field_for_repeat_test;" + + "repeated group map_int32 (MAP_KEY_VALUE) {\n" + + " required int32 key;\n" + + " optional int32 value;\n" + + "}\n" + + "repeated group map_int64 (MAP_KEY_VALUE) {\n" + + " required int64 key;\n" + + " optional int64 value;\n" + + "}\n" + + "repeated group map_double (MAP_KEY_VALUE) {\n" + + " required double key;\n" + + " optional double value;\n" + + "}\n" + + "repeated group map_float (MAP_KEY_VALUE) {\n" + + " required float key;\n" + + " optional float value;\n" + + "}\n" + + "repeated group map_binary (MAP_KEY_VALUE) {\n" + + " required binary key;\n" + + " optional binary value;\n" + + "}\n" + + "repeated group map_decimal (MAP_KEY_VALUE) {\n" + + " required binary key (DECIMAL(5,2));\n" + + " optional binary value (DECIMAL(5,2));\n" + + "}\n" + + "repeated group map_int32_for_repeat_test (MAP_KEY_VALUE) {\n" + + " required int32 key;\n" + + " optional int32 value;\n" + + "}\n" + + "} "); protected static void removeFile() throws IOException { FileSystem fs = file.getFileSystem(conf); @@ -166,73 +173,66 @@ protected static void removeFile() throws IOException { protected static ParquetWriter initWriterFromFile() throws IOException { GroupWriteSupport.setSchema(schema, conf); return new ParquetWriter<>( - file, - new GroupWriteSupport(), - GZIP, 1024 * 1024, 1024, 1024 * 1024, - true, false, PARQUET_1_0, conf); + file, + new GroupWriteSupport(), + GZIP, 1024 * 1024, 1024, 1024 * 1024, + true, false, PARQUET_1_0, conf); } protected static int getIntValue( - boolean isDictionaryEncoding, - int index) { + boolean isDictionaryEncoding, + int index) { return isDictionaryEncoding ? index % UNIQUE_NUM : index; } protected static double getDoubleValue( - boolean isDictionaryEncoding, - int index) { + boolean isDictionaryEncoding, + int index) { return isDictionaryEncoding ? index % UNIQUE_NUM : index; } protected static long getLongValue( - boolean isDictionaryEncoding, - int index) { + boolean isDictionaryEncoding, + int index) { return isDictionaryEncoding ? (long) 2 * index % UNIQUE_NUM : (long) 2 * index; } protected static float getFloatValue( - boolean isDictionaryEncoding, - int index) { + boolean isDictionaryEncoding, + int index) { return (float) (isDictionaryEncoding ? index % UNIQUE_NUM * 2.0 : index * 2.0); } protected static boolean getBooleanValue( - float index) { + float index) { return (index % 2 == 0); } - protected static String getTimestampStr(int index) { - String s = String.valueOf(index); - int l = 4 - s.length(); - for (int i = 0; i < l; i++) { - s = "0" + s; - } - return "99999999" + s; + protected static NanoTime getNanoTime(int index) { + return NanoTimeUtils.getNanoTime(new Timestamp(index), false); } protected static HiveDecimal getDecimal( - boolean isDictionaryEncoding, - int index) { + boolean isDictionaryEncoding, + int index) { int decimalVal = index % 100; - String decimalStr = (decimalVal < 10) ? "0" + String.valueOf(decimalVal) : String.valueOf - (decimalVal); + String decimalStr = (decimalVal < 10) ? "0" + String.valueOf(decimalVal) : String + .valueOf(decimalVal); int intVal = (isDictionaryEncoding) ? 
index % UNIQUE_NUM : index / 100; - String d = String.valueOf(intVal) + decimalStr; - BigInteger bi = new BigInteger(d); - BigDecimal bd = new BigDecimal(bi); - return HiveDecimal.create(bd); + String d = String.valueOf(intVal) + "." + decimalStr; + return HiveDecimal.create(d); } protected static Binary getTimestamp( - boolean isDictionaryEncoding, - int index) { - String s = isDictionaryEncoding ? getTimestampStr(index % UNIQUE_NUM) : getTimestampStr(index); - return Binary.fromReusedByteArray(s.getBytes()); + boolean isDictionaryEncoding, + int index) { + NanoTime s = isDictionaryEncoding ? getNanoTime(index % UNIQUE_NUM) : getNanoTime(index); + return s.toBinary(); } protected static String getStr( - boolean isDictionaryEncoding, - int index) { + boolean isDictionaryEncoding, + int index) { int binaryLen = isDictionaryEncoding ? index % UNIQUE_NUM : index; String v = ""; while (binaryLen > 0) { @@ -244,8 +244,8 @@ protected static String getStr( } protected static Binary getBinaryValue( - boolean isDictionaryEncoding, - int index) { + boolean isDictionaryEncoding, + int index) { return Binary.fromString(getStr(isDictionaryEncoding, index)); } @@ -254,20 +254,20 @@ protected static boolean isNull(int index) { } public static VectorizedParquetRecordReader createTestParquetReader(String schemaString, Configuration conf) - throws IOException, InterruptedException, HiveException { + throws IOException, InterruptedException, HiveException { conf.set(PARQUET_READ_SCHEMA, schemaString); HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true); HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp"); Job vectorJob = new Job(conf, "read vector"); ParquetInputFormat.setInputPaths(vectorJob, file); initialVectorizedRowBatchCtx(conf); - return new VectorizedParquetRecordReader(getFileSplit(vectorJob),new JobConf(conf)); + return new VectorizedParquetRecordReader(getFileSplit(vectorJob), new JobConf(conf)); } protected static FileSplit getFileSplit(Job vectorJob) throws IOException, InterruptedException { ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class); InputSplit split = (InputSplit) parquetInputFormat.getSplits(vectorJob).get(0); - FileSplit fsplit = new FileSplit(file,0L,split.getLength(),split.getLocations()); + FileSplit fsplit = new FileSplit(file, 0L, split.getLength(), split.getLocations()); return fsplit; } @@ -284,13 +284,13 @@ protected static void writeData(ParquetWriter writer, boolean isDictionar boolean booleanVal = getBooleanValue(i); Binary binary = getBinaryValue(isDictionaryEncoding, i); Group group = f.newGroup() - .append("int32_field", intVal) - .append("int64_field", longVal) - .append("int96_field", timeStamp) - .append("double_field", doubleVal) - .append("float_field", floatVal) - .append("boolean_field", booleanVal) - .append("flba_field", "abc"); + .append("int32_field", intVal) + .append("int64_field", longVal) + .append("int96_field", timeStamp) + .append("double_field", doubleVal) + .append("float_field", floatVal) + .append("boolean_field", booleanVal) + .append("flba_field", "abc"); if (!isNull) { group.append("some_null_field", "x"); @@ -306,8 +306,8 @@ protected static void writeData(ParquetWriter writer, boolean isDictionar group.append("value", Binary.fromConstantByteArray(w.getInternalStorage())); group.addGroup("struct_field") - .append("a", intVal) - .append("b", doubleVal); + .append("a", intVal) + .append("b", doubleVal); Group g = group.addGroup("nested_struct_field"); @@ -358,14 +358,154 @@ private 
static StructObjectInspector createStructObjectInspector(Configuration c return new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo); } - protected void intRead(boolean isDictionaryEncoding) throws InterruptedException, HiveException, IOException { - Configuration conf = new Configuration(); - conf.set(IOConstants.COLUMNS,"int32_field"); - conf.set(IOConstants.COLUMNS_TYPES,"int"); + protected void timestampRead(boolean isDictionaryEncoding) throws InterruptedException, + HiveException, IOException { + conf.set(IOConstants.COLUMNS, "int96_field"); + conf.set(IOConstants.COLUMNS_TYPES, "timestamp"); + conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + VectorizedParquetRecordReader reader = createTestParquetReader("message test { required " + + "int96 int96_field;}", conf); + VectorizedRowBatch previous = reader.createValue(); + try { + int c = 0; + while (reader.next(NullWritable.get(), previous)) { + TimestampColumnVector vector = (TimestampColumnVector) previous.cols[0]; + assertTrue(vector.noNulls); + for (int i = 0; i < vector.nanos.length; i++) { + if (c == nElements) { + break; + } + Timestamp expected = isDictionaryEncoding ? new Timestamp(c % UNIQUE_NUM) : new Timestamp(c); + assertEquals("Not the same time at " + c, expected.getTime(), vector.getTime(i)); + assertEquals("Not the same nano at " + c, expected.getNanos(), vector.getNanos(i)); + assertFalse(vector.isNull[i]); + c++; + } + } + assertEquals(nElements, c); + } finally { + reader.close(); + } + } + + protected void stringReadTimestamp(boolean isDictionaryEncoding) throws InterruptedException, + HiveException, IOException { + conf.set(IOConstants.COLUMNS, "int96_field"); + conf.set(IOConstants.COLUMNS_TYPES, "string"); + conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + VectorizedParquetRecordReader reader = createTestParquetReader("message test { required " + + "int96 int96_field;}", conf); + VectorizedRowBatch previous = reader.createValue(); + try { + int c = 0; + while (reader.next(NullWritable.get(), previous)) { + BytesColumnVector vector = (BytesColumnVector) previous.cols[0]; + assertTrue(vector.noNulls); + for (int i = 0; i < vector.vector.length; i++) { + if (c == nElements) { + break; + } + + Timestamp expected = isDictionaryEncoding ? 
new Timestamp(c % UNIQUE_NUM) : new Timestamp( + c); + String actual = new String(Arrays + .copyOfRange(vector.vector[i], vector.start[i], vector.start[i] + vector.length[i])); + assertEquals("Not the same time at " + c, expected.toString(), actual); + + assertFalse(vector.isNull[i]); + c++; + } + } + assertEquals(nElements, c); + } finally { + reader.close(); + } + } + + protected void floatReadInt(boolean isDictionaryEncoding) throws InterruptedException, + HiveException, IOException { + conf.set(IOConstants.COLUMNS, "int32_field"); + conf.set(IOConstants.COLUMNS_TYPES, "float"); + conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + VectorizedParquetRecordReader reader = createTestParquetReader("message test { required int32" + + " int32_field;}", conf); + VectorizedRowBatch previous = reader.createValue(); + try { + int c = 0; + while (reader.next(NullWritable.get(), previous)) { + DoubleColumnVector vector = (DoubleColumnVector) previous.cols[0]; + assertTrue(vector.noNulls); + for (int i = 0; i < vector.vector.length; i++) { + if (c == nElements) { + break; + } + assertEquals("Failed at " + c, getIntValue(isDictionaryEncoding, c), vector.vector[i], 0); + assertFalse(vector.isNull[i]); + c++; + } + } + assertEquals(nElements, c); + } finally { + reader.close(); + } + } + + protected void doubleReadInt(boolean isDictionaryEncoding) throws InterruptedException, + HiveException, IOException { + conf.set(IOConstants.COLUMNS, "int32_field"); + conf.set(IOConstants.COLUMNS_TYPES, "double"); conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); VectorizedParquetRecordReader reader = - createTestParquetReader("message test { required int32 int32_field;}", conf); + createTestParquetReader("message test { required int32 int32_field;}", conf); + VectorizedRowBatch previous = reader.createValue(); + try { + int c = 0; + while (reader.next(NullWritable.get(), previous)) { + DoubleColumnVector vector = (DoubleColumnVector) previous.cols[0]; + assertTrue(vector.noNulls); + for (int i = 0; i < vector.vector.length; i++) { + if (c == nElements) { + break; + } + assertEquals("Failed at " + c, getIntValue(isDictionaryEncoding, c), vector.vector[i], 0); + assertFalse(vector.isNull[i]); + c++; + } + } + assertEquals(nElements, c); + } finally { + reader.close(); + } + } + + protected void longReadInt(boolean isDictionaryEncoding) throws InterruptedException, + HiveException, IOException { + Configuration c = new Configuration(); + c.set(IOConstants.COLUMNS, "int32_field"); + c.set(IOConstants.COLUMNS_TYPES, "bigint"); + c.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + c.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + intRead(isDictionaryEncoding, c); + } + + protected void intRead(boolean isDictionaryEncoding) throws InterruptedException, + HiveException, IOException { + Configuration c = new Configuration(); + c.set(IOConstants.COLUMNS, "int32_field"); + c.set(IOConstants.COLUMNS_TYPES, "int"); + c.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + c.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + intRead(isDictionaryEncoding, c); + } + + private void intRead(boolean isDictionaryEncoding, Configuration conf) throws + InterruptedException, HiveException, IOException { + VectorizedParquetRecordReader reader = + createTestParquetReader("message test { required int32 int32_field;}", conf); VectorizedRowBatch 
previous = reader.createValue(); try { int c = 0; @@ -373,7 +513,7 @@ protected void intRead(boolean isDictionaryEncoding) throws InterruptedException LongColumnVector vector = (LongColumnVector) previous.cols[0]; assertTrue(vector.noNulls); for (int i = 0; i < vector.vector.length; i++) { - if(c == nElements){ + if (c == nElements) { break; } assertEquals("Failed at " + c, getIntValue(isDictionaryEncoding, c), vector.vector[i]); @@ -387,14 +527,78 @@ protected void intRead(boolean isDictionaryEncoding) throws InterruptedException } } + protected void floatReadLong(boolean isDictionaryEncoding) throws Exception { + Configuration c = new Configuration(); + c.set(IOConstants.COLUMNS, "int64_field"); + c.set(IOConstants.COLUMNS_TYPES, "float"); + c.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + c.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + VectorizedParquetRecordReader reader = + createTestParquetReader("message test { required int64 int64_field;}", c); + VectorizedRowBatch previous = reader.createValue(); + try { + int count = 0; + while (reader.next(NullWritable.get(), previous)) { + DoubleColumnVector vector = (DoubleColumnVector) previous.cols[0]; + assertTrue(vector.noNulls); + for (int i = 0; i < vector.vector.length; i++) { + if (count == nElements) { + break; + } + assertEquals("Failed at " + count, getLongValue(isDictionaryEncoding, count), vector + .vector[i], 0); + assertFalse(vector.isNull[i]); + count++; + } + } + assertEquals(nElements, count); + } finally { + reader.close(); + } + } + + protected void doubleReadLong(boolean isDictionaryEncoding) throws Exception { + Configuration c = new Configuration(); + c.set(IOConstants.COLUMNS, "int64_field"); + c.set(IOConstants.COLUMNS_TYPES, "double"); + c.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + c.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + VectorizedParquetRecordReader reader = + createTestParquetReader("message test { required int64 int64_field;}", c); + VectorizedRowBatch previous = reader.createValue(); + try { + int count = 0; + while (reader.next(NullWritable.get(), previous)) { + DoubleColumnVector vector = (DoubleColumnVector) previous.cols[0]; + assertTrue(vector.noNulls); + for (int i = 0; i < vector.vector.length; i++) { + if (count == nElements) { + break; + } + assertEquals("Failed at " + count, getLongValue(isDictionaryEncoding, count), + vector.vector[i], 0); + assertFalse(vector.isNull[i]); + count++; + } + } + assertEquals(nElements, count); + } finally { + reader.close(); + } + } + protected void longRead(boolean isDictionaryEncoding) throws Exception { - Configuration conf = new Configuration(); - conf.set(IOConstants.COLUMNS, "int64_field"); - conf.set(IOConstants.COLUMNS_TYPES, "bigint"); - conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); - conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + Configuration c = new Configuration(); + c.set(IOConstants.COLUMNS, "int64_field"); + c.set(IOConstants.COLUMNS_TYPES, "bigint"); + c.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + c.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + longRead(isDictionaryEncoding, c); + } + + private void longRead(boolean isDictionaryEncoding, Configuration conf) throws Exception { VectorizedParquetRecordReader reader = - createTestParquetReader("message test { required int64 int64_field;}", conf); + createTestParquetReader("message test { required int64 int64_field;}", conf); VectorizedRowBatch previous = reader.createValue(); 
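// Drain the reader batch by batch, checking each decoded value against
// getLongValue(isDictionaryEncoding, n) until nElements values have been verified.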
try { int c = 0; @@ -417,13 +621,49 @@ protected void longRead(boolean isDictionaryEncoding) throws Exception { } protected void doubleRead(boolean isDictionaryEncoding) throws Exception { - Configuration conf = new Configuration(); - conf.set(IOConstants.COLUMNS, "double_field"); - conf.set(IOConstants.COLUMNS_TYPES, "double"); - conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); - conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + Configuration c = new Configuration(); + c.set(IOConstants.COLUMNS, "double_field"); + c.set(IOConstants.COLUMNS_TYPES, "double"); + c.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + c.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + doubleRead(isDictionaryEncoding, c); + } + + protected void stringReadDouble(boolean isDictionaryEncoding) throws Exception { + Configuration readerConf = new Configuration(); + readerConf.set(IOConstants.COLUMNS, "double_field"); + readerConf.set(IOConstants.COLUMNS_TYPES, "string"); + readerConf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + readerConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + VectorizedParquetRecordReader reader = + createTestParquetReader("message test { required double double_field;}", readerConf); + VectorizedRowBatch previous = reader.createValue(); + try { + int c = 0; + while (reader.next(NullWritable.get(), previous)) { + BytesColumnVector vector = (BytesColumnVector) previous.cols[0]; + assertTrue(vector.noNulls); + for (int i = 0; i < vector.vector.length; i++) { + if (c == nElements) { + break; + } + String actual = new String(Arrays.copyOfRange(vector.vector[i], vector.start[i], vector + .start[i] + vector.length[i])); + assertEquals("Failed at " + c, String.valueOf(getDoubleValue(isDictionaryEncoding, c)), + actual); + assertFalse(vector.isNull[i]); + c++; + } + } + assertEquals(nElements, c); + } finally { + reader.close(); + } + } + + private void doubleRead(boolean isDictionaryEncoding, Configuration conf) throws Exception { VectorizedParquetRecordReader reader = - createTestParquetReader("message test { required double double_field;}", conf); + createTestParquetReader("message test { required double double_field;}", conf); VectorizedRowBatch previous = reader.createValue(); try { int c = 0; @@ -435,7 +675,7 @@ protected void doubleRead(boolean isDictionaryEncoding) throws Exception { break; } assertEquals("Failed at " + c, getDoubleValue(isDictionaryEncoding, c), vector.vector[i], - 0); + 0); assertFalse(vector.isNull[i]); c++; } @@ -447,13 +687,26 @@ protected void doubleRead(boolean isDictionaryEncoding) throws Exception { } protected void floatRead(boolean isDictionaryEncoding) throws Exception { - Configuration conf = new Configuration(); - conf.set(IOConstants.COLUMNS, "float_field"); - conf.set(IOConstants.COLUMNS_TYPES, "float"); - conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); - conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + Configuration c = new Configuration(); + c.set(IOConstants.COLUMNS, "float_field"); + c.set(IOConstants.COLUMNS_TYPES, "float"); + c.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + c.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + floatRead(isDictionaryEncoding, c); + } + + protected void doubleReadFloat(boolean isDictionaryEncoding) throws Exception { + Configuration c = new Configuration(); + c.set(IOConstants.COLUMNS, "float_field"); + c.set(IOConstants.COLUMNS_TYPES, "double"); + 
c.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + c.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + floatRead(isDictionaryEncoding, c); + } + + private void floatRead(boolean isDictionaryEncoding, Configuration conf) throws Exception { VectorizedParquetRecordReader reader = - createTestParquetReader("message test { required float float_field;}", conf); + createTestParquetReader("message test { required float float_field;}", conf); VectorizedRowBatch previous = reader.createValue(); try { int c = 0; @@ -465,7 +718,7 @@ protected void floatRead(boolean isDictionaryEncoding) throws Exception { break; } assertEquals("Failed at " + c, getFloatValue(isDictionaryEncoding, c), vector.vector[i], - 0); + 0); assertFalse(vector.isNull[i]); c++; } @@ -483,7 +736,7 @@ protected void booleanRead() throws Exception { conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); VectorizedParquetRecordReader reader = - createTestParquetReader("message test { required boolean boolean_field;}", conf); + createTestParquetReader("message test { required boolean boolean_field;}", conf); VectorizedRowBatch previous = reader.createValue(); try { int c = 0; @@ -505,6 +758,38 @@ protected void booleanRead() throws Exception { } } + protected void stringReadBoolean() throws Exception { + Configuration conf = new Configuration(); + conf.set(IOConstants.COLUMNS, "boolean_field"); + conf.set(IOConstants.COLUMNS_TYPES, "string"); + conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + VectorizedParquetRecordReader reader = + createTestParquetReader("message test { required boolean boolean_field;}", conf); + VectorizedRowBatch previous = reader.createValue(); + try { + int c = 0; + while (reader.next(NullWritable.get(), previous)) { + BytesColumnVector vector = (BytesColumnVector) previous.cols[0]; + assertTrue(vector.noNulls); + for (int i = 0; i < vector.vector.length; i++) { + if (c == nElements) { + break; + } + + String actual = new String(Arrays.copyOfRange(vector.vector[i], vector.start[i], vector + .start[i] + vector.length[i])); + assertEquals("Failed at " + c, String.valueOf(getBooleanValue(c)), actual); + assertFalse(vector.isNull[i]); + c++; + } + } + assertEquals(nElements, c); + } finally { + reader.close(); + } + } + protected void binaryRead(boolean isDictionaryEncoding) throws Exception { Configuration conf = new Configuration(); conf.set(IOConstants.COLUMNS, "binary_field_some_null"); @@ -512,7 +797,7 @@ protected void binaryRead(boolean isDictionaryEncoding) throws Exception { conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); VectorizedParquetRecordReader reader = - createTestParquetReader("message test { required binary binary_field_some_null;}", conf); + createTestParquetReader("message test { required binary binary_field_some_null;}", conf); VectorizedRowBatch previous = reader.createValue(); int c = 0; try { @@ -527,7 +812,7 @@ protected void binaryRead(boolean isDictionaryEncoding) throws Exception { assertEquals("Null assert failed at " + c, isNull(c), vector.isNull[i]); if (!vector.isNull[i]) { actual = new String(ArrayUtils - .subarray(vector.vector[i], vector.start[i], vector.start[i] + vector.length[i])); + .subarray(vector.vector[i], vector.start[i], vector.start[i] + vector.length[i])); assertEquals("failed at " + c, getStr(isDictionaryEncoding, c), 
actual); } else { noNull = false; @@ -550,11 +835,11 @@ protected void structRead(boolean isDictionaryEncoding) throws Exception { conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); String schema = "message hive_schema {\n" - + "group struct_field {\n" - + " optional int32 a;\n" - + " optional double b;\n" - + "}\n" - + "}\n"; + + "group struct_field {\n" + + " optional int32 a;\n" + + " optional double b;\n" + + "}\n" + + "}\n"; VectorizedParquetRecordReader reader = createTestParquetReader(schema, conf); VectorizedRowBatch previous = reader.createValue(); int c = 0; @@ -588,13 +873,13 @@ protected void nestedStructRead0(boolean isDictionaryEncoding) throws Exception conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); String schema = "message hive_schema {\n" - + "group nested_struct_field {\n" - + " optional group nsf {\n" - + " optional int32 c;\n" - + " optional int32 d;\n" - + " }" - + "optional double e;\n" - + "}\n"; + + "group nested_struct_field {\n" + + " optional group nsf {\n" + + " optional int32 c;\n" + + " optional int32 d;\n" + + " }" + + "optional double e;\n" + + "}\n"; VectorizedParquetRecordReader reader = createTestParquetReader(schema, conf); VectorizedRowBatch previous = reader.createValue(); int c = 0; @@ -631,11 +916,11 @@ protected void nestedStructRead1(boolean isDictionaryEncoding) throws Exception conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); String schema = "message hive_schema {\n" - + "group nested_struct_field {\n" - + " optional group nsf {\n" - + " optional int32 c;\n" - + " }" - + "}\n"; + + "group nested_struct_field {\n" + + " optional group nsf {\n" + + " optional int32 c;\n" + + " }" + + "}\n"; VectorizedParquetRecordReader reader = createTestParquetReader(schema, conf); VectorizedRowBatch previous = reader.createValue(); int c = 0; @@ -668,10 +953,10 @@ protected void structReadSomeNull(boolean isDictionaryEncoding) throws Exception conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); String schema = "message hive_schema {\n" - + "group struct_field_some_null {\n" - + " optional int32 f;\n" - + " optional double g;\n" - + "}\n"; + + "group struct_field_some_null {\n" + + " optional int32 f;\n" + + " optional double g;\n" + + "}\n"; VectorizedParquetRecordReader reader = createTestParquetReader(schema, conf); VectorizedRowBatch previous = reader.createValue(); int c = 0; @@ -706,14 +991,48 @@ protected void structReadSomeNull(boolean isDictionaryEncoding) throws Exception } } - protected void decimalRead(boolean isDictionaryEncoding) throws Exception { + protected void stringReadDecimal(boolean isDictionaryEncoding) throws Exception { Configuration conf = new Configuration(); conf.set(IOConstants.COLUMNS, "value"); - conf.set(IOConstants.COLUMNS_TYPES, "decimal(5,2)"); + conf.set(IOConstants.COLUMNS_TYPES, "string"); conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); VectorizedParquetRecordReader reader = - createTestParquetReader("message hive_schema { required value (DECIMAL(5,2));}", conf); + createTestParquetReader("message hive_schema { required value (DECIMAL(5,2));}", conf); + VectorizedRowBatch previous = reader.createValue(); + try { + int c = 0; + while 
(reader.next(NullWritable.get(), previous)) { + BytesColumnVector vector = (BytesColumnVector) previous.cols[0]; + assertTrue(vector.noNulls); + for (int i = 0; i < vector.vector.length; i++) { + if (c == nElements) { + break; + } + + String actual = new String(Arrays.copyOfRange(vector.vector[i], vector.start[i], vector + .start[i] + vector.length[i])); + assertEquals("Check failed at pos " + c, getDecimal(isDictionaryEncoding, c).toString(), + actual); + + assertFalse(vector.isNull[i]); + c++; + } + } + assertEquals(nElements, c); + } finally { + reader.close(); + } + } + + protected void decimalRead(boolean isDictionaryEncoding) throws Exception { + Configuration readerConf = new Configuration(); + readerConf.set(IOConstants.COLUMNS, "value"); + readerConf.set(IOConstants.COLUMNS_TYPES, "decimal(5,2)"); + readerConf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + readerConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + VectorizedParquetRecordReader reader = + createTestParquetReader("message hive_schema { required value (DECIMAL(5,2));}", readerConf); VectorizedRowBatch previous = reader.createValue(); try { int c = 0; @@ -725,7 +1044,8 @@ protected void decimalRead(boolean isDictionaryEncoding) throws Exception { break; } assertEquals("Check failed at pos " + c, getDecimal(isDictionaryEncoding, c), - vector.vector[i].getHiveDecimal()); + vector.vector[i].getHiveDecimal()); + assertFalse(vector.isNull[i]); c++; } diff --git ql/src/test/queries/clientpositive/schema_evol_par_vec_table_dictionary_encoding.q ql/src/test/queries/clientpositive/schema_evol_par_vec_table_dictionary_encoding.q new file mode 100644 index 0000000..6b706ab --- /dev/null +++ ql/src/test/queries/clientpositive/schema_evol_par_vec_table_dictionary_encoding.q @@ -0,0 +1,94 @@ +set hive.fetch.task.conversion=none; +set hive.vectorized.execution.enabled=true; +set parquet.enable.dictionary=true; + +drop table test_alter; +drop table test_alter2; +drop table test_alter3; + +create table test_alter (id string) stored as parquet; +insert into test_alter values ('1'), ('2'), ('3'); +select * from test_alter; + +-- add new column -> empty col values should return NULL +alter table test_alter add columns (newCol string); +select * from test_alter; + +-- insert data into new column -> New data should be returned +insert into test_alter values ('4', '100'); +select * from test_alter; + +-- remove the newly added column +-- this works in vectorized execution +alter table test_alter replace columns (id string); +select * from test_alter; + +-- add column using replace column syntax +alter table test_alter replace columns (id string, id2 string); +-- this surprisingly doesn't return the 100 added to 4th row above +select * from test_alter; +insert into test_alter values ('5', '100'); +select * from test_alter; + +-- use the same column name and datatype +alter table test_alter replace columns (id string, id2 string); +select * from test_alter; + +-- change string to char +alter table test_alter replace columns (id char(10), id2 string); +select * from test_alter; + +-- change string to varchar +alter table test_alter replace columns (id string, id2 string); +alter table test_alter replace columns (id varchar(10), id2 string); +select * from test_alter; + +-- change columntype and column name +alter table test_alter replace columns (id string, id2 string); +alter table test_alter replace columns (idv varchar(10), id2 string); +select * from test_alter; + +-- test int to long type conversion +create table 
test_alter2 (id int) stored as parquet; +insert into test_alter2 values (1); +alter table test_alter2 replace columns (id bigint); +select * from test_alter2; + +-- test float to double type conversion +drop table test_alter2; +create table test_alter2 (id float) stored as parquet; +insert into test_alter2 values (1.5); +alter table test_alter2 replace columns (id double); +select * from test_alter2; + +drop table test_alter2; +create table test_alter2 (ts timestamp) stored as parquet; +insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456'); +select * from test_alter2; +alter table test_alter2 replace columns (ts string); +select * from test_alter2; + +drop table test_alter2; +create table test_alter2 (ts timestamp) stored as parquet; +insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456'); +select * from test_alter2; +alter table test_alter2 replace columns (ts varchar(19)); +-- this should truncate the microseconds +select * from test_alter2; + +drop table test_alter2; +create table test_alter2 (ts timestamp) stored as parquet; +insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456'); +select * from test_alter2; +alter table test_alter2 replace columns (ts char(25)); +select * from test_alter2; + +-- test integer types upconversion +create table test_alter3 (id1 tinyint, id2 smallint, id3 int, id4 bigint) stored as parquet; +insert into test_alter3 values (10, 20, 30, 40); +alter table test_alter3 replace columns (id1 smallint, id2 int, id3 bigint, id4 decimal(10,4)); +-- this fails mostly due to bigint to decimal +-- select * from test_alter3; +select id1, id2, id3 from test_alter3; + + diff --git ql/src/test/queries/clientpositive/schema_evol_par_vec_table_non_dictionary_encoding.q ql/src/test/queries/clientpositive/schema_evol_par_vec_table_non_dictionary_encoding.q new file mode 100644 index 0000000..3006bd4 --- /dev/null +++ ql/src/test/queries/clientpositive/schema_evol_par_vec_table_non_dictionary_encoding.q @@ -0,0 +1,94 @@ +set hive.fetch.task.conversion=none; +set hive.vectorized.execution.enabled=true; +set parquet.enable.dictionary=false; + +drop table test_alter; +drop table test_alter2; +drop table test_alter3; + +create table test_alter (id string) stored as parquet; +insert into test_alter values ('1'), ('2'), ('3'); +select * from test_alter; + +-- add new column -> empty col values should return NULL +alter table test_alter add columns (newCol string); +select * from test_alter; + +-- insert data into new column -> New data should be returned +insert into test_alter values ('4', '100'); +select * from test_alter; + +-- remove the newly added column +-- this works in vectorized execution +alter table test_alter replace columns (id string); +select * from test_alter; + +-- add column using replace column syntax +alter table test_alter replace columns (id string, id2 string); +-- this surprisingly doesn't return the 100 added to 4th row above +select * from test_alter; +insert into test_alter values ('5', '100'); +select * from test_alter; + +-- use the same column name and datatype +alter table test_alter replace columns (id string, id2 string); +select * from test_alter; + +-- change string to char +alter table test_alter replace columns (id char(10), id2 string); +select * from test_alter; + +-- change string to varchar +alter table test_alter replace columns 
(id string, id2 string); +alter table test_alter replace columns (id varchar(10), id2 string); +select * from test_alter; + +-- change columntype and column name +alter table test_alter replace columns (id string, id2 string); +alter table test_alter replace columns (idv varchar(10), id2 string); +select * from test_alter; + +-- test int to long type conversion +create table test_alter2 (id int) stored as parquet; +insert into test_alter2 values (1); +alter table test_alter2 replace columns (id bigint); +select * from test_alter2; + +-- test float to double type conversion +drop table test_alter2; +create table test_alter2 (id float) stored as parquet; +insert into test_alter2 values (1.5); +alter table test_alter2 replace columns (id double); +select * from test_alter2; + +drop table test_alter2; +create table test_alter2 (ts timestamp) stored as parquet; +insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456'); +select * from test_alter2; +alter table test_alter2 replace columns (ts string); +select * from test_alter2; + +drop table test_alter2; +create table test_alter2 (ts timestamp) stored as parquet; +insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456'); +select * from test_alter2; +alter table test_alter2 replace columns (ts varchar(19)); +-- this should truncate the microseconds +select * from test_alter2; + +drop table test_alter2; +create table test_alter2 (ts timestamp) stored as parquet; +insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456'); +select * from test_alter2; +alter table test_alter2 replace columns (ts char(25)); +select * from test_alter2; + +-- test integer types upconversion +create table test_alter3 (id1 tinyint, id2 smallint, id3 int, id4 bigint) stored as parquet; +insert into test_alter3 values (10, 20, 30, 40); +alter table test_alter3 replace columns (id1 smallint, id2 int, id3 bigint, id4 decimal(10,4)); +-- this fails mostly due to bigint to decimal +-- select * from test_alter3; +select id1, id2, id3 from test_alter3; + + diff --git ql/src/test/results/clientpositive/schema_evol_par_vec_table.q.out ql/src/test/results/clientpositive/schema_evol_par_vec_table.q.out new file mode 100644 index 0000000..a6128b6 --- /dev/null +++ ql/src/test/results/clientpositive/schema_evol_par_vec_table.q.out @@ -0,0 +1,357 @@ +PREHOOK: query: drop table test_alter +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table test_alter +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table test_alter2 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table test_alter2 +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table test_alter3 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table test_alter3 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table test_alter (id string) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter +POSTHOOK: query: create table test_alter (id string) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter +PREHOOK: query: insert into test_alter values ('1'), ('2'), ('3') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter +POSTHOOK: query: insert into test_alter values ('1'), ('2'), ('3') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table 
+POSTHOOK: Output: default@test_alter +POSTHOOK: Lineage: test_alter.id SCRIPT [] +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 +2 +3 +PREHOOK: query: alter table test_alter add columns (newCol string) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter add columns (newCol string) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 NULL +2 NULL +3 NULL +PREHOOK: query: insert into test_alter values ('4', '100') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter +POSTHOOK: query: insert into test_alter values ('4', '100') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter +POSTHOOK: Lineage: test_alter.id SCRIPT [] +POSTHOOK: Lineage: test_alter.newcol SCRIPT [] +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 NULL +2 NULL +3 NULL +4 100 +PREHOOK: query: alter table test_alter replace columns (id string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 +2 +3 +4 +PREHOOK: query: alter table test_alter replace columns (id string, id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string, id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: insert into test_alter values ('5', '100') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter +POSTHOOK: query: insert into test_alter values ('5', '100') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter +POSTHOOK: Lineage: test_alter.id SCRIPT [] +POSTHOOK: Lineage: test_alter.id2 SCRIPT [] +PREHOOK: query: 
select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +5 100 +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: alter table test_alter replace columns (id string, id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string, id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +5 100 +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: alter table test_alter replace columns (id char(10), id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id char(10), id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +5 100 +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: alter table test_alter replace columns (id string, id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string, id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: alter table test_alter replace columns (id varchar(10), id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id varchar(10), id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +5 100 +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: alter table test_alter replace columns (id string, id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string, id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: alter table test_alter replace columns (idv varchar(10), id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (idv varchar(10), id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: 
default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +NULL 100 +NULL NULL +NULL NULL +NULL NULL +NULL NULL +PREHOOK: query: create table test_alter2 (id int) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: create table test_alter2 (id int) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: insert into test_alter2 values (1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: insert into test_alter2 values (1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter2 +POSTHOOK: Lineage: test_alter2.id SCRIPT [] +PREHOOK: query: alter table test_alter2 replace columns (id bigint) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: alter table test_alter2 replace columns (id bigint) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +1 +PREHOOK: query: drop table test_alter2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: drop table test_alter2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: create table test_alter2 (id float) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: create table test_alter2 (id float) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: insert into test_alter2 values (1.5) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: insert into test_alter2 values (1.5) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter2 +POSTHOOK: Lineage: test_alter2.id SCRIPT [] +PREHOOK: query: alter table test_alter2 replace columns (id double) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: alter table test_alter2 replace columns (id double) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +1.5 +PREHOOK: query: create table test_alter3 (id1 tinyint, id2 smallint, id3 int, id4 bigint) stored as parquet +PREHOOK: type: CREATETABLE 
+PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter3 +POSTHOOK: query: create table test_alter3 (id1 tinyint, id2 smallint, id3 int, id4 bigint) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter3 +PREHOOK: query: insert into test_alter3 values (10, 20, 30, 40) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter3 +POSTHOOK: query: insert into test_alter3 values (10, 20, 30, 40) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter3 +POSTHOOK: Lineage: test_alter3.id1 SCRIPT [] +POSTHOOK: Lineage: test_alter3.id2 SCRIPT [] +POSTHOOK: Lineage: test_alter3.id3 SCRIPT [] +POSTHOOK: Lineage: test_alter3.id4 SCRIPT [] +PREHOOK: query: alter table test_alter3 replace columns (id1 smallint, id2 int, id3 bigint, id4 decimal(10,4)) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter3 +PREHOOK: Output: default@test_alter3 +POSTHOOK: query: alter table test_alter3 replace columns (id1 smallint, id2 int, id3 bigint, id4 decimal(10,4)) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter3 +POSTHOOK: Output: default@test_alter3 +PREHOOK: query: select id1, id2, id3 from test_alter3 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter3 +#### A masked pattern was here #### +POSTHOOK: query: select id1, id2, id3 from test_alter3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter3 +#### A masked pattern was here #### +10 20 30 diff --git ql/src/test/results/clientpositive/schema_evol_par_vec_table_dictionary_encoding.q.out ql/src/test/results/clientpositive/schema_evol_par_vec_table_dictionary_encoding.q.out new file mode 100644 index 0000000..1d2f36d --- /dev/null +++ ql/src/test/results/clientpositive/schema_evol_par_vec_table_dictionary_encoding.q.out @@ -0,0 +1,522 @@ +PREHOOK: query: drop table test_alter +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table test_alter +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table test_alter2 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table test_alter2 +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table test_alter3 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table test_alter3 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table test_alter (id string) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter +POSTHOOK: query: create table test_alter (id string) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter +PREHOOK: query: insert into test_alter values ('1'), ('2'), ('3') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter +POSTHOOK: query: insert into test_alter values ('1'), ('2'), ('3') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter +POSTHOOK: Lineage: test_alter.id SCRIPT [] +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 +2 +3 +PREHOOK: query: alter table test_alter add columns (newCol string) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: 
query: alter table test_alter add columns (newCol string) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 NULL +2 NULL +3 NULL +PREHOOK: query: insert into test_alter values ('4', '100') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter +POSTHOOK: query: insert into test_alter values ('4', '100') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter +POSTHOOK: Lineage: test_alter.id SCRIPT [] +POSTHOOK: Lineage: test_alter.newcol SCRIPT [] +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 NULL +2 NULL +3 NULL +4 100 +PREHOOK: query: alter table test_alter replace columns (id string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 +2 +3 +4 +PREHOOK: query: alter table test_alter replace columns (id string, id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string, id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: insert into test_alter values ('5', '100') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter +POSTHOOK: query: insert into test_alter values ('5', '100') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter +POSTHOOK: Lineage: test_alter.id SCRIPT [] +POSTHOOK: Lineage: test_alter.id2 SCRIPT [] +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +5 100 +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: alter table test_alter replace columns (id string, id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string, 
id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +5 100 +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: alter table test_alter replace columns (id char(10), id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id char(10), id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +5 100 +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: alter table test_alter replace columns (id string, id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string, id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: alter table test_alter replace columns (id varchar(10), id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id varchar(10), id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +5 100 +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: alter table test_alter replace columns (id string, id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string, id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: alter table test_alter replace columns (idv varchar(10), id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (idv varchar(10), id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +NULL 100 +NULL NULL +NULL NULL +NULL NULL +NULL NULL +PREHOOK: query: create table test_alter2 (id int) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter2 +POSTHOOK: 
query: create table test_alter2 (id int) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: insert into test_alter2 values (1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: insert into test_alter2 values (1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter2 +POSTHOOK: Lineage: test_alter2.id SCRIPT [] +PREHOOK: query: alter table test_alter2 replace columns (id bigint) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: alter table test_alter2 replace columns (id bigint) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +1 +PREHOOK: query: drop table test_alter2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: drop table test_alter2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: create table test_alter2 (id float) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: create table test_alter2 (id float) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: insert into test_alter2 values (1.5) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: insert into test_alter2 values (1.5) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter2 +POSTHOOK: Lineage: test_alter2.id SCRIPT [] +PREHOOK: query: alter table test_alter2 replace columns (id double) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: alter table test_alter2 replace columns (id double) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +1.5 +PREHOOK: query: drop table test_alter2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: drop table test_alter2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: create table test_alter2 (ts timestamp) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: create table test_alter2 (ts timestamp) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: insert into test_alter2 values ('2018-01-01 
13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter2 +POSTHOOK: Lineage: test_alter2.ts SCRIPT [] +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +2018-01-01 13:14:15.123456 +2018-01-02 14:15:16.123456 +2018-01-03 16:17:18.123456 +PREHOOK: query: alter table test_alter2 replace columns (ts string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: alter table test_alter2 replace columns (ts string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +2018-01-01 13:14:15.123456 +2018-01-02 14:15:16.123456 +2018-01-03 16:17:18.123456 +PREHOOK: query: drop table test_alter2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: drop table test_alter2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: create table test_alter2 (ts timestamp) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: create table test_alter2 (ts timestamp) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter2 +POSTHOOK: Lineage: test_alter2.ts SCRIPT [] +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +2018-01-01 13:14:15.123456 +2018-01-02 14:15:16.123456 +2018-01-03 16:17:18.123456 +PREHOOK: query: alter table test_alter2 replace columns (ts varchar(19)) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: alter table test_alter2 replace columns (ts varchar(19)) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: select * from 
test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +2018-01-01 13:14:15 +2018-01-02 14:15:16 +2018-01-03 16:17:18 +PREHOOK: query: drop table test_alter2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: drop table test_alter2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: create table test_alter2 (ts timestamp) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: create table test_alter2 (ts timestamp) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter2 +POSTHOOK: Lineage: test_alter2.ts SCRIPT [] +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +2018-01-01 13:14:15.123456 +2018-01-02 14:15:16.123456 +2018-01-03 16:17:18.123456 +PREHOOK: query: alter table test_alter2 replace columns (ts char(25)) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: alter table test_alter2 replace columns (ts char(25)) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +2018-01-01 13:14:15.12345 +2018-01-02 14:15:16.12345 +2018-01-03 16:17:18.12345 +PREHOOK: query: create table test_alter3 (id1 tinyint, id2 smallint, id3 int, id4 bigint) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter3 +POSTHOOK: query: create table test_alter3 (id1 tinyint, id2 smallint, id3 int, id4 bigint) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter3 +PREHOOK: query: insert into test_alter3 values (10, 20, 30, 40) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter3 +POSTHOOK: query: insert into test_alter3 values (10, 20, 30, 40) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter3 +POSTHOOK: Lineage: test_alter3.id1 SCRIPT [] +POSTHOOK: Lineage: test_alter3.id2 SCRIPT [] +POSTHOOK: Lineage: test_alter3.id3 SCRIPT [] +POSTHOOK: Lineage: test_alter3.id4 SCRIPT 
[] +PREHOOK: query: alter table test_alter3 replace columns (id1 smallint, id2 int, id3 bigint, id4 decimal(10,4)) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter3 +PREHOOK: Output: default@test_alter3 +POSTHOOK: query: alter table test_alter3 replace columns (id1 smallint, id2 int, id3 bigint, id4 decimal(10,4)) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter3 +POSTHOOK: Output: default@test_alter3 +PREHOOK: query: select id1, id2, id3 from test_alter3 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter3 +#### A masked pattern was here #### +POSTHOOK: query: select id1, id2, id3 from test_alter3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter3 +#### A masked pattern was here #### +10 20 30 diff --git ql/src/test/results/clientpositive/schema_evol_par_vec_table_non_dictionary_encoding.q.out ql/src/test/results/clientpositive/schema_evol_par_vec_table_non_dictionary_encoding.q.out new file mode 100644 index 0000000..1d2f36d --- /dev/null +++ ql/src/test/results/clientpositive/schema_evol_par_vec_table_non_dictionary_encoding.q.out @@ -0,0 +1,522 @@ +PREHOOK: query: drop table test_alter +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table test_alter +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table test_alter2 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table test_alter2 +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table test_alter3 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table test_alter3 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table test_alter (id string) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter +POSTHOOK: query: create table test_alter (id string) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter +PREHOOK: query: insert into test_alter values ('1'), ('2'), ('3') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter +POSTHOOK: query: insert into test_alter values ('1'), ('2'), ('3') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter +POSTHOOK: Lineage: test_alter.id SCRIPT [] +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 +2 +3 +PREHOOK: query: alter table test_alter add columns (newCol string) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter add columns (newCol string) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 NULL +2 NULL +3 NULL +PREHOOK: query: insert into test_alter values ('4', '100') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter +POSTHOOK: query: insert into test_alter values ('4', '100') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter 
+POSTHOOK: Lineage: test_alter.id SCRIPT [] +POSTHOOK: Lineage: test_alter.newcol SCRIPT [] +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 NULL +2 NULL +3 NULL +4 100 +PREHOOK: query: alter table test_alter replace columns (id string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 +2 +3 +4 +PREHOOK: query: alter table test_alter replace columns (id string, id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string, id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: insert into test_alter values ('5', '100') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter +POSTHOOK: query: insert into test_alter values ('5', '100') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter +POSTHOOK: Lineage: test_alter.id SCRIPT [] +POSTHOOK: Lineage: test_alter.id2 SCRIPT [] +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +5 100 +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: alter table test_alter replace columns (id string, id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string, id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +5 100 +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: alter table test_alter replace columns (id char(10), id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id char(10), id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter 
+POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +5 100 +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: alter table test_alter replace columns (id string, id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string, id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: alter table test_alter replace columns (id varchar(10), id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id varchar(10), id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +5 100 +1 NULL +2 NULL +3 NULL +4 NULL +PREHOOK: query: alter table test_alter replace columns (id string, id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (id string, id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: alter table test_alter replace columns (idv varchar(10), id2 string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter +PREHOOK: Output: default@test_alter +POSTHOOK: query: alter table test_alter replace columns (idv varchar(10), id2 string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter +POSTHOOK: Output: default@test_alter +PREHOOK: query: select * from test_alter +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter +#### A masked pattern was here #### +NULL 100 +NULL NULL +NULL NULL +NULL NULL +NULL NULL +PREHOOK: query: create table test_alter2 (id int) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: create table test_alter2 (id int) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: insert into test_alter2 values (1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: insert into test_alter2 values (1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter2 +POSTHOOK: Lineage: test_alter2.id SCRIPT [] +PREHOOK: query: alter table test_alter2 replace columns (id bigint) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: alter table test_alter2 replace columns (id bigint) +POSTHOOK: type: 
ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +1 +PREHOOK: query: drop table test_alter2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: drop table test_alter2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: create table test_alter2 (id float) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: create table test_alter2 (id float) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: insert into test_alter2 values (1.5) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: insert into test_alter2 values (1.5) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter2 +POSTHOOK: Lineage: test_alter2.id SCRIPT [] +PREHOOK: query: alter table test_alter2 replace columns (id double) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: alter table test_alter2 replace columns (id double) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +1.5 +PREHOOK: query: drop table test_alter2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: drop table test_alter2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: create table test_alter2 (ts timestamp) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: create table test_alter2 (ts timestamp) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter2 +POSTHOOK: Lineage: test_alter2.ts SCRIPT [] +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +2018-01-01 13:14:15.123456 +2018-01-02 
14:15:16.123456 +2018-01-03 16:17:18.123456 +PREHOOK: query: alter table test_alter2 replace columns (ts string) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: alter table test_alter2 replace columns (ts string) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +2018-01-01 13:14:15.123456 +2018-01-02 14:15:16.123456 +2018-01-03 16:17:18.123456 +PREHOOK: query: drop table test_alter2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: drop table test_alter2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: create table test_alter2 (ts timestamp) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: create table test_alter2 (ts timestamp) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter2 +POSTHOOK: Lineage: test_alter2.ts SCRIPT [] +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +2018-01-01 13:14:15.123456 +2018-01-02 14:15:16.123456 +2018-01-03 16:17:18.123456 +PREHOOK: query: alter table test_alter2 replace columns (ts varchar(19)) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: alter table test_alter2 replace columns (ts varchar(19)) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +2018-01-01 13:14:15 +2018-01-02 14:15:16 +2018-01-03 16:17:18 +PREHOOK: query: drop table test_alter2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: drop table test_alter2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: create table test_alter2 (ts timestamp) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: create table 
test_alter2 (ts timestamp) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: insert into test_alter2 values ('2018-01-01 13:14:15.123456'), ('2018-01-02 14:15:16.123456'), ('2018-01-03 16:17:18.123456') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter2 +POSTHOOK: Lineage: test_alter2.ts SCRIPT [] +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +2018-01-01 13:14:15.123456 +2018-01-02 14:15:16.123456 +2018-01-03 16:17:18.123456 +PREHOOK: query: alter table test_alter2 replace columns (ts char(25)) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter2 +PREHOOK: Output: default@test_alter2 +POSTHOOK: query: alter table test_alter2 replace columns (ts char(25)) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter2 +POSTHOOK: Output: default@test_alter2 +PREHOOK: query: select * from test_alter2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_alter2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter2 +#### A masked pattern was here #### +2018-01-01 13:14:15.12345 +2018-01-02 14:15:16.12345 +2018-01-03 16:17:18.12345 +PREHOOK: query: create table test_alter3 (id1 tinyint, id2 smallint, id3 int, id4 bigint) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_alter3 +POSTHOOK: query: create table test_alter3 (id1 tinyint, id2 smallint, id3 int, id4 bigint) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_alter3 +PREHOOK: query: insert into test_alter3 values (10, 20, 30, 40) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_alter3 +POSTHOOK: query: insert into test_alter3 values (10, 20, 30, 40) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_alter3 +POSTHOOK: Lineage: test_alter3.id1 SCRIPT [] +POSTHOOK: Lineage: test_alter3.id2 SCRIPT [] +POSTHOOK: Lineage: test_alter3.id3 SCRIPT [] +POSTHOOK: Lineage: test_alter3.id4 SCRIPT [] +PREHOOK: query: alter table test_alter3 replace columns (id1 smallint, id2 int, id3 bigint, id4 decimal(10,4)) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@test_alter3 +PREHOOK: Output: default@test_alter3 +POSTHOOK: query: alter table test_alter3 replace columns (id1 smallint, id2 int, id3 bigint, id4 decimal(10,4)) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@test_alter3 +POSTHOOK: Output: default@test_alter3 +PREHOOK: query: select id1, id2, id3 from test_alter3 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_alter3 +#### A masked pattern was here #### +POSTHOOK: query: select id1, id2, id3 from test_alter3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_alter3 +#### A masked pattern was here #### +10 20 30
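
For readers who want the idea behind these golden files in code form: the test_alter2 int-to-bigint case relies on the reader widening the 32-bit values stored in the Parquet file to 64 bits when the table schema now says bigint. The following is a minimal, self-contained Java sketch of that widening step only; the WidenedIntReader class and its readLong method are hypothetical names for illustration and are not Hive's or Parquet's actual reader API.

// Hypothetical sketch: widening stored int32 values to fill a bigint column,
// as exercised by the "alter table test_alter2 replace columns (id bigint)" test.
import java.util.Arrays;

public class WidenedIntReader {
  // Pretend page of int32 values as written under the original "id int" schema.
  private final int[] page;
  private int next = 0;

  public WidenedIntReader(int[] page) {
    this.page = page;
  }

  // Reads the next stored int and widens it to long so it can fill a bigint vector.
  public long readLong() {
    return page[next++];
  }

  public static void main(String[] args) {
    WidenedIntReader reader = new WidenedIntReader(new int[] {1});
    long[] bigintVector = new long[1];
    for (int i = 0; i < bigintVector.length; i++) {
      bigintVector[i] = reader.readLong();   // 1 (int) is returned as 1L (bigint)
    }
    System.out.println(Arrays.toString(bigintVector)); // prints [1], matching the expected output above
  }
}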