diff --git ql/src/gen/protobuf/gen-java/org/apache/hadoop/hive/ql/io/orc/OrcProto.java ql/src/gen/protobuf/gen-java/org/apache/hadoop/hive/ql/io/orc/OrcProto.java index 55ac8c6..c4ff684 100644 --- ql/src/gen/protobuf/gen-java/org/apache/hadoop/hive/ql/io/orc/OrcProto.java +++ ql/src/gen/protobuf/gen-java/org/apache/hadoop/hive/ql/io/orc/OrcProto.java @@ -7003,6 +7003,10 @@ public Builder removeColumns(int index) { java.util.List getFieldNamesList(); int getFieldNamesCount(); String getFieldNames(int index); + + // optional uint32 maximumLength = 4; + boolean hasMaximumLength(); + int getMaximumLength(); } public static final class Type extends com.google.protobuf.GeneratedMessage @@ -7050,6 +7054,7 @@ public Type getDefaultInstanceForType() { UNION(13, 13), DECIMAL(14, 14), DATE(15, 15), + VARCHAR(16, 16), ; public static final int BOOLEAN_VALUE = 0; @@ -7068,6 +7073,7 @@ public Type getDefaultInstanceForType() { public static final int UNION_VALUE = 13; public static final int DECIMAL_VALUE = 14; public static final int DATE_VALUE = 15; + public static final int VARCHAR_VALUE = 16; public final int getNumber() { return value; } @@ -7090,6 +7096,7 @@ public static Kind valueOf(int value) { case 13: return UNION; case 14: return DECIMAL; case 15: return DATE; + case 16: return VARCHAR; default: return null; } } @@ -7120,7 +7127,7 @@ public Kind findValueByNumber(int number) { } private static final Kind[] VALUES = { - BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, TIMESTAMP, LIST, MAP, STRUCT, UNION, DECIMAL, DATE, + BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, TIMESTAMP, LIST, MAP, STRUCT, UNION, DECIMAL, DATE, VARCHAR, }; public static Kind valueOf( @@ -7183,10 +7190,21 @@ public String getFieldNames(int index) { return fieldNames_.get(index); } + // optional uint32 maximumLength = 4; + public static final int MAXIMUMLENGTH_FIELD_NUMBER = 4; + private int maximumLength_; + public boolean hasMaximumLength() { + return 
((bitField0_ & 0x00000002) == 0x00000002); + } + public int getMaximumLength() { + return maximumLength_; + } + private void initFields() { kind_ = org.apache.hadoop.hive.ql.io.orc.OrcProto.Type.Kind.BOOLEAN; subtypes_ = java.util.Collections.emptyList();; fieldNames_ = com.google.protobuf.LazyStringArrayList.EMPTY; + maximumLength_ = 0; } private byte memoizedIsInitialized = -1; public final boolean isInitialized() { @@ -7217,6 +7235,9 @@ public void writeTo(com.google.protobuf.CodedOutputStream output) for (int i = 0; i < fieldNames_.size(); i++) { output.writeBytes(3, fieldNames_.getByteString(i)); } + if (((bitField0_ & 0x00000002) == 0x00000002)) { + output.writeUInt32(4, maximumLength_); + } getUnknownFields().writeTo(output); } @@ -7253,6 +7274,10 @@ public int getSerializedSize() { size += dataSize; size += 1 * getFieldNamesList().size(); } + if (((bitField0_ & 0x00000002) == 0x00000002)) { + size += com.google.protobuf.CodedOutputStream + .computeUInt32Size(4, maximumLength_); + } size += getUnknownFields().getSerializedSize(); memoizedSerializedSize = size; return size; @@ -7383,6 +7408,8 @@ public Builder clear() { bitField0_ = (bitField0_ & ~0x00000002); fieldNames_ = com.google.protobuf.LazyStringArrayList.EMPTY; bitField0_ = (bitField0_ & ~0x00000004); + maximumLength_ = 0; + bitField0_ = (bitField0_ & ~0x00000008); return this; } @@ -7436,6 +7463,10 @@ public Builder clone() { bitField0_ = (bitField0_ & ~0x00000004); } result.fieldNames_ = fieldNames_; + if (((from_bitField0_ & 0x00000008) == 0x00000008)) { + to_bitField0_ |= 0x00000002; + } + result.maximumLength_ = maximumLength_; result.bitField0_ = to_bitField0_; onBuilt(); return result; @@ -7475,6 +7506,9 @@ public Builder mergeFrom(org.apache.hadoop.hive.ql.io.orc.OrcProto.Type other) { } onChanged(); } + if (other.hasMaximumLength()) { + setMaximumLength(other.getMaximumLength()); + } this.mergeUnknownFields(other.getUnknownFields()); return this; } @@ -7540,6 +7574,11 @@ public Builder 
mergeFrom( fieldNames_.add(input.readBytes()); break; } + case 32: { + bitField0_ |= 0x00000008; + maximumLength_ = input.readUInt32(); + break; + } } } } @@ -7671,6 +7710,27 @@ void addFieldNames(com.google.protobuf.ByteString value) { onChanged(); } + // optional uint32 maximumLength = 4; + private int maximumLength_ ; + public boolean hasMaximumLength() { + return ((bitField0_ & 0x00000008) == 0x00000008); + } + public int getMaximumLength() { + return maximumLength_; + } + public Builder setMaximumLength(int value) { + bitField0_ |= 0x00000008; + maximumLength_ = value; + onChanged(); + return this; + } + public Builder clearMaximumLength() { + bitField0_ = (bitField0_ & ~0x00000008); + maximumLength_ = 0; + onChanged(); + return this; + } + // @@protoc_insertion_point(builder_scope:org.apache.hadoop.hive.ql.io.orc.Type) } @@ -11132,33 +11192,34 @@ void setMagic(com.google.protobuf.ByteString value) { "treams\030\001 \003(\0132(.org.apache.hadoop.hive.ql" + ".io.orc.Stream\022A\n\007columns\030\002 \003(\01320.org.ap", "ache.hadoop.hive.ql.io.orc.ColumnEncodin" + - "g\"\250\002\n\004Type\0229\n\004kind\030\001 \002(\0162+.org.apache.ha" + + "g\"\314\002\n\004Type\0229\n\004kind\030\001 \002(\0162+.org.apache.ha" + "doop.hive.ql.io.orc.Type.Kind\022\024\n\010subtype" + - "s\030\002 \003(\rB\002\020\001\022\022\n\nfieldNames\030\003 \003(\t\"\272\001\n\004Kind" + - "\022\013\n\007BOOLEAN\020\000\022\010\n\004BYTE\020\001\022\t\n\005SHORT\020\002\022\007\n\003IN" + - "T\020\003\022\010\n\004LONG\020\004\022\t\n\005FLOAT\020\005\022\n\n\006DOUBLE\020\006\022\n\n\006" + - "STRING\020\007\022\n\n\006BINARY\020\010\022\r\n\tTIMESTAMP\020\t\022\010\n\004L" + - "IST\020\n\022\007\n\003MAP\020\013\022\n\n\006STRUCT\020\014\022\t\n\005UNION\020\r\022\013\n" + - "\007DECIMAL\020\016\022\010\n\004DATE\020\017\"x\n\021StripeInformatio" + - "n\022\016\n\006offset\030\001 \001(\004\022\023\n\013indexLength\030\002 \001(\004\022\022", - "\n\ndataLength\030\003 
\001(\004\022\024\n\014footerLength\030\004 \001(\004" + - "\022\024\n\014numberOfRows\030\005 \001(\004\"/\n\020UserMetadataIt" + - "em\022\014\n\004name\030\001 \002(\t\022\r\n\005value\030\002 \002(\014\"\356\002\n\006Foot" + - "er\022\024\n\014headerLength\030\001 \001(\004\022\025\n\rcontentLengt" + - "h\030\002 \001(\004\022D\n\007stripes\030\003 \003(\01323.org.apache.ha" + - "doop.hive.ql.io.orc.StripeInformation\0225\n" + - "\005types\030\004 \003(\0132&.org.apache.hadoop.hive.ql" + - ".io.orc.Type\022D\n\010metadata\030\005 \003(\01322.org.apa" + - "che.hadoop.hive.ql.io.orc.UserMetadataIt" + - "em\022\024\n\014numberOfRows\030\006 \001(\004\022F\n\nstatistics\030\007", - " \003(\01322.org.apache.hadoop.hive.ql.io.orc." + - "ColumnStatistics\022\026\n\016rowIndexStride\030\010 \001(\r" + - "\"\255\001\n\nPostScript\022\024\n\014footerLength\030\001 \001(\004\022F\n" + - "\013compression\030\002 \001(\01621.org.apache.hadoop.h" + - "ive.ql.io.orc.CompressionKind\022\034\n\024compres" + - "sionBlockSize\030\003 \001(\004\022\023\n\007version\030\004 \003(\rB\002\020\001" + - "\022\016\n\005magic\030\300> \001(\t*:\n\017CompressionKind\022\010\n\004N" + - "ONE\020\000\022\010\n\004ZLIB\020\001\022\n\n\006SNAPPY\020\002\022\007\n\003LZO\020\003" + "s\030\002 \003(\rB\002\020\001\022\022\n\nfieldNames\030\003 \003(\t\022\025\n\rmaxim" + + "umLength\030\004 \001(\r\"\307\001\n\004Kind\022\013\n\007BOOLEAN\020\000\022\010\n\004" + + "BYTE\020\001\022\t\n\005SHORT\020\002\022\007\n\003INT\020\003\022\010\n\004LONG\020\004\022\t\n\005" + + "FLOAT\020\005\022\n\n\006DOUBLE\020\006\022\n\n\006STRING\020\007\022\n\n\006BINAR" + + "Y\020\010\022\r\n\tTIMESTAMP\020\t\022\010\n\004LIST\020\n\022\007\n\003MAP\020\013\022\n\n" + + "\006STRUCT\020\014\022\t\n\005UNION\020\r\022\013\n\007DECIMAL\020\016\022\010\n\004DAT" + + "E\020\017\022\013\n\007VARCHAR\020\020\"x\n\021StripeInformation\022\016\n", + "\006offset\030\001 \001(\004\022\023\n\013indexLength\030\002 
\001(\004\022\022\n\nda" + + "taLength\030\003 \001(\004\022\024\n\014footerLength\030\004 \001(\004\022\024\n\014" + + "numberOfRows\030\005 \001(\004\"/\n\020UserMetadataItem\022\014" + + "\n\004name\030\001 \002(\t\022\r\n\005value\030\002 \002(\014\"\356\002\n\006Footer\022\024" + + "\n\014headerLength\030\001 \001(\004\022\025\n\rcontentLength\030\002 " + + "\001(\004\022D\n\007stripes\030\003 \003(\01323.org.apache.hadoop" + + ".hive.ql.io.orc.StripeInformation\0225\n\005typ" + + "es\030\004 \003(\0132&.org.apache.hadoop.hive.ql.io." + + "orc.Type\022D\n\010metadata\030\005 \003(\01322.org.apache." + + "hadoop.hive.ql.io.orc.UserMetadataItem\022\024", + "\n\014numberOfRows\030\006 \001(\004\022F\n\nstatistics\030\007 \003(\013" + + "22.org.apache.hadoop.hive.ql.io.orc.Colu" + + "mnStatistics\022\026\n\016rowIndexStride\030\010 \001(\r\"\255\001\n" + + "\nPostScript\022\024\n\014footerLength\030\001 \001(\004\022F\n\013com" + + "pression\030\002 \001(\01621.org.apache.hadoop.hive." + + "ql.io.orc.CompressionKind\022\034\n\024compression" + + "BlockSize\030\003 \001(\004\022\023\n\007version\030\004 \003(\rB\002\020\001\022\016\n\005" + + "magic\030\300> \001(\t*:\n\017CompressionKind\022\010\n\004NONE\020" + + "\000\022\010\n\004ZLIB\020\001\022\n\n\006SNAPPY\020\002\022\007\n\003LZO\020\003" }; com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() { @@ -11266,7 +11327,7 @@ void setMagic(com.google.protobuf.ByteString value) { internal_static_org_apache_hadoop_hive_ql_io_orc_Type_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable( internal_static_org_apache_hadoop_hive_ql_io_orc_Type_descriptor, - new java.lang.String[] { "Kind", "Subtypes", "FieldNames", }, + new java.lang.String[] { "Kind", "Subtypes", "FieldNames", "MaximumLength", }, org.apache.hadoop.hive.ql.io.orc.OrcProto.Type.class, 
org.apache.hadoop.hive.ql.io.orc.OrcProto.Type.Builder.class); internal_static_org_apache_hadoop_hive_ql_io_orc_StripeInformation_descriptor = diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java index 7e976f5..6268617 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java @@ -714,6 +714,7 @@ static ColumnStatisticsImpl create(ObjectInspector inspector) { case DOUBLE: return new DoubleStatisticsImpl(); case STRING: + case VARCHAR: return new StringStatisticsImpl(); case DECIMAL: return new DecimalStatisticsImpl(); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java index 2b1f0d8..4fe3798 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java @@ -33,13 +33,17 @@ import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ParameterizedPrimitiveTypeUtils; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; +import 
org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeParams; import org.apache.hadoop.io.Writable; final class OrcStruct implements Writable { @@ -473,6 +477,15 @@ static ObjectInspector createObjectInspector(TypeInfo info) { return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector; case STRING: return PrimitiveObjectInspectorFactory.writableStringObjectInspector; + case VARCHAR: + // For varchar we need to retrieve the string length from the TypeInfo. + VarcharTypeParams varcharParams = (VarcharTypeParams) + ParameterizedPrimitiveTypeUtils.getTypeParamsFromTypeInfo(info); + if (varcharParams == null) { + throw new IllegalArgumentException("varchar type used without type params"); + } + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( + (PrimitiveTypeInfo) info); case TIMESTAMP: return PrimitiveObjectInspectorFactory.javaTimestampObjectInspector; case DATE: @@ -519,6 +532,16 @@ static ObjectInspector createObjectInspector(int columnId, return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector; case STRING: return PrimitiveObjectInspectorFactory.writableStringObjectInspector; + case VARCHAR: + if (!type.hasMaximumLength()) { + throw new UnsupportedOperationException( + "Illegal use of varchar type without length in ORC type definition."); + } + VarcharTypeParams varcharParams = new VarcharTypeParams(); + varcharParams.setLength(type.getMaximumLength()); + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( + PrimitiveObjectInspectorUtils.getTypeEntryFromTypeSpecs( + PrimitiveCategory.VARCHAR, varcharParams)); case TIMESTAMP: return PrimitiveObjectInspectorFactory.javaTimestampObjectInspector; case DATE: diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java index 15c0330..81c914c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java +++ 
ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java @@ -42,6 +42,7 @@ import org.apache.hadoop.hive.serde2.io.ByteWritable; import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; import org.apache.hadoop.hive.serde2.io.ShortWritable; import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.BytesWritable; @@ -1075,6 +1076,34 @@ void skipRows(long items) throws IOException { } } + /** Tree reader for VARCHAR columns: decodes through StringTreeReader, then calls enforceMaxLength(maxLength) on the HiveVarcharWritable it hands back. */ private static class VarcharTreeReader extends StringTreeReader { + /* limit passed to enforceMaxLength() for every non-null value returned by next() */ int maxLength; + /** Forwards path and columnId to the StringTreeReader constructor and stores maxLength for use in next(). */ + VarcharTreeReader(Path path, int columnId, int maxLength) { + super(path, columnId); + this.maxLength = maxLength; + } + /** Fills and returns previous (cast to HiveVarcharWritable) when it is non-null, otherwise a newly allocated HiveVarcharWritable; returns null when super.next() returns null. */ + @Override + Object next(Object previous) throws IOException { + HiveVarcharWritable result = null; + if (previous == null) { + result = new HiveVarcharWritable(); + } else { + result = (HiveVarcharWritable) previous; + } + // Use the string reader implementation to populate the internal Text value + Object textVal = super.next(result.getTextValue()); + if (textVal == null) { + return null; + } + // result should now hold the value that was read in.
+ // enforce varchar length + result.enforceMaxLength(maxLength); + return result; + } + } + private static class StructTreeReader extends TreeReader { private final TreeReader[] fields; private final String[] fieldNames; @@ -1426,6 +1455,11 @@ private static TreeReader createTreeReader(Path path, return new LongTreeReader(path, columnId); case STRING: return new StringTreeReader(path, columnId); + case VARCHAR: + if (!type.hasMaximumLength()) { + throw new IllegalArgumentException("ORC varchar type has no length specified"); + } + return new VarcharTreeReader(path, columnId, type.getMaximumLength()); case BINARY: return new BinaryTreeReader(path, columnId); case TIMESTAMP: diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java index 6634932..44961ce 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java @@ -39,6 +39,7 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry; +import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type; import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; @@ -54,11 +55,14 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; import
org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.ParameterizedPrimitiveTypeUtils; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeParams; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; @@ -876,12 +880,21 @@ void recordPosition(PositionRecorder recorder) throws IOException { defaultFloatVal); } + /** + * Extracts the String to be written from a row value by casting the writer's inspector + * to StringObjectInspector; subclasses may override to use a different inspector type. + * @param obj row value; write() only calls this with a non-null obj + * @return result of getPrimitiveJavaObject(obj) on the inspector + */ + String getStringValue(Object obj) { + return ((StringObjectInspector) inspector).getPrimitiveJavaObject(obj); + } + @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { - String val = ((StringObjectInspector) inspector) - .getPrimitiveJavaObject(obj); + String val = getStringValue(obj); rows.add(dictionary.add(val)); indexStatistics.updateString(val); } @@ -1014,6 +1027,28 @@ long estimateMemory() { } } + /** + * Tree writer for varchar columns. Under the covers, varchar is written to ORC the same way as string; only getStringValue() is overridden. + */ + private static class VarcharTreeWriter extends StringTreeWriter { + /** Passes all four arguments through to the StringTreeWriter constructor unchanged. */ + VarcharTreeWriter(int columnId, + ObjectInspector inspector, + StreamFactory writer, + boolean nullable) throws IOException { + super(columnId, inspector, writer, nullable); + } + + /** + * Override base class implementation to support varchar values: the inspector is cast to HiveVarcharObjectInspector and getValue() is called on the object it returns.
+ */ + @Override + String getStringValue(Object obj) { + return (((HiveVarcharObjectInspector) inspector) + .getPrimitiveJavaObject(obj)).getValue(); + } + } + private static class BinaryTreeWriter extends TreeWriter { private final PositionedOutputStream stream; private final IntegerWriter length; @@ -1500,6 +1535,9 @@ private static TreeWriter createTreeWriter(ObjectInspector inspector, case STRING: return new StringTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); + case VARCHAR: + return new VarcharTreeWriter(streamFactory.getNextColumnId(), + inspector, streamFactory, nullable); case BINARY: return new BinaryTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); @@ -1565,6 +1603,18 @@ private static void writeTypes(OrcProto.Footer.Builder builder, case STRING: type.setKind(OrcProto.Type.Kind.STRING); break; + case VARCHAR: + // The varchar length needs to be written to file and should be available + // from the object inspector + VarcharTypeParams varcharParams = (VarcharTypeParams) + ParameterizedPrimitiveTypeUtils.getTypeParamsFromPrimitiveObjectInspector( + (PrimitiveObjectInspector) treeWriter.inspector); + if (varcharParams == null) { + throw new IllegalArgumentException("No varchar length specified in ORC type"); + } + type.setKind(Type.Kind.VARCHAR); + type.setMaximumLength(varcharParams.getLength()); + break; case BINARY: type.setKind(OrcProto.Type.Kind.BINARY); break; diff --git ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto index d5bea25..edbf822 100644 --- ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto +++ ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto @@ -103,10 +103,12 @@ message Type { UNION = 13; DECIMAL = 14; DATE = 15; + VARCHAR = 16; } required Kind kind = 1; repeated uint32 subtypes = 2 [packed=true]; repeated string fieldNames = 3; + optional uint32 maximumLength = 4;
} message StripeInformation { diff --git ql/src/test/queries/clientpositive/varchar_serde.q ql/src/test/queries/clientpositive/varchar_serde.q new file mode 100644 index 0000000..7351b68 --- /dev/null +++ ql/src/test/queries/clientpositive/varchar_serde.q @@ -0,0 +1,102 @@ +drop table if exists varchar_serde_regex; +drop table if exists varchar_serde_lb; +drop table if exists varchar_serde_ls; +drop table if exists varchar_serde_c; +drop table if exists varchar_serde_lbc; +drop table if exists varchar_serde_orc; + +-- +-- RegexSerDe +-- +create table varchar_serde_regex ( + key varchar(10), + value varchar(20) +) +row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' +with serdeproperties ( + "input.regex" = "([^]*)([^]*)" +) +stored as textfile; + +load data local inpath '../data/files/srcbucket0.txt' overwrite into table varchar_serde_regex; + +select * from varchar_serde_regex limit 5; +select value, count(*) from varchar_serde_regex group by value limit 5; + +-- +-- LazyBinary +-- +create table varchar_serde_lb ( + key varchar(10), + value varchar(20) +); +alter table varchar_serde_lb set serde 'org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe'; + +insert overwrite table varchar_serde_lb + select key, value from varchar_serde_regex; +select * from varchar_serde_lb limit 5; +select value, count(*) from varchar_serde_lb group by value limit 5; + +-- +-- LazySimple +-- +create table varchar_serde_ls ( + key varchar(10), + value varchar(20) +); +alter table varchar_serde_ls set serde 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'; + +insert overwrite table varchar_serde_ls + select key, value from varchar_serde_lb; +select * from varchar_serde_ls limit 5; +select value, count(*) from varchar_serde_ls group by value limit 5; + +-- +-- Columnar +-- +create table varchar_serde_c ( + key varchar(10), + value varchar(20) +) stored as rcfile; +alter table varchar_serde_c set serde 'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe'; + +insert 
overwrite table varchar_serde_c + select key, value from varchar_serde_ls; +select * from varchar_serde_c limit 5; +select value, count(*) from varchar_serde_c group by value limit 5; + +-- +-- LazyBinaryColumnar +-- +create table varchar_serde_lbc ( + key varchar(10), + value varchar(20) +) stored as rcfile; +alter table varchar_serde_lbc set serde 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'; + +insert overwrite table varchar_serde_lbc + select key, value from varchar_serde_c; +select * from varchar_serde_lbc limit 5; +select value, count(*) from varchar_serde_lbc group by value limit 5; + +-- +-- ORC +-- +create table varchar_serde_orc ( + key varchar(10), + value varchar(20) +) stored as orc; +alter table varchar_serde_orc set serde 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'; + + +insert overwrite table varchar_serde_orc + select key, value from varchar_serde_lbc; +select * from varchar_serde_orc limit 5; +select value, count(*) from varchar_serde_orc group by value limit 5; + +drop table if exists varchar_serde_regex; +drop table if exists varchar_serde_lb; +drop table if exists varchar_serde_ls; +drop table if exists varchar_serde_c; +drop table if exists varchar_serde_lbc; +drop table if exists varchar_serde_orc; diff --git ql/src/test/results/clientpositive/varchar_serde.q.out ql/src/test/results/clientpositive/varchar_serde.q.out new file mode 100644 index 0000000..8ae974e --- /dev/null +++ ql/src/test/results/clientpositive/varchar_serde.q.out @@ -0,0 +1,626 @@ +PREHOOK: query: drop table if exists varchar_serde_regex +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists varchar_serde_regex +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table if exists varchar_serde_lb +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists varchar_serde_lb +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table if exists varchar_serde_ls +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists varchar_serde_ls +POSTHOOK: 
type: DROPTABLE +PREHOOK: query: drop table if exists varchar_serde_c +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists varchar_serde_c +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table if exists varchar_serde_lbc +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists varchar_serde_lbc +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table if exists varchar_serde_orc +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists varchar_serde_orc +POSTHOOK: type: DROPTABLE +PREHOOK: query: -- +-- RegexSerDe +-- +create table varchar_serde_regex ( + key varchar(10), + value varchar(20) +) +row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' +with serdeproperties ( + "input.regex" = "([^]*)([^]*)" +) +stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- +-- RegexSerDe +-- +create table varchar_serde_regex ( + key varchar(10), + value varchar(20) +) +row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' +with serdeproperties ( + "input.regex" = "([^]*)([^]*)" +) +stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@varchar_serde_regex +PREHOOK: query: load data local inpath '../data/files/srcbucket0.txt' overwrite into table varchar_serde_regex +PREHOOK: type: LOAD +PREHOOK: Output: default@varchar_serde_regex +POSTHOOK: query: load data local inpath '../data/files/srcbucket0.txt' overwrite into table varchar_serde_regex +POSTHOOK: type: LOAD +POSTHOOK: Output: default@varchar_serde_regex +PREHOOK: query: select * from varchar_serde_regex limit 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_regex +#### A masked pattern was here #### +POSTHOOK: query: select * from varchar_serde_regex limit 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_regex +#### A masked pattern was here #### +474 val_475 +62 val_63 +468 val_469 +272 val_273 +448 val_449 +PREHOOK: query: select value, count(*) from varchar_serde_regex group by value limit 5 +PREHOOK: type: QUERY 
+PREHOOK: Input: default@varchar_serde_regex +#### A masked pattern was here #### +POSTHOOK: query: select value, count(*) from varchar_serde_regex group by value limit 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_regex +#### A masked pattern was here #### +val_0 3 +val_1 2 +val_10 1 +val_100 2 +val_101 2 +PREHOOK: query: -- +-- LazyBinary +-- +create table varchar_serde_lb ( + key varchar(10), + value varchar(20) +) +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- +-- LazyBinary +-- +create table varchar_serde_lb ( + key varchar(10), + value varchar(20) +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@varchar_serde_lb +PREHOOK: query: alter table varchar_serde_lb set serde 'org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe' +PREHOOK: type: ALTERTABLE_SERIALIZER +PREHOOK: Input: default@varchar_serde_lb +PREHOOK: Output: default@varchar_serde_lb +POSTHOOK: query: alter table varchar_serde_lb set serde 'org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe' +POSTHOOK: type: ALTERTABLE_SERIALIZER +POSTHOOK: Input: default@varchar_serde_lb +POSTHOOK: Output: default@varchar_serde_lb +PREHOOK: query: insert overwrite table varchar_serde_lb + select key, value from varchar_serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_regex +PREHOOK: Output: default@varchar_serde_lb +POSTHOOK: query: insert overwrite table varchar_serde_lb + select key, value from varchar_serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_regex +POSTHOOK: Output: default@varchar_serde_lb +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: select * from varchar_serde_lb limit 5 +PREHOOK: type: QUERY +PREHOOK: Input: 
default@varchar_serde_lb +#### A masked pattern was here #### +POSTHOOK: query: select * from varchar_serde_lb limit 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_lb +#### A masked pattern was here #### +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +474 val_475 +62 val_63 +468 val_469 +272 val_273 +448 val_449 +PREHOOK: query: select value, count(*) from varchar_serde_lb group by value limit 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_lb +#### A masked pattern was here #### +POSTHOOK: query: select value, count(*) from varchar_serde_lb group by value limit 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_lb +#### A masked pattern was here #### +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +val_0 3 +val_1 2 +val_10 1 +val_100 2 +val_101 2 +PREHOOK: query: -- +-- LazySimple +-- +create table varchar_serde_ls ( + key varchar(10), + value varchar(20) +) +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- +-- LazySimple +-- +create table varchar_serde_ls ( + key varchar(10), + value varchar(20) +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@varchar_serde_ls +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from 
deserializer), ] +PREHOOK: query: alter table varchar_serde_ls set serde 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +PREHOOK: type: ALTERTABLE_SERIALIZER +PREHOOK: Input: default@varchar_serde_ls +PREHOOK: Output: default@varchar_serde_ls +POSTHOOK: query: alter table varchar_serde_ls set serde 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +POSTHOOK: type: ALTERTABLE_SERIALIZER +POSTHOOK: Input: default@varchar_serde_ls +POSTHOOK: Output: default@varchar_serde_ls +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: insert overwrite table varchar_serde_ls + select key, value from varchar_serde_lb +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_lb +PREHOOK: Output: default@varchar_serde_ls +POSTHOOK: query: insert overwrite table varchar_serde_ls + select key, value from varchar_serde_lb +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_lb +POSTHOOK: Output: default@varchar_serde_ls +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: select * from varchar_serde_ls limit 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_ls +#### A masked 
pattern was here #### +POSTHOOK: query: select * from varchar_serde_ls limit 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_ls +#### A masked pattern was here #### +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +474 val_475 +62 val_63 +468 val_469 +272 val_273 +448 val_449 +PREHOOK: query: select value, count(*) from varchar_serde_ls group by value limit 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_ls +#### A masked pattern was here #### +POSTHOOK: query: select value, count(*) from varchar_serde_ls group by value limit 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_ls +#### A masked pattern was here #### +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +val_0 3 +val_1 2 +val_10 1 +val_100 2 +val_101 2 +PREHOOK: query: -- +-- Columnar +-- +create table 
varchar_serde_c ( + key varchar(10), + value varchar(20) +) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- +-- Columnar +-- +create table varchar_serde_c ( + key varchar(10), + value varchar(20) +) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@varchar_serde_c +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: alter table varchar_serde_c set serde 'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe' +PREHOOK: type: ALTERTABLE_SERIALIZER +PREHOOK: Input: default@varchar_serde_c +PREHOOK: Output: default@varchar_serde_c +POSTHOOK: query: alter table varchar_serde_c set serde 'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe' +POSTHOOK: type: ALTERTABLE_SERIALIZER +POSTHOOK: Input: default@varchar_serde_c +POSTHOOK: Output: default@varchar_serde_c +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, 
type:varchar(20), comment:from deserializer), ] +PREHOOK: query: insert overwrite table varchar_serde_c + select key, value from varchar_serde_ls +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_ls +PREHOOK: Output: default@varchar_serde_c +POSTHOOK: query: insert overwrite table varchar_serde_c + select key, value from varchar_serde_ls +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_ls +POSTHOOK: Output: default@varchar_serde_c +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: select * from varchar_serde_c limit 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_c +#### A masked pattern was here #### +POSTHOOK: query: select * from varchar_serde_c limit 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_c +#### A masked pattern was here #### +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), 
comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +474 val_475 +62 val_63 +468 val_469 +272 val_273 +448 val_449 +PREHOOK: query: select value, count(*) from varchar_serde_c group by value limit 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_c +#### A masked pattern was here #### +POSTHOOK: query: select value, count(*) from varchar_serde_c group by value limit 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_c +#### A masked pattern was here #### +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, 
type:varchar(20), comment:from deserializer), ] +val_0 3 +val_1 2 +val_10 1 +val_100 2 +val_101 2 +PREHOOK: query: -- +-- LazyBinaryColumnar +-- +create table varchar_serde_lbc ( + key varchar(10), + value varchar(20) +) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- +-- LazyBinaryColumnar +-- +create table varchar_serde_lbc ( + key varchar(10), + value varchar(20) +) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@varchar_serde_lbc +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: alter table varchar_serde_lbc set serde 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe' +PREHOOK: type: ALTERTABLE_SERIALIZER +PREHOOK: Input: default@varchar_serde_lbc +PREHOOK: Output: default@varchar_serde_lbc +POSTHOOK: query: alter table varchar_serde_lbc set serde 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe' +POSTHOOK: type: ALTERTABLE_SERIALIZER +POSTHOOK: Input: default@varchar_serde_lbc +POSTHOOK: Output: default@varchar_serde_lbc +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE 
[(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: insert overwrite table varchar_serde_lbc + select key, value from varchar_serde_c +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_c +PREHOOK: Output: default@varchar_serde_lbc +POSTHOOK: query: insert overwrite table varchar_serde_lbc + select key, value from varchar_serde_c +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_c +POSTHOOK: Output: default@varchar_serde_lbc +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE 
[(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: select * from varchar_serde_lbc limit 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_lbc +#### A masked pattern was here #### +POSTHOOK: query: select * from varchar_serde_lbc limit 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_lbc +#### A masked pattern was here #### +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] 
+POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +474 val_475 +62 val_63 +468 val_469 +272 val_273 +448 val_449 +PREHOOK: query: select value, count(*) from varchar_serde_lbc group by value limit 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_lbc +#### A masked pattern was here #### +POSTHOOK: query: select value, count(*) from varchar_serde_lbc group by value limit 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_lbc +#### A masked pattern was here #### +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +val_0 3 +val_1 2 +val_10 1 +val_100 2 +val_101 2 +PREHOOK: query: -- +-- ORC +-- +create table varchar_serde_orc ( + key varchar(10), + value 
varchar(20) +) stored as orc +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- +-- ORC +-- +create table varchar_serde_orc ( + key varchar(10), + value varchar(20) +) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@varchar_serde_orc +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: alter table varchar_serde_orc set serde 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +PREHOOK: type: ALTERTABLE_SERIALIZER +PREHOOK: Input: default@varchar_serde_orc +PREHOOK: Output: default@varchar_serde_orc +POSTHOOK: query: alter table varchar_serde_orc set serde 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +POSTHOOK: type: ALTERTABLE_SERIALIZER +POSTHOOK: Input: default@varchar_serde_orc +POSTHOOK: Output: default@varchar_serde_orc +POSTHOOK: Lineage: 
varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: insert overwrite table varchar_serde_orc + select key, value from varchar_serde_lbc +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_lbc +PREHOOK: Output: default@varchar_serde_orc +POSTHOOK: query: insert overwrite table varchar_serde_orc + select key, value from varchar_serde_lbc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_lbc +POSTHOOK: Output: default@varchar_serde_orc +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: 
varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.key SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.value SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: select * from varchar_serde_orc limit 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_orc +#### A masked pattern was here #### +POSTHOOK: query: select * from varchar_serde_orc limit 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_orc +#### A masked pattern was here #### +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, 
type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.key SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.value SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +474 val_475 +62 val_63 +468 val_469 +272 val_273 +448 val_449 +PREHOOK: query: select value, count(*) from varchar_serde_orc group by value limit 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_serde_orc +#### A masked pattern was here #### +POSTHOOK: query: select value, count(*) from varchar_serde_orc group by value limit 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_serde_orc +#### A masked pattern was here #### +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE 
[(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.key SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.value SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +val_0 3 +val_1 2 +val_10 1 +val_100 2 +val_101 2 +PREHOOK: query: drop table if exists varchar_serde_regex +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@varchar_serde_regex +PREHOOK: Output: default@varchar_serde_regex +POSTHOOK: query: drop table if exists varchar_serde_regex +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@varchar_serde_regex +POSTHOOK: Output: default@varchar_serde_regex +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE 
[(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.key SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.value SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: drop table if exists varchar_serde_lb +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@varchar_serde_lb +PREHOOK: Output: default@varchar_serde_lb +POSTHOOK: query: drop table if exists varchar_serde_lb +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@varchar_serde_lb +POSTHOOK: Output: default@varchar_serde_lb +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), 
comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.key SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.value SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: drop table if exists varchar_serde_ls +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@varchar_serde_ls +PREHOOK: Output: default@varchar_serde_ls +POSTHOOK: query: drop table if exists varchar_serde_ls +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@varchar_serde_ls +POSTHOOK: Output: default@varchar_serde_ls +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE 
[(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.key SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.value SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: drop table if exists varchar_serde_c +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@varchar_serde_c +PREHOOK: Output: default@varchar_serde_c +POSTHOOK: query: drop table if exists varchar_serde_c +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@varchar_serde_c +POSTHOOK: Output: default@varchar_serde_c +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), 
comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.key SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.value SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: drop table if exists varchar_serde_lbc +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@varchar_serde_lbc +PREHOOK: Output: default@varchar_serde_lbc +POSTHOOK: query: drop table if exists varchar_serde_lbc +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@varchar_serde_lbc +POSTHOOK: Output: default@varchar_serde_lbc +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE 
[(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.key SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.value SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +PREHOOK: query: drop table if exists varchar_serde_orc +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@varchar_serde_orc +PREHOOK: Output: default@varchar_serde_orc +POSTHOOK: query: drop table if exists varchar_serde_orc +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@varchar_serde_orc +POSTHOOK: Output: default@varchar_serde_orc +POSTHOOK: Lineage: varchar_serde_c.key SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_c.value SIMPLE [(varchar_serde_ls)varchar_serde_ls.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.key SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lb.value SIMPLE [(varchar_serde_regex)varchar_serde_regex.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.key SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:key, type:varchar(10), 
comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_lbc.value SIMPLE [(varchar_serde_c)varchar_serde_c.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.key SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_ls.value SIMPLE [(varchar_serde_lb)varchar_serde_lb.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.key SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:key, type:varchar(10), comment:from deserializer), ] +POSTHOOK: Lineage: varchar_serde_orc.value SIMPLE [(varchar_serde_lbc)varchar_serde_lbc.FieldSchema(name:value, type:varchar(20), comment:from deserializer), ] diff --git serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java index 5d0eb0c..5de5bd5 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java +++ serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java @@ -30,13 +30,19 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.typeinfo.ParameterizedPrimitiveTypeUtils; +import 
org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeParams; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; @@ -119,7 +125,8 @@ public void initialize(Configuration conf, Properties tbl) */ List columnOIs = new ArrayList(columnNames.size()); for (int c = 0; c < numColumns; c++) { - String typeName = columnTypes.get(c).getTypeName(); + TypeInfo typeInfo = columnTypes.get(c); + String typeName = typeInfo.getTypeName(); if (typeName.equals(serdeConstants.STRING_TYPE_NAME)) { columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); } else if (typeName.equals(serdeConstants.TINYINT_TYPE_NAME)) { @@ -142,6 +149,13 @@ public void initialize(Configuration conf, Properties tbl) columnOIs.add(PrimitiveObjectInspectorFactory.javaDateObjectInspector); } else if (typeName.equals(serdeConstants.DECIMAL_TYPE_NAME)) { columnOIs.add(PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector); + } else if (typeInfo instanceof PrimitiveTypeInfo + && + ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory() == PrimitiveCategory.VARCHAR) { + VarcharTypeParams varcharParams = (VarcharTypeParams) + ParameterizedPrimitiveTypeUtils.getTypeParamsFromTypeInfo(typeInfo); + columnOIs.add(PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector( + (PrimitiveTypeInfo) typeInfo)); } else { throw new SerDeException(getClass().getName() + " doesn't allow column [" + c + "] named " @@ -202,7 +216,8 @@ public Object deserialize(Writable blob) throws SerDeException { for (int c = 0; c < numColumns; c++) { try { String t = m.group(c+1); - String typeName = columnTypes.get(c).getTypeName(); + TypeInfo typeInfo = columnTypes.get(c); + String typeName = typeInfo.getTypeName(); // Convert the column to the correct type when needed and set in row obj if 
(typeName.equals(serdeConstants.STRING_TYPE_NAME)) { @@ -247,6 +262,13 @@ public Object deserialize(Writable blob) throws SerDeException { HiveDecimal bd; bd = new HiveDecimal(t); row.set(c, bd); + } else if (typeInfo instanceof PrimitiveTypeInfo + && + ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory() == PrimitiveCategory.VARCHAR) { + VarcharTypeParams varcharParams = (VarcharTypeParams) + ParameterizedPrimitiveTypeUtils.getTypeParamsFromTypeInfo(typeInfo); + HiveVarchar hv = new HiveVarchar(t, varcharParams != null ? varcharParams.length : -1); + row.set(c, hv); } } catch (RuntimeException e) { partialMatchedRowsCount++; diff --git serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableSerDe.java index 1b4c509..a206023 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableSerDe.java +++ serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableSerDe.java @@ -703,12 +703,9 @@ static void serialize(OutputByteBuffer buffer, Object o, ObjectInspector oi, case VARCHAR: { HiveVarcharObjectInspector hcoi = (HiveVarcharObjectInspector)poi; HiveVarcharWritable hc = hcoi.getPrimitiveWritableObject(o); - try { - ByteBuffer bb = Text.encode(hc.getHiveVarchar().getValue()); - serializeBytes(buffer, bb.array(), bb.limit(), invert); - } catch (CharacterCodingException err) { - throw new SerDeException(err); - } + // use varchar's text field directly + Text t = hc.getTextValue(); + serializeBytes(buffer, t.getBytes(), t.getLength(), invert); return; } diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java index 05822cb..ec91717 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java @@ -229,8 +229,9 @@ public static void writePrimitiveUTF8(OutputStream 
out, Object o, case VARCHAR: { HiveVarcharWritable hc = ((HiveVarcharObjectInspector)oi).getPrimitiveWritableObject(o); - ByteBuffer b = Text.encode(hc.toString()); - writeEscaped(out, b.array(), 0, b.limit(), escaped, escapeChar, needsEscape); + Text t = hc.getTextValue(); + writeEscaped(out, t.getBytes(), 0, t.getLength(), escaped, escapeChar, + needsEscape); break; } case BINARY: { diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinarySerDe.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinarySerDe.java index 77a1951..ab4eb56 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinarySerDe.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinarySerDe.java @@ -270,6 +270,17 @@ private static boolean serializeStruct(Output byteStream, Object obj, return warnedOnceNullMapKey; } + private static void serializeText(Output byteStream, Text t, boolean skipLengthPrefix) { + /* write the byte length of the string as a vint, unless skipLengthPrefix is set */ + int length = t.getLength(); + if (!skipLengthPrefix) { + LazyBinaryUtils.writeVInt(byteStream, length); + } + /* write the string itself: the first `length` bytes returned by getBytes() */ + byte[] data = t.getBytes(); + byteStream.write(data, 0, length); + } + /** * A recursive function that serialize an object to a byte buffer based on its * object inspector. 
@@ -358,32 +369,13 @@ public static boolean serialize(Output byteStream, Object obj, case STRING: { StringObjectInspector soi = (StringObjectInspector) poi; Text t = soi.getPrimitiveWritableObject(obj); - /* write byte size of the string which is a vint */ - int length = t.getLength(); - if (!skipLengthPrefix) { - LazyBinaryUtils.writeVInt(byteStream, length); - } - /* write string itself */ - byte[] data = t.getBytes(); - byteStream.write(data, 0, length); + serializeText(byteStream, t, skipLengthPrefix); return warnedOnceNullMapKey; } case VARCHAR: { HiveVarcharObjectInspector hcoi = (HiveVarcharObjectInspector) poi; - String value = - hcoi.getPrimitiveWritableObject(obj).getHiveVarchar().getValue(); - int length = value.length(); - // Write byte size - if (!skipLengthPrefix) { - LazyBinaryUtils.writeVInt(byteStream, length); - } - // Write string value - try { - ByteBuffer bb = Text.encode(value); - byteStream.write(bb.array(), 0, bb.limit()); - } catch (CharacterCodingException err) { - throw new SerDeException(err); - } + Text t = hcoi.getPrimitiveWritableObject(obj).getTextValue(); + serializeText(byteStream, t, skipLengthPrefix); return warnedOnceNullMapKey; } case BINARY: {