From 5973e78c65ebafcb7aa854a174aafe95ad92044c Mon Sep 17 00:00:00 2001 From: "suma.shivaprasad" Date: Fri, 5 Sep 2014 14:18:44 +0530 Subject: [PATCH] Fixed HIVE-2390 - Add uniontype support for LazyBinarySerde Fixed order of query output for matching --- .../test/queries/clientpositive/input_lazyserde.q | 4 + .../results/clientpositive/input_lazyserde.q.out | 533 ++++++++++++++++++++ .../hive/serde2/lazybinary/LazyBinaryFactory.java | 3 + .../hive/serde2/lazybinary/LazyBinarySerDe.java | 32 +- .../hive/serde2/lazybinary/LazyBinaryUnion.java | 196 +++++++ .../hive/serde2/lazybinary/LazyBinaryUtils.java | 16 + .../LazyBinaryObjectInspectorFactory.java | 17 + .../LazyBinaryUnionObjectInspector.java | 63 +++ 8 files changed, 855 insertions(+), 9 deletions(-) create mode 100644 serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinaryUnion.java create mode 100644 serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/objectinspector/LazyBinaryUnionObjectInspector.java diff --git ql/src/test/queries/clientpositive/input_lazyserde.q ql/src/test/queries/clientpositive/input_lazyserde.q index 53ae6d8..69c0d04 100644 --- ql/src/test/queries/clientpositive/input_lazyserde.q +++ ql/src/test/queries/clientpositive/input_lazyserde.q @@ -30,3 +30,7 @@ CREATE TABLE dest1(a map<string,string>) ROW FORMAT DELIMITED FIELDS TERMINATED INSERT OVERWRITE TABLE dest1 SELECT src_thrift.mstringstring FROM src_thrift DISTRIBUTE BY 1; SELECT * from dest1; +CREATE TABLE destBin(a UNIONTYPE<int, double, array<string>, struct<col1:int,col2:string>>) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe' STORED AS SEQUENCEFILE; +INSERT OVERWRITE TABLE destBin SELECT create_union( CASE WHEN key < 100 THEN 0 WHEN key < 200 THEN 1 WHEN key < 300 THEN 2 WHEN key < 400 THEN 3 ELSE 0 END, key, 2.0, array("one","two"), struct(5,"five")) FROM srcbucket2; +SELECT * from destBin ORDER BY a; +DROP TABLE destBin; diff --git ql/src/test/results/clientpositive/input_lazyserde.q.out ql/src/test/results/clientpositive/input_lazyserde.q.out index 
4710789..5f99696 100644 --- ql/src/test/results/clientpositive/input_lazyserde.q.out +++ ql/src/test/results/clientpositive/input_lazyserde.q.out @@ -219,3 +219,536 @@ NULL {"key_7":"value_7"} {"key_8":"value_8"} {"key_9":"value_9"} +PREHOOK: query: CREATE TABLE destBin(a UNIONTYPE<int, double, array<string>, struct<col1:int,col2:string>>) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe' STORED AS SEQUENCEFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@destBin +POSTHOOK: query: CREATE TABLE destBin(a UNIONTYPE<int, double, array<string>, struct<col1:int,col2:string>>) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe' STORED AS SEQUENCEFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@destBin +PREHOOK: query: INSERT OVERWRITE TABLE destBin SELECT create_union( CASE WHEN key < 100 THEN 0 WHEN key < 200 THEN 1 WHEN key < 300 THEN 2 WHEN key < 400 THEN 3 ELSE 0 END, key, 2.0, array("one","two"), struct(5,"five")) FROM srcbucket2 +PREHOOK: type: QUERY +PREHOOK: Input: default@srcbucket2 +PREHOOK: Output: default@destbin +POSTHOOK: query: INSERT OVERWRITE TABLE destBin SELECT create_union( CASE WHEN key < 100 THEN 0 WHEN key < 200 THEN 1 WHEN key < 300 THEN 2 WHEN key < 400 THEN 3 ELSE 0 END, key, 2.0, array("one","two"), struct(5,"five")) FROM srcbucket2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcbucket2 +POSTHOOK: Output: default@destbin +POSTHOOK: Lineage: destbin.a EXPRESSION [(srcbucket2)srcbucket2.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: SELECT * from destBin ORDER BY a +PREHOOK: type: QUERY +PREHOOK: Input: default@destbin +#### A masked pattern was here #### +POSTHOOK: query: SELECT * from destBin ORDER BY a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@destbin +#### A masked pattern was here #### +{0:0} +{0:0} +{0:0} +{0:10} +{0:11} +{0:12} +{0:12} +{0:15} +{0:15} +{0:17} +{0:18} +{0:18} +{0:19} +{0:20} +{0:24} +{0:24} +{0:26} +{0:26} +{0:27} +{0:28} +{0:2} +{0:30} +{0:33} 
+{0:34} +{0:35} +{0:35} +{0:35} +{0:37} +{0:37} +{0:400} +{0:401} +{0:401} +{0:401} +{0:401} +{0:401} +{0:402} +{0:403} +{0:403} +{0:403} +{0:404} +{0:404} +{0:406} +{0:406} +{0:406} +{0:406} +{0:407} +{0:409} +{0:409} +{0:409} +{0:411} +{0:413} +{0:413} +{0:414} +{0:414} +{0:417} +{0:417} +{0:417} +{0:418} +{0:419} +{0:41} +{0:421} +{0:424} +{0:424} +{0:427} +{0:429} +{0:429} +{0:42} +{0:42} +{0:430} +{0:430} +{0:430} +{0:431} +{0:431} +{0:431} +{0:432} +{0:435} +{0:436} +{0:437} +{0:438} +{0:438} +{0:438} +{0:439} +{0:439} +{0:43} +{0:443} +{0:444} +{0:446} +{0:448} +{0:449} +{0:44} +{0:452} +{0:453} +{0:454} +{0:454} +{0:454} +{0:455} +{0:457} +{0:458} +{0:458} +{0:459} +{0:459} +{0:460} +{0:462} +{0:462} +{0:463} +{0:463} +{0:466} +{0:466} +{0:466} +{0:467} +{0:468} +{0:468} +{0:468} +{0:468} +{0:469} +{0:469} +{0:469} +{0:469} +{0:469} +{0:470} +{0:472} +{0:475} +{0:477} +{0:478} +{0:478} +{0:479} +{0:47} +{0:480} +{0:480} +{0:480} +{0:481} +{0:482} +{0:483} +{0:484} +{0:485} +{0:487} +{0:489} +{0:489} +{0:489} +{0:489} +{0:490} +{0:491} +{0:492} +{0:492} +{0:493} +{0:494} +{0:495} +{0:496} +{0:497} +{0:498} +{0:498} +{0:498} +{0:4} +{0:51} +{0:51} +{0:53} +{0:54} +{0:57} +{0:58} +{0:58} +{0:5} +{0:5} +{0:5} +{0:64} +{0:65} +{0:66} +{0:67} +{0:67} +{0:69} +{0:70} +{0:70} +{0:70} +{0:72} +{0:72} +{0:74} +{0:76} +{0:76} +{0:77} +{0:78} +{0:80} +{0:82} +{0:83} +{0:83} +{0:84} +{0:84} +{0:85} +{0:86} +{0:87} +{0:8} +{0:90} +{0:90} +{0:90} +{0:92} +{0:95} +{0:95} +{0:96} +{0:97} +{0:97} +{0:98} +{0:98} +{0:9} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} 
+{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{1:2.0} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} 
+{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{2:["one","two"]} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} 
+{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +{3:{"col1":5,"col2":"five"}} +PREHOOK: query: DROP TABLE destBin +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@destbin +PREHOOK: Output: default@destbin +POSTHOOK: query: DROP TABLE destBin +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@destbin +POSTHOOK: Output: default@destbin diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinaryFactory.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinaryFactory.java index cae4faa..8bd5838 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinaryFactory.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinaryFactory.java @@ -23,6 +23,7 @@ import org.apache.hadoop.hive.serde2.lazybinary.objectinspector.LazyBinaryListObjectInspector; import 
org.apache.hadoop.hive.serde2.lazybinary.objectinspector.LazyBinaryMapObjectInspector; import org.apache.hadoop.hive.serde2.lazybinary.objectinspector.LazyBinaryStructObjectInspector; +import org.apache.hadoop.hive.serde2.lazybinary.objectinspector.LazyBinaryUnionObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; @@ -106,6 +107,8 @@ public static LazyBinaryObject createLazyBinaryObject(ObjectInspector oi) { return new LazyBinaryArray((LazyBinaryListObjectInspector) oi); case STRUCT: return new LazyBinaryStruct((LazyBinaryStructObjectInspector) oi); + case UNION: + return new LazyBinaryUnion((LazyBinaryUnionObjectInspector) oi); } throw new RuntimeException("Hive LazyBinarySerDe Internal error."); diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinarySerDe.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinarySerDe.java index 1c8f795..6cf9be6 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinarySerDe.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinarySerDe.java @@ -43,8 +43,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; @@ -281,6 +281,13 @@ private static 
void serializeStruct(RandomAccessOutput byteStream, Object[] fiel } } + private static void serializeUnion(RandomAccessOutput byteStream, Object obj, + UnionObjectInspector uoi, BooleanRef warnedOnceNullMapKey) throws SerDeException { + byte tag = uoi.getTag(obj); + byteStream.write(tag); + serialize(byteStream, uoi.getField(obj), uoi.getObjectInspectors().get(tag), false, warnedOnceNullMapKey); + } + private static void serializeText( RandomAccessOutput byteStream, Text t, boolean skipLengthPrefix) { /* write byte size of the string which is a vint */ @@ -544,24 +551,31 @@ public static void serialize(RandomAccessOutput byteStream, Object obj, } return; } - case STRUCT: { + case STRUCT: + case UNION:{ int byteSizeStart = 0; - int structStart = 0; + int typeStart = 0; if (!skipLengthPrefix) { // 1/ reserve spaces for the byte size of the struct // which is a integer and takes four bytes byteSizeStart = byteStream.getLength(); byteStream.reserve(4); - structStart = byteStream.getLength(); + typeStart = byteStream.getLength(); + } + + if (ObjectInspector.Category.STRUCT.equals(objInspector.getCategory()) ) { + // 2/ serialize the struct + serializeStruct(byteStream, obj, (StructObjectInspector) objInspector, warnedOnceNullMapKey); + } else { + // 2/ serialize the union + serializeUnion(byteStream, obj, (UnionObjectInspector) objInspector, warnedOnceNullMapKey); } - // 2/ serialize the struct - serializeStruct(byteStream, obj, (StructObjectInspector) objInspector, warnedOnceNullMapKey); if (!skipLengthPrefix) { // 3/ update the byte size of the struct - int structEnd = byteStream.getLength(); - int structSize = structEnd - structStart; - writeSizeAtOffset(byteStream, byteSizeStart, structSize); + int typeEnd = byteStream.getLength(); + int typeSize = typeEnd - typeStart; + writeSizeAtOffset(byteStream, byteSizeStart, typeSize); } return; } diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinaryUnion.java 
serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinaryUnion.java new file mode 100644 index 0000000..8b17b25 --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinaryUnion.java @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.hadoop.hive.serde2.lazybinary; + + import java.util.ArrayList; + import java.util.Arrays; + import java.util.List; + + import org.apache.commons.logging.Log; + import org.apache.commons.logging.LogFactory; + import org.apache.hadoop.hive.serde2.SerDeStatsStruct; + import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef; + import org.apache.hadoop.hive.serde2.lazybinary.objectinspector.LazyBinaryUnionObjectInspector; + import org.apache.hadoop.hive.serde2.objectinspector.*; + +/** + * LazyBinaryUnion is serialized as follows: start TAG FIELD end bytes[] -> + * |-----|---------|--- ... ---|-----|---------| + * + * Section TAG is one byte, corresponding to tag of set union field + * FIELD is a LazyBinaryObject corresponding to set union field value. 
+ * + */ + public class LazyBinaryUnion extends + LazyBinaryNonPrimitive<LazyBinaryUnionObjectInspector> implements SerDeStatsStruct { + + private static final Log LOG = LogFactory.getLog(LazyBinaryUnion.class.getName()); + + /** + * Whether the data is already parsed or not. + */ + boolean parsed; + + /** + * Size of serialized data + */ + long serializedSize; + + /** + * The field of the union which contains the value. + */ + LazyBinaryObject field; + + boolean fieldInited; + + /** + * The start position and length of the set union field. Only valid when the data + * is parsed. + */ + int fieldStart; + int fieldLength; + + byte tag; + + final LazyBinaryUtils.VInt vInt = new LazyBinaryUtils.VInt(); + + /** + * Construct a LazyBinaryUnion object with an ObjectInspector. + */ + protected LazyBinaryUnion(LazyBinaryUnionObjectInspector oi) { + super(oi); + } + + @Override + public void init(ByteArrayRef bytes, int start, int length) { + super.init(bytes, start, length); + parsed = false; + serializedSize = length; + fieldInited = false; + field = null; + cachedObject = null; + } + + LazyBinaryUtils.RecordInfo recordInfo = new LazyBinaryUtils.RecordInfo(); + boolean missingFieldWarned = false; + boolean extraFieldWarned = false; + + /** + * Parse the byte[] and fill tag, field, fieldStart and + * fieldLength. 
+ */ + private void parse() { + LazyBinaryUnionObjectInspector uoi = (LazyBinaryUnionObjectInspector) oi; + + /** + * Please note that tag is followed by field + */ + int unionByteEnd = start + length; + byte[] byteArr = this.bytes.getData(); + + //Tag of union field is the first byte to be parsed + final int tagEnd = start + 1; + tag = byteArr[start]; + field = LazyBinaryFactory.createLazyBinaryObject(uoi.getObjectInspectors().get(tag)); + //Check the union field's length and offset + LazyBinaryUtils.checkObjectByteInfo(uoi.getObjectInspectors().get(tag), byteArr, tagEnd, recordInfo, vInt); + fieldStart = tagEnd + recordInfo.elementOffset; + // Size of the field value as reported by checkObjectByteInfo + fieldLength = recordInfo.elementSize; + + // Extra bytes at the end? + if (!extraFieldWarned && (fieldStart + fieldLength) < unionByteEnd) { + extraFieldWarned = true; + LOG.warn("Extra bytes detected at the end of the row! Ignoring similar " + "problems."); + } + + // Missing fields? + if (!missingFieldWarned && (fieldStart + fieldLength) > unionByteEnd) { + missingFieldWarned = true; + LOG.info("Missing bytes! Expected " + fieldLength + " bytes for the union field but " + "only got " + (unionByteEnd - fieldStart) + "! Ignoring similar problems."); + } + + parsed = true; + } + + /** + * Get the set field out of the union. + * + * If the field is a primitive field, return the actual object. Otherwise + * return the LazyObject. This is because PrimitiveObjectInspector does not + * have control over the object used by the user - the user simply directly + * use the Object instead of going through Object + * PrimitiveObjectInspector.get(Object). + * @return The field as a LazyObject + */ + public Object getField() { + if (!parsed) { + parse(); + } + if(cachedObject == null) { + return uncheckedGetField(); + } + return cachedObject; + } + + /** + * Get the set field out of the union without checking parsed. This is + * called by getField only. + * + * The field object is initialized lazily on first use and the resulting + * value is kept in cachedObject. 
+ * @return The value of the field + */ + private Object uncheckedGetField() { + // Initialize the field only once per row; init() resets fieldInited + // for every new row. + if (!fieldInited) { + fieldInited = true; + field.init(bytes, fieldStart, fieldLength); + } + cachedObject = field.getObject(); + return cachedObject; + } + + Object cachedObject; + + @Override + public Object getObject() { + return this; + } + + public long getRawDataSerializedSize() { + return serializedSize; + } + + /** + * Get the set field's tag + * + * + * @return The tag of the field set in the union + */ + public byte getTag() { + if (!parsed) { + parse(); + } + return tag; + } + } + diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinaryUtils.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinaryUtils.java index 155b267..11e8cf4 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinaryUtils.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinaryUtils.java @@ -37,6 +37,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.io.WritableUtils; @@ -226,6 +227,7 @@ public static void checkObjectByteInfo(ObjectInspector objectInspector, case LIST: case MAP: case STRUCT: + case UNION: recordInfo.elementOffset = 4; recordInfo.elementSize = LazyBinaryUtils.byteArrayToInt(bytes, offset); break; @@ -474,6 +476,20 @@ public static ObjectInspector getLazyBinaryObjectInspectorFromTypeInfo( fieldObjectInspectors); break; } + case UNION: { + UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo; + final List<TypeInfo> fieldTypeInfos = unionTypeInfo.getAllUnionObjectTypeInfos(); + List<ObjectInspector> fieldObjectInspectors = new ArrayList<ObjectInspector>( + fieldTypeInfos.size()); + for (int i = 
0; i < fieldTypeInfos.size(); i++) { + fieldObjectInspectors + .add(getLazyBinaryObjectInspectorFromTypeInfo(fieldTypeInfos + .get(i))); + } + result = LazyBinaryObjectInspectorFactory + .getLazyBinaryUnionObjectInspector(fieldObjectInspectors); + break; + } default: { result = null; } diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/objectinspector/LazyBinaryObjectInspectorFactory.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/objectinspector/LazyBinaryObjectInspectorFactory.java index b3ec24d..1b56c54 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/objectinspector/LazyBinaryObjectInspectorFactory.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/objectinspector/LazyBinaryObjectInspectorFactory.java @@ -40,6 +40,9 @@ static ConcurrentHashMap<ArrayList<Object>, LazyBinaryStructObjectInspector> cachedLazyBinaryStructObjectInspector = new ConcurrentHashMap<ArrayList<Object>, LazyBinaryStructObjectInspector>(); + static ConcurrentHashMap<ArrayList<Object>, LazyBinaryUnionObjectInspector> cachedLazyBinaryUnionObjectInspector = + new ConcurrentHashMap<ArrayList<Object>, LazyBinaryUnionObjectInspector>(); + public static LazyBinaryStructObjectInspector getLazyBinaryStructObjectInspector( List<String> structFieldNames, List<ObjectInspector> structFieldObjectInspectors) { @@ -66,6 +69,20 @@ public static LazyBinaryStructObjectInspector getLazyBinaryStructObjectInspector return result; } + public static LazyBinaryUnionObjectInspector getLazyBinaryUnionObjectInspector( + List<ObjectInspector> unionFieldObjectInspectors) { + ArrayList<Object> signature = new ArrayList<Object>(1); + signature.add(unionFieldObjectInspectors); + + LazyBinaryUnionObjectInspector result = cachedLazyBinaryUnionObjectInspector + .get(signature); + if (result == null) { + cachedLazyBinaryUnionObjectInspector.putIfAbsent(signature, new LazyBinaryUnionObjectInspector(unionFieldObjectInspectors)); + result = cachedLazyBinaryUnionObjectInspector.get(signature); + } + return result; + } + static ConcurrentHashMap<ArrayList<Object>, LazyBinaryListObjectInspector> cachedLazyBinaryListObjectInspector = new ConcurrentHashMap<ArrayList<Object>, 
LazyBinaryListObjectInspector>(); diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/objectinspector/LazyBinaryUnionObjectInspector.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/objectinspector/LazyBinaryUnionObjectInspector.java new file mode 100644 index 0000000..d43f41c --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/objectinspector/LazyBinaryUnionObjectInspector.java @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.lazybinary.objectinspector; + +import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUnion; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.UnionObject; + +import java.util.List; + +/** + * ObjectInspector for LazyBinaryUnion. 
+ * + * @see org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUnion + */ +public class LazyBinaryUnionObjectInspector extends + StandardUnionObjectInspector { + + protected LazyBinaryUnionObjectInspector() { + super(); + } + protected LazyBinaryUnionObjectInspector(List<ObjectInspector> unionFieldObjectInspectors) { + super(unionFieldObjectInspectors); + } + + /** + * Return the tag of the given LazyBinaryUnion, or -1 if the object is null. + */ + @Override public byte getTag(Object o) { + if (o == null) { + return -1; + } + LazyBinaryUnion lazyBinaryUnion = (LazyBinaryUnion) o; + return lazyBinaryUnion.getTag(); + } + + /** + * Return the set field of the given LazyBinaryUnion, or null if the object is null. + */ + @Override public Object getField(Object o) { + if (o == null) { + return null; + } + LazyBinaryUnion lazyBinaryUnion = (LazyBinaryUnion) o; + return lazyBinaryUnion.getField(); + } +} -- 1.7.7.5 (Apple Git-26)