diff --git druid-handler/src/java/org/apache/hadoop/hive/druid/serde/DruidSerDe.java druid-handler/src/java/org/apache/hadoop/hive/druid/serde/DruidSerDe.java
index ea84326..65259b0 100644
--- druid-handler/src/java/org/apache/hadoop/hive/druid/serde/DruidSerDe.java
+++ druid-handler/src/java/org/apache/hadoop/hive/druid/serde/DruidSerDe.java
@@ -29,6 +29,7 @@
 import org.apache.calcite.adapter.druid.DruidTable;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.conf.Constants;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.druid.DruidStorageHandlerUtils;
@@ -38,6 +39,7 @@
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.apache.hadoop.hive.serde2.SerDeSpec;
 import org.apache.hadoop.hive.serde2.SerDeStats;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
 import org.apache.hadoop.hive.serde2.io.TimestampWritable;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
@@ -55,8 +57,12 @@
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.io.ByteWritable;
+import org.apache.hadoop.io.DoubleWritable;
 import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.ShortWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.StringUtils;
@@ -460,12 +466,27 @@ public Object deserialize(Writable writable) throws SerDeException {
         case TIMESTAMP:
           output.add(new TimestampWritable(new Timestamp((Long) value)));
           break;
+        case BYTE:
+          output.add(new ByteWritable(((Number) value).byteValue()));
+          break;
+        case SHORT:
+          output.add(new ShortWritable(((Number) value).shortValue()));
+          break;
+        case INT:
+          output.add(new IntWritable(((Number) value).intValue()));
+          break;
         case LONG:
           output.add(new LongWritable(((Number) value).longValue()));
           break;
         case FLOAT:
           output.add(new FloatWritable(((Number) value).floatValue()));
           break;
+        case DOUBLE:
+          output.add(new DoubleWritable(((Number) value).doubleValue()));
+          break;
+        case DECIMAL:
+          output.add(new HiveDecimalWritable(HiveDecimal.create(((Number) value).doubleValue())));
+          break;
         case STRING:
           output.add(new Text(value.toString()));
           break;
diff --git ql/src/test/queries/clientpositive/druid_basic2.q ql/src/test/queries/clientpositive/druid_basic2.q
index fe24410..d70f3c3 100644
--- ql/src/test/queries/clientpositive/druid_basic2.q
+++ ql/src/test/queries/clientpositive/druid_basic2.q
@@ -50,3 +50,19 @@ FROM
   FROM druid_table_1) b
   ON a.language = b.language
 );
+
+EXPLAIN EXTENDED
+SELECT robot, floor_day(`__time`), max(added) as m, sum(delta) as s
+FROM druid_table_1
+GROUP BY robot, language, floor_day(`__time`)
+ORDER BY CAST(robot AS INTEGER) ASC, m DESC
+LIMIT 10;
+
+-- No CBO test: it should work
+set hive.cbo.enable=false;
+EXPLAIN EXTENDED
+SELECT robot, floor_day(`__time`), max(added) as m, sum(delta) as s
+FROM druid_table_1
+GROUP BY robot, language, floor_day(`__time`)
+ORDER BY CAST(robot AS INTEGER) ASC, m DESC
+LIMIT 10;
diff --git ql/src/test/results/clientpositive/druid_basic2.q.out ql/src/test/results/clientpositive/druid_basic2.q.out
index 74dccf0..bc9410b 100644
--- ql/src/test/results/clientpositive/druid_basic2.q.out
+++ ql/src/test/results/clientpositive/druid_basic2.q.out
@@ -529,3 +529,431 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
+PREHOOK: query: EXPLAIN EXTENDED
+SELECT robot, floor_day(`__time`), max(added) as m, sum(delta) as s
+FROM druid_table_1
+GROUP BY robot, language, floor_day(`__time`)
+ORDER BY CAST(robot AS INTEGER) ASC, m DESC
+LIMIT 10
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN EXTENDED
+SELECT robot, floor_day(`__time`), max(added) as m, sum(delta) as s
+FROM druid_table_1
+GROUP BY robot, language, floor_day(`__time`)
+ORDER BY CAST(robot AS INTEGER) ASC, m DESC
+LIMIT 10
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: druid_table_1
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            GatherStats: false
+            Select Operator
+              expressions: robot (type: string), language (type: string), __time (type: timestamp), added (type: float), delta (type: float)
+              outputColumnNames: robot, language, __time, added, delta
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+              Group By Operator
+                aggregations: max(added), sum(delta)
+                keys: robot (type: string), language (type: string), floor_day(__time) (type: timestamp)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: timestamp)
+                  null sort order: aaa
+                  sort order: +++
+                  Map-reduce partition columns: _col0 (type: string), _col1 (type: string), _col2 (type: timestamp)
+                  Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                  tag: -1
+                  value expressions: _col3 (type: float), _col4 (type: double)
+                  auto parallelism: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: druid_table_1
+            input format: org.apache.hadoop.hive.druid.io.DruidQueryBasedInputFormat
+            output format: org.apache.hadoop.hive.druid.io.DruidOutputFormat
+            properties:
+              COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+              EXTERNAL TRUE
+              bucket_count -1
+              column.name.delimiter ,
+              columns __time,robot,namespace,anonymous,unpatrolled,page,language,newpage,user,count,added,delta,variation,deleted
+              columns.comments 'from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer'
+              columns.types timestamp:string:string:string:string:string:string:string:string:float:float:float:float:float
+              druid.datasource wikipedia
+#### A masked pattern was here ####
+              name default.druid_table_1
+              numFiles 0
+              numRows 0
+              rawDataSize 0
+              serialization.ddl struct druid_table_1 { timestamp __time, string robot, string namespace, string anonymous, string unpatrolled, string page, string language, string newpage, string user, float count, float added, float delta, float variation, float deleted}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.druid.QTestDruidSerDe
+              storage_handler org.apache.hadoop.hive.druid.QTestDruidStorageHandler
+              totalSize 0
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.druid.QTestDruidSerDe
+
+            input format: org.apache.hadoop.hive.druid.io.DruidQueryBasedInputFormat
+            output format: org.apache.hadoop.hive.druid.io.DruidOutputFormat
+            properties:
+              COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+              EXTERNAL TRUE
+              bucket_count -1
+              column.name.delimiter ,
+              columns __time,robot,namespace,anonymous,unpatrolled,page,language,newpage,user,count,added,delta,variation,deleted
+              columns.comments 'from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer'
+              columns.types timestamp:string:string:string:string:string:string:string:string:float:float:float:float:float
+              druid.datasource wikipedia
+#### A masked pattern was here ####
+              name default.druid_table_1
+              numFiles 0
+              numRows 0
+              rawDataSize 0
+              serialization.ddl struct druid_table_1 { timestamp __time, string robot, string namespace, string anonymous, string unpatrolled, string page, string language, string newpage, string user, float count, float added, float delta, float variation, float deleted}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.druid.QTestDruidSerDe
+              storage_handler org.apache.hadoop.hive.druid.QTestDruidStorageHandler
+              totalSize 0
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.druid.QTestDruidSerDe
+            name: default.druid_table_1
+          name: default.druid_table_1
+      Truncated Path -> Alias:
+        /druid_table_1 [druid_table_1]
+      Needs Tagging: false
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: max(VALUE._col0), sum(VALUE._col1)
+          keys: KEY._col0 (type: string), KEY._col1 (type: string), KEY._col2 (type: timestamp)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2, _col3, _col4
+          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+          Select Operator
+            expressions: _col0 (type: string), _col2 (type: timestamp), _col3 (type: float), _col4 (type: double)
+            outputColumnNames: _col0, _col1, _col2, _col3
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  properties:
+                    column.name.delimiter ,
+                    columns _col0,_col1,_col2,_col3
+                    columns.types string,timestamp,float,double
+                    escape.delim \
+                    serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+                  serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            GatherStats: false
+            Reduce Output Operator
+              key expressions: UDFToInteger(_col0) (type: int), _col2 (type: float)
+              null sort order: az
+              sort order: +-
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+              tag: -1
+              TopN: 10
+              TopN Hash Memory Usage: 0.1
+              value expressions: _col0 (type: string), _col1 (type: timestamp), _col3 (type: double)
+              auto parallelism: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: -mr-10004
+            input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+            properties:
+              column.name.delimiter ,
+              columns _col0,_col1,_col2,_col3
+              columns.types string,timestamp,float,double
+              escape.delim \
+              serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+            serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+            input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+            properties:
+              column.name.delimiter ,
+              columns _col0,_col1,_col2,_col3
+              columns.types string,timestamp,float,double
+              escape.delim \
+              serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+            serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+      Truncated Path -> Alias:
+#### A masked pattern was here ####
+      Needs Tagging: false
+      Reduce Operator Tree:
+        Select Operator
+          expressions: VALUE._col0 (type: string), VALUE._col1 (type: timestamp), KEY.reducesinkkey1 (type: float), VALUE._col2 (type: double)
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+          Limit
+            Number of rows: 10
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+#### A masked pattern was here ####
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  properties:
+                    columns _col0,_col1,_col2,_col3
+                    columns.types string:timestamp:float:double
+                    escape.delim \
+                    hive.serialization.extend.additional.nesting.levels true
+                    serialization.escape.crlf true
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 10
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: EXPLAIN EXTENDED
+SELECT robot, floor_day(`__time`), max(added) as m, sum(delta) as s
+FROM druid_table_1
+GROUP BY robot, language, floor_day(`__time`)
+ORDER BY CAST(robot AS INTEGER) ASC, m DESC
+LIMIT 10
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN EXTENDED
+SELECT robot, floor_day(`__time`), max(added) as m, sum(delta) as s
+FROM druid_table_1
+GROUP BY robot, language, floor_day(`__time`)
+ORDER BY CAST(robot AS INTEGER) ASC, m DESC
+LIMIT 10
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: druid_table_1
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            GatherStats: false
+            Select Operator
+              expressions: robot (type: string), language (type: string), __time (type: timestamp), added (type: float), delta (type: float)
+              outputColumnNames: robot, language, __time, added, delta
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+              Group By Operator
+                aggregations: max(added), sum(delta)
+                keys: robot (type: string), language (type: string), floor_day(__time) (type: timestamp)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: timestamp)
+                  null sort order: aaa
+                  sort order: +++
+                  Map-reduce partition columns: _col0 (type: string), _col1 (type: string), _col2 (type: timestamp)
+                  Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                  tag: -1
+                  value expressions: _col3 (type: float), _col4 (type: double)
+                  auto parallelism: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: druid_table_1
+            input format: org.apache.hadoop.hive.druid.io.DruidQueryBasedInputFormat
+            output format: org.apache.hadoop.hive.druid.io.DruidOutputFormat
+            properties:
+              COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+              EXTERNAL TRUE
+              bucket_count -1
+              column.name.delimiter ,
+              columns __time,robot,namespace,anonymous,unpatrolled,page,language,newpage,user,count,added,delta,variation,deleted
+              columns.comments 'from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer'
+              columns.types timestamp:string:string:string:string:string:string:string:string:float:float:float:float:float
+              druid.datasource wikipedia
+#### A masked pattern was here ####
+              name default.druid_table_1
+              numFiles 0
+              numRows 0
+              rawDataSize 0
+              serialization.ddl struct druid_table_1 { timestamp __time, string robot, string namespace, string anonymous, string unpatrolled, string page, string language, string newpage, string user, float count, float added, float delta, float variation, float deleted}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.druid.QTestDruidSerDe
+              storage_handler org.apache.hadoop.hive.druid.QTestDruidStorageHandler
+              totalSize 0
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.druid.QTestDruidSerDe
+
+            input format: org.apache.hadoop.hive.druid.io.DruidQueryBasedInputFormat
+            output format: org.apache.hadoop.hive.druid.io.DruidOutputFormat
+            properties:
+              COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+              EXTERNAL TRUE
+              bucket_count -1
+              column.name.delimiter ,
+              columns __time,robot,namespace,anonymous,unpatrolled,page,language,newpage,user,count,added,delta,variation,deleted
+              columns.comments 'from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer'
+              columns.types timestamp:string:string:string:string:string:string:string:string:float:float:float:float:float
+              druid.datasource wikipedia
+#### A masked pattern was here ####
+              name default.druid_table_1
+              numFiles 0
+              numRows 0
+              rawDataSize 0
+              serialization.ddl struct druid_table_1 { timestamp __time, string robot, string namespace, string anonymous, string unpatrolled, string page, string language, string newpage, string user, float count, float added, float delta, float variation, float deleted}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.druid.QTestDruidSerDe
+              storage_handler org.apache.hadoop.hive.druid.QTestDruidStorageHandler
+              totalSize 0
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.druid.QTestDruidSerDe
+            name: default.druid_table_1
+          name: default.druid_table_1
+      Truncated Path -> Alias:
+        /druid_table_1 [druid_table_1]
+      Needs Tagging: false
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: max(VALUE._col0), sum(VALUE._col1)
+          keys: KEY._col0 (type: string), KEY._col1 (type: string), KEY._col2 (type: timestamp)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2, _col3, _col4
+          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+          Select Operator
+            expressions: _col0 (type: string), _col2 (type: timestamp), _col3 (type: float), _col4 (type: double)
+            outputColumnNames: _col0, _col1, _col2, _col3
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  properties:
+                    column.name.delimiter ,
+                    columns _col0,_col1,_col2,_col3
+                    columns.types string,timestamp,float,double
+                    escape.delim \
+                    serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+                  serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            GatherStats: false
+            Reduce Output Operator
+              key expressions: UDFToInteger(_col0) (type: int), _col2 (type: float)
+              null sort order: az
+              sort order: +-
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+              tag: -1
+              TopN: 10
+              TopN Hash Memory Usage: 0.1
+              value expressions: _col0 (type: string), _col1 (type: timestamp), _col3 (type: double)
+              auto parallelism: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: -mr-10003
+            input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+            properties:
+              column.name.delimiter ,
+              columns _col0,_col1,_col2,_col3
+              columns.types string,timestamp,float,double
+              escape.delim \
+              serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+            serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+            input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+            properties:
+              column.name.delimiter ,
+              columns _col0,_col1,_col2,_col3
+              columns.types string,timestamp,float,double
+              escape.delim \
+              serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+            serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+      Truncated Path -> Alias:
+#### A masked pattern was here ####
+      Needs Tagging: false
+      Reduce Operator Tree:
+        Select Operator
+          expressions: VALUE._col0 (type: string), VALUE._col1 (type: timestamp), KEY.reducesinkkey1 (type: float), VALUE._col2 (type: double)
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+          Limit
+            Number of rows: 10
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+#### A masked pattern was here ####
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  properties:
+                    columns _col0,_col1,_col2,_col3
+                    columns.types string:timestamp:float:double
+                    escape.delim \
+                    hive.serialization.extend.additional.nesting.levels true
+                    serialization.escape.crlf true
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 10
+      Processor Tree:
+        ListSink
+