diff --git druid-handler/src/java/org/apache/hadoop/hive/druid/serde/DruidSerDe.java druid-handler/src/java/org/apache/hadoop/hive/druid/serde/DruidSerDe.java
index ea84326..1d10365 100644
--- druid-handler/src/java/org/apache/hadoop/hive/druid/serde/DruidSerDe.java
+++ druid-handler/src/java/org/apache/hadoop/hive/druid/serde/DruidSerDe.java
@@ -130,7 +130,8 @@ public void initialize(Configuration configuration, Properties properties) throw
             new Function() {
               @Override
               public PrimitiveTypeInfo apply(String type) {
-                return TypeInfoFactory.getPrimitiveTypeInfo(type);
+                return TypeInfoFactory.getPrimitiveTypeInfo(
+                        DruidSerDeUtils.convertHiveToHiveSupportedTypeString(type));
               }
             }
         ));
diff --git druid-handler/src/java/org/apache/hadoop/hive/druid/serde/DruidSerDeUtils.java druid-handler/src/java/org/apache/hadoop/hive/druid/serde/DruidSerDeUtils.java
index 64a19f6..7be2e9b 100644
--- druid-handler/src/java/org/apache/hadoop/hive/druid/serde/DruidSerDeUtils.java
+++ druid-handler/src/java/org/apache/hadoop/hive/druid/serde/DruidSerDeUtils.java
@@ -82,4 +82,23 @@ public static String convertDruidToHiveTypeString(String typeName) {
     }
   }
 
+  /* This method converts from the String representation of Hive type
+   * to the String representation of Hive type supported by Druid */
+  public static String convertHiveToHiveSupportedTypeString(String typeName) {
+    switch (typeName) {
+      case serdeConstants.DECIMAL_TYPE_NAME:
+      case serdeConstants.DOUBLE_TYPE_NAME:
+      case serdeConstants.FLOAT_TYPE_NAME:
+        return serdeConstants.FLOAT_TYPE_NAME;
+      case serdeConstants.TINYINT_TYPE_NAME:
+      case serdeConstants.SMALLINT_TYPE_NAME:
+      case serdeConstants.INT_TYPE_NAME:
+      case serdeConstants.BIGINT_TYPE_NAME:
+        return serdeConstants.BIGINT_TYPE_NAME;
+      default:
+        // Return the same
+        return typeName;
+    }
+  }
+
 }
diff --git ql/src/test/queries/clientpositive/druid_basic2.q ql/src/test/queries/clientpositive/druid_basic2.q
index fe24410..d70f3c3 100644
--- ql/src/test/queries/clientpositive/druid_basic2.q
+++ ql/src/test/queries/clientpositive/druid_basic2.q
@@ -50,3 +50,19 @@ FROM
   FROM druid_table_1) b
   ON a.language = b.language
 );
+
+EXPLAIN EXTENDED
+SELECT robot, floor_day(`__time`), max(added) as m, sum(delta) as s
+FROM druid_table_1
+GROUP BY robot, language, floor_day(`__time`)
+ORDER BY CAST(robot AS INTEGER) ASC, m DESC
+LIMIT 10;
+
+-- No CBO test: it should work
+set hive.cbo.enable=false;
+EXPLAIN EXTENDED
+SELECT robot, floor_day(`__time`), max(added) as m, sum(delta) as s
+FROM druid_table_1
+GROUP BY robot, language, floor_day(`__time`)
+ORDER BY CAST(robot AS INTEGER) ASC, m DESC
+LIMIT 10;
diff --git ql/src/test/results/clientpositive/druid_basic2.q.out ql/src/test/results/clientpositive/druid_basic2.q.out
index 74dccf0..bc9410b 100644
--- ql/src/test/results/clientpositive/druid_basic2.q.out
+++ ql/src/test/results/clientpositive/druid_basic2.q.out
@@ -529,3 +529,431 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
+PREHOOK: query: EXPLAIN EXTENDED
+SELECT robot, floor_day(`__time`), max(added) as m, sum(delta) as s
+FROM druid_table_1
+GROUP BY robot, language, floor_day(`__time`)
+ORDER BY CAST(robot AS INTEGER) ASC, m DESC
+LIMIT 10
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN EXTENDED
+SELECT robot, floor_day(`__time`), max(added) as m, sum(delta) as s
+FROM druid_table_1
+GROUP BY robot, language, floor_day(`__time`)
+ORDER BY CAST(robot AS INTEGER) ASC, m DESC
+LIMIT 10
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: druid_table_1
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            GatherStats: false
+            Select Operator
+              expressions: robot (type: string), language (type: string), __time (type: timestamp), added (type: float), delta (type: float)
+              outputColumnNames: robot, language, __time, added, delta
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+              Group By Operator
+                aggregations: max(added), sum(delta)
+                keys: robot (type: string), language (type: string), floor_day(__time) (type: timestamp)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: timestamp)
+                  null sort order: aaa
+                  sort order: +++
+                  Map-reduce partition columns: _col0 (type: string), _col1 (type: string), _col2 (type: timestamp)
+                  Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                  tag: -1
+                  value expressions: _col3 (type: float), _col4 (type: double)
+                  auto parallelism: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: druid_table_1
+            input format: org.apache.hadoop.hive.druid.io.DruidQueryBasedInputFormat
+            output format: org.apache.hadoop.hive.druid.io.DruidOutputFormat
+            properties:
+              COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+              EXTERNAL TRUE
+              bucket_count -1
+              column.name.delimiter ,
+              columns __time,robot,namespace,anonymous,unpatrolled,page,language,newpage,user,count,added,delta,variation,deleted
+              columns.comments 'from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer'
+              columns.types timestamp:string:string:string:string:string:string:string:string:float:float:float:float:float
+              druid.datasource wikipedia
+#### A masked pattern was here ####
+              name default.druid_table_1
+              numFiles 0
+              numRows 0
+              rawDataSize 0
+              serialization.ddl struct druid_table_1 { timestamp __time, string robot, string namespace, string anonymous, string unpatrolled, string page, string language, string newpage, string user, float count, float added, float delta, float variation, float deleted}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.druid.QTestDruidSerDe
+              storage_handler org.apache.hadoop.hive.druid.QTestDruidStorageHandler
+              totalSize 0
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.druid.QTestDruidSerDe
+
+              input format: org.apache.hadoop.hive.druid.io.DruidQueryBasedInputFormat
+              output format: org.apache.hadoop.hive.druid.io.DruidOutputFormat
+              properties:
+                COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+                EXTERNAL TRUE
+                bucket_count -1
+                column.name.delimiter ,
+                columns __time,robot,namespace,anonymous,unpatrolled,page,language,newpage,user,count,added,delta,variation,deleted
+                columns.comments 'from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer'
+                columns.types timestamp:string:string:string:string:string:string:string:string:float:float:float:float:float
+                druid.datasource wikipedia
+#### A masked pattern was here ####
+                name default.druid_table_1
+                numFiles 0
+                numRows 0
+                rawDataSize 0
+                serialization.ddl struct druid_table_1 { timestamp __time, string robot, string namespace, string anonymous, string unpatrolled, string page, string language, string newpage, string user, float count, float added, float delta, float variation, float deleted}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.druid.QTestDruidSerDe
+                storage_handler org.apache.hadoop.hive.druid.QTestDruidStorageHandler
+                totalSize 0
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.druid.QTestDruidSerDe
+              name: default.druid_table_1
+            name: default.druid_table_1
+      Truncated Path -> Alias:
+        /druid_table_1 [druid_table_1]
+      Needs Tagging: false
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: max(VALUE._col0), sum(VALUE._col1)
+          keys: KEY._col0 (type: string), KEY._col1 (type: string), KEY._col2 (type: timestamp)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2, _col3, _col4
+          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+          Select Operator
+            expressions: _col0 (type: string), _col2 (type: timestamp), _col3 (type: float), _col4 (type: double)
+            outputColumnNames: _col0, _col1, _col2, _col3
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  properties:
+                    column.name.delimiter ,
+                    columns _col0,_col1,_col2,_col3
+                    columns.types string,timestamp,float,double
+                    escape.delim \
+                    serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+                  serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            GatherStats: false
+            Reduce Output Operator
+              key expressions: UDFToInteger(_col0) (type: int), _col2 (type: float)
+              null sort order: az
+              sort order: +-
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+              tag: -1
+              TopN: 10
+              TopN Hash Memory Usage: 0.1
+              value expressions: _col0 (type: string), _col1 (type: timestamp), _col3 (type: double)
+              auto parallelism: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: -mr-10004
+            input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+            properties:
+              column.name.delimiter ,
+              columns _col0,_col1,_col2,_col3
+              columns.types string,timestamp,float,double
+              escape.delim \
+              serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+            serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+              properties:
+                column.name.delimiter ,
+                columns _col0,_col1,_col2,_col3
+                columns.types string,timestamp,float,double
+                escape.delim \
+                serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+              serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+      Truncated Path -> Alias:
+#### A masked pattern was here ####
+      Needs Tagging: false
+      Reduce Operator Tree:
+        Select Operator
+          expressions: VALUE._col0 (type: string), VALUE._col1 (type: timestamp), KEY.reducesinkkey1 (type: float), VALUE._col2 (type: double)
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+          Limit
+            Number of rows: 10
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+#### A masked pattern was here ####
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  properties:
+                    columns _col0,_col1,_col2,_col3
+                    columns.types string:timestamp:float:double
+                    escape.delim \
+                    hive.serialization.extend.additional.nesting.levels true
+                    serialization.escape.crlf true
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 10
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: EXPLAIN EXTENDED
+SELECT robot, floor_day(`__time`), max(added) as m, sum(delta) as s
+FROM druid_table_1
+GROUP BY robot, language, floor_day(`__time`)
+ORDER BY CAST(robot AS INTEGER) ASC, m DESC
+LIMIT 10
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN EXTENDED
+SELECT robot, floor_day(`__time`), max(added) as m, sum(delta) as s
+FROM druid_table_1
+GROUP BY robot, language, floor_day(`__time`)
+ORDER BY CAST(robot AS INTEGER) ASC, m DESC
+LIMIT 10
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: druid_table_1
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            GatherStats: false
+            Select Operator
+              expressions: robot (type: string), language (type: string), __time (type: timestamp), added (type: float), delta (type: float)
+              outputColumnNames: robot, language, __time, added, delta
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+              Group By Operator
+                aggregations: max(added), sum(delta)
+                keys: robot (type: string), language (type: string), floor_day(__time) (type: timestamp)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: timestamp)
+                  null sort order: aaa
+                  sort order: +++
+                  Map-reduce partition columns: _col0 (type: string), _col1 (type: string), _col2 (type: timestamp)
+                  Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                  tag: -1
+                  value expressions: _col3 (type: float), _col4 (type: double)
+                  auto parallelism: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: druid_table_1
+            input format: org.apache.hadoop.hive.druid.io.DruidQueryBasedInputFormat
+            output format: org.apache.hadoop.hive.druid.io.DruidOutputFormat
+            properties:
+              COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+              EXTERNAL TRUE
+              bucket_count -1
+              column.name.delimiter ,
+              columns __time,robot,namespace,anonymous,unpatrolled,page,language,newpage,user,count,added,delta,variation,deleted
+              columns.comments 'from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer'
+              columns.types timestamp:string:string:string:string:string:string:string:string:float:float:float:float:float
+              druid.datasource wikipedia
+#### A masked pattern was here ####
+              name default.druid_table_1
+              numFiles 0
+              numRows 0
+              rawDataSize 0
+              serialization.ddl struct druid_table_1 { timestamp __time, string robot, string namespace, string anonymous, string unpatrolled, string page, string language, string newpage, string user, float count, float added, float delta, float variation, float deleted}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.druid.QTestDruidSerDe
+              storage_handler org.apache.hadoop.hive.druid.QTestDruidStorageHandler
+              totalSize 0
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.druid.QTestDruidSerDe
+
+              input format: org.apache.hadoop.hive.druid.io.DruidQueryBasedInputFormat
+              output format: org.apache.hadoop.hive.druid.io.DruidOutputFormat
+              properties:
+                COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+                EXTERNAL TRUE
+                bucket_count -1
+                column.name.delimiter ,
+                columns __time,robot,namespace,anonymous,unpatrolled,page,language,newpage,user,count,added,delta,variation,deleted
+                columns.comments 'from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer','from deserializer'
+                columns.types timestamp:string:string:string:string:string:string:string:string:float:float:float:float:float
+                druid.datasource wikipedia
+#### A masked pattern was here ####
+                name default.druid_table_1
+                numFiles 0
+                numRows 0
+                rawDataSize 0
+                serialization.ddl struct druid_table_1 { timestamp __time, string robot, string namespace, string anonymous, string unpatrolled, string page, string language, string newpage, string user, float count, float added, float delta, float variation, float deleted}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.druid.QTestDruidSerDe
+                storage_handler org.apache.hadoop.hive.druid.QTestDruidStorageHandler
+                totalSize 0
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.druid.QTestDruidSerDe
+              name: default.druid_table_1
+            name: default.druid_table_1
+      Truncated Path -> Alias:
+        /druid_table_1 [druid_table_1]
+      Needs Tagging: false
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: max(VALUE._col0), sum(VALUE._col1)
+          keys: KEY._col0 (type: string), KEY._col1 (type: string), KEY._col2 (type: timestamp)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2, _col3, _col4
+          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+          Select Operator
+            expressions: _col0 (type: string), _col2 (type: timestamp), _col3 (type: float), _col4 (type: double)
+            outputColumnNames: _col0, _col1, _col2, _col3
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  properties:
+                    column.name.delimiter ,
+                    columns _col0,_col1,_col2,_col3
+                    columns.types string,timestamp,float,double
+                    escape.delim \
+                    serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+                  serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            GatherStats: false
+            Reduce Output Operator
+              key expressions: UDFToInteger(_col0) (type: int), _col2 (type: float)
+              null sort order: az
+              sort order: +-
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+              tag: -1
+              TopN: 10
+              TopN Hash Memory Usage: 0.1
+              value expressions: _col0 (type: string), _col1 (type: timestamp), _col3 (type: double)
+              auto parallelism: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: -mr-10003
+            input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+            properties:
+              column.name.delimiter ,
+              columns _col0,_col1,_col2,_col3
+              columns.types string,timestamp,float,double
+              escape.delim \
+              serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+            serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+              properties:
+                column.name.delimiter ,
+                columns _col0,_col1,_col2,_col3
+                columns.types string,timestamp,float,double
+                escape.delim \
+                serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+              serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+      Truncated Path -> Alias:
+#### A masked pattern was here ####
+      Needs Tagging: false
+      Reduce Operator Tree:
+        Select Operator
+          expressions: VALUE._col0 (type: string), VALUE._col1 (type: timestamp), KEY.reducesinkkey1 (type: float), VALUE._col2 (type: double)
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+          Limit
+            Number of rows: 10
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+#### A masked pattern was here ####
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  properties:
+                    columns _col0,_col1,_col2,_col3
+                    columns.types string:timestamp:float:double
+                    escape.delim \
+                    hive.serialization.extend.additional.nesting.levels true
+                    serialization.escape.crlf true
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 10
+      Processor Tree:
+        ListSink
+
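
For reviewers, a minimal sketch of the Hive-to-Druid type narrowing introduced by the new DruidSerDeUtils.convertHiveToHiveSupportedTypeString method. It is not part of the patch: the class name, main method, and check helper below are hypothetical, while the serdeConstants names and the expected mappings are taken directly from the DruidSerDeUtils.java hunk above.

// Illustrative sketch only -- not part of the patch. Assumes the standard
// org.apache.hadoop.hive.serde2.serdeConstants values ("decimal", "double",
// "float", "tinyint", "smallint", "int", "bigint", "string").
import org.apache.hadoop.hive.druid.serde.DruidSerDeUtils;
import org.apache.hadoop.hive.serde2.serdeConstants;

public class DruidTypeMappingSketch {
  public static void main(String[] args) {
    // decimal/double/float all collapse to float (cf. the float metric columns
    // such as added and delta in druid_basic2.q.out).
    check(serdeConstants.DECIMAL_TYPE_NAME, serdeConstants.FLOAT_TYPE_NAME);
    check(serdeConstants.DOUBLE_TYPE_NAME, serdeConstants.FLOAT_TYPE_NAME);
    check(serdeConstants.FLOAT_TYPE_NAME, serdeConstants.FLOAT_TYPE_NAME);
    // tinyint/smallint/int/bigint all collapse to bigint.
    check(serdeConstants.TINYINT_TYPE_NAME, serdeConstants.BIGINT_TYPE_NAME);
    check(serdeConstants.SMALLINT_TYPE_NAME, serdeConstants.BIGINT_TYPE_NAME);
    check(serdeConstants.INT_TYPE_NAME, serdeConstants.BIGINT_TYPE_NAME);
    check(serdeConstants.BIGINT_TYPE_NAME, serdeConstants.BIGINT_TYPE_NAME);
    // Anything else (e.g. string) is returned unchanged by the default branch.
    check(serdeConstants.STRING_TYPE_NAME, serdeConstants.STRING_TYPE_NAME);
  }

  private static void check(String hiveType, String expected) {
    String actual = DruidSerDeUtils.convertHiveToHiveSupportedTypeString(hiveType);
    if (!expected.equals(actual)) {
      throw new AssertionError(hiveType + " mapped to " + actual + ", expected " + expected);
    }
  }
}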