Details
-
Improvement
-
Status: Open
-
Major
-
Resolution: Unresolved
-
None
-
None
Description
git.commit.id.abbrev=142e577
Query :
select max(length(sub.str_var)) from ( select str_var, tinyint_var from `wide-strings` order by tinyint_var) sub group by sub.tinyint_var;
Plan :
+------------+------------+ | text | json | +------------+------------+ | 00-00 Screen 00-01 Project(EXPR$0=[$1]) 00-02 HashAgg(group=[{0}], EXPR$0=[MAX($1)]) 00-03 Project(tinyint_var=[$1], $f1=[LENGTH($0)]) 00-04 SelectionVectorRemover 00-05 Sort(sort0=[$1], dir0=[ASC]) 00-06 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/data-shapes/wide-columns/flat/wide-strings]], selectionRoot=/drill/testdata/data-shapes/wide-columns/flat/wide-strings, numFiles=1, columns=[`str_var`, `tinyint_var`]]]) | { "head" : { "version" : 1, "generator" : { "type" : "ExplainHandler", "info" : "" }, "type" : "APACHE_DRILL_PHYSICAL", "options" : [ ], "queue" : 0, "resultMode" : "EXEC" }, "graph" : [ { "pop" : "parquet-scan", "@id" : 6, "entries" : [ { "path" : "maprfs:/drill/testdata/data-shapes/wide-columns/flat/wide-strings" } ], "storage" : { "type" : "file", "enabled" : true, "connection" : "maprfs:///", "workspaces" : { "root" : { "location" : "/", "writable" : false, "defaultInputFormat" : null }, "tmp" : { "location" : "/tmp", "writable" : true, "defaultInputFormat" : "csv" }, "drillTestDir" : { "location" : "/drill/testdata/", "writable" : true, "defaultInputFormat" : "parquet" }, "drillTestDirAmplab" : { "location" : "/drill/testdata/amplab", "writable" : true, "defaultInputFormat" : "parquet" }, "drillTestDirInformationSchema" : { "location" : "/drill/testdata/information-schema", "writable" : true, "defaultInputFormat" : "parquet" }, "drillTestDirUdfs" : { "location" : "/drill/testdata/udfs/", "writable" : true, "defaultInputFormat" : "parquet" }, "drillTestDirP1" : { "location" : "/drill/testdata/p1tests", "writable" : true, "defaultInputFormat" : "parquet" }, "drillTestDirTpch10Parquet" : { "location" : "/drill/testdata/tpch10", "writable" : true, "defaultInputFormat" : "parquet" }, "Join" : { "location" : "/drill/testdata/join", "writable" : true, "defaultInputFormat" : "parquet" }, "NoExtJson" : { "location" : "/drill/testdata/no-extension/json", "writable" : true, "defaultInputFormat" : "json" }, "NoExtParquet" : { "location" : "/drill/testdata/no-extension/parquet", "writable" : true, "defaultInputFormat" : "parquet" }, "NoExtParquetNull" : { "location" : "/drill/testdata/no-extension/parquet", "writable" : true, "defaultInputFormat" : null }, "NoExtText" : { "location" : "/drill/testdata/no-extension/text", "writable" : true, "defaultInputFormat" : "psv" }, "drillTestDirExchanges" : { "location" : "/drill/testdata/exchanges_test", "writable" : true, "defaultInputFormat" : "parquet" }, "TpcHMulti" : { "location" : "/drill/testdata/tpch-multi", "writable" : true, "defaultInputFormat" : "parquet" }, "TpcHMulti100" : { "location" : "/drill/testdata/SF100", "writable" : true, "defaultInputFormat" : "parquet" }, "TpcHMulti1" : { "location" : "/drill/testdata/tpch_SF1", "writable" : true, "defaultInputFormat" : "parquet" }, "drillTestDirExplicit" : { "location" : "/drill/testdata/explicit_cast", "writable" : true, "defaultInputFormat" : "parquet" }, "drillTestDirImplicit" : { "location" : "/drill/testdata/implicit_cast", "writable" : true, "defaultInputFormat" : "parquet" }, "drillTestDirImplicit1" : { "location" : "/drill/testdata/implicit_cast", "writable" : true, "defaultInputFormat" : "parquet" }, "drillTestDirTPCDS" : { "location" : "/user/root/tpcds/parquet", "writable" : true, "defaultInputFormat" : "parquet" }, "TPCDS" : { "location" : "/drill/testdata/tpcds", "writable" : true, "defaultInputFormat" : "parquet" }, "drillMondrian" : { "location" : "/user/root/mondrian", "writable" : true, "defaultInputFormat" : "parquet" }, "drillTestDirDatetime" : { "location" : "/drill/testdata/datetime/datasources", "writable" : true, "defaultInputFormat" : "parquet" }, "drillTestDirViews" : { "location" : "/drill/testdata/views/", "writable" : true, "defaultInputFormat" : "parquet" }, "drillTestDirNumerical" : { "location" : "/drill/testdata/numerical/", "writable" : true, "defaultInputFormat" : "parquet" }, "drillTestDirJson" : { "location" : "/drill/testdata/json_storage/", "writable" : true, "defaultInputFormat" : null }, "drillTestDirTestNewWS" : { "location" : "/drill/testdata/newWS/", "writable" : true, "defaultInputFormat" : null }, "drillTestDirTpch01Text" : { "location" : "/drill/testdata/Tpch0.01/text/", "writable" : true, "defaultInputFormat" : null }, "drillTestDirTpch01Json" : { "location" : "/drill/testdata/Tpch0.01/json/", "writable" : true, "defaultInputFormat" : null }, "drillTestDirTpch01Parquet" : { "location" : "/drill/testdata/Tpch0.01/parquet/", "writable" : true, "defaultInputFormat" : null }, "drillTestDirConvert" : { "location" : "/drill/testdata/convert", "writable" : true, "defaultInputFormat" : null }, "drillTestDirTpch100Text" : { "location" : "/drill/testdata/tpch100/text/", "writable" : true, "defaultInputFormat" : null }, "drillTestDirTpch100Parquet" : { "location" : "/drill/testdata/tpch100/parquet", "writable" : true, "defaultInputFormat" : null }, "drillTestDirAggregate1parquet" : { "location" : "/drill/testdata/tpcds/parquet/s1", "writable" : true, "defaultInputFormat" : null }, "drillTestDirAggregate1csv" : { "location" : "/drill/testdata/tpcds/csv/s1", "writable" : true, "defaultInputFormat" : null }, "drillTestDirAggregate1json" : { "location" : "/drill/testdata/tpcds/json/s1", "writable" : true, "defaultInputFormat" : null }, "drillTestDirMondrian" : { "location" : "/drill/testdata/mondrian", "writable" : true, "defaultInputFormat" : null }, "drillTestDirTpcdsImpalaSF1" : { "location" : "/drill/testdata/tpcds-impala-sf1", "writable" : true, "defaultInputFormat" : null }, "sandbox" : { "location" : "/sandbox", "writable" : true, "defaultInputFormat" : "parquet" }, "sandbox-logs" : { "location" : "/sandbox/flat", "writable" : true, "defaultInputFormat" : "parquet" }, "sandbox-json" : { "location" : "/sandbox/json", "writable" : true, "defaultInputFormat" : "parquet" } }, "formats" : { "psv" : { "type" : "text", "extensions" : [ "tbl" ], "delimiter" : "|" }, "dsv" : { "type" : "text", "extensions" : [ "dat" ], "delimiter" : "|" }, "csv" : { "type" : "text", "extensions" : [ "csv" ], "delimiter" : "," }, "tsv" : { "type" : "text", "extensions" : [ "tsv" ], "delimiter" : "\t" }, "parquet" : { "type" : "parquet" }, "json" : { "type" : "json" } } }, "format" : { "type" : "parquet" }, "columns" : [ "`str_var`", "`tinyint_var`" ], "selectionRoot" : "/drill/testdata/data-shapes/wide-columns/flat/wide-strings", "cost" : 50.0 }, { "pop" : "external-sort", "@id" : 5, "child" : 6, "orderings" : [ { "order" : "ASC", "expr" : "`tinyint_var`", "nullDirection" : "UNSPECIFIED" } ], "reverse" : false, "initialAllocation" : 20000000, "maxAllocation" : 10000000000, "cost" : 50.0 }, { "pop" : "selection-vector-remover", "@id" : 4, "child" : 5, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 50.0 }, { "pop" : "project", "@id" : 3, "exprs" : [ { "ref" : "`tinyint_var`", "expr" : "`tinyint_var`" }, { "ref" : "`$f1`", "expr" : "length(`str_var`) " } ], "child" : 4, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 50.0 }, { "pop" : "hash-aggregate", "@id" : 2, "child" : 3, "cardinality" : 1.0, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "groupByExprs" : [ { "ref" : "`tinyint_var`", "expr" : "`tinyint_var`" } ], "aggrExprs" : [ { "ref" : "`EXPR$0`", "expr" : "max(`$f1`) " } ], "cost" : 25.0 }, { "pop" : "project", "@id" : 1, "ex | +------------+------------+ 1 row selected (0.087 seconds)
The sub-query actually does an order by. So when we do a group by, it makes more sense cost-wise to use the streaming aggregate instead of hash-aggregate.
I attached the parquet file used. Let me know if you have any questions.