Uploaded image for project: 'Hive'
  1. Hive
  2. HIVE-21132

Semi join edge is not being removed despite max bloomfilter entries set to 1

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Closed
    • Major
    • Resolution: Fixed
    • 4.0.0
    • 4.0.0-alpha-1
    • Query Planning
    • None

    Description

      • Reproducer
        --! qt:dataset:lineitem
        --! qt:dataset:part
        --! qt:dataset:src
        
        set hive.support.concurrency=true;
        set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
        --set hive.compute.query.using.stats=false;
        set hive.mapred.mode=nonstrict;
        set hive.explain.user=false;
        set hive.optimize.ppd=true;
        set hive.ppd.remove.duplicatefilters=true;
        set hive.tez.dynamic.partition.pruning=true;
        set hive.tez.dynamic.semijoin.reduction=true;
        set hive.optimize.metadataonly=false;
        set hive.optimize.index.filter=true;
        set hive.stats.autogather=true;
        set hive.tez.bigtable.minsize.semijoin.reduction=1;
        set hive.tez.min.bloom.filter.entries=1;
        set hive.stats.fetch.column.stats=true;
        set hive.tez.bloom.filter.factor=1.0f;
        set hive.auto.convert.join=false;
        set hive.optimize.shared.work=false;
        
        
        create database tpch_test;
        use tpch_test;
        
        CREATE TABLE `customer`(
          `c_custkey` bigint, 
          `c_name` string, 
          `c_address` string, 
          `c_nationkey` bigint, 
          `c_phone` string, 
          `c_acctbal` double, 
          `c_mktsegment` string, 
          `c_comment` string)
        ROW FORMAT SERDE 
          'org.apache.hadoop.hive.ql.io.orc.OrcSerde' 
        STORED AS INPUTFORMAT 
          'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' 
        OUTPUTFORMAT 
          'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
        TBLPROPERTIES (
          'bucketing_version'='2', 
          'transactional'='true', 
          'transactional_properties'='default', 
          'transient_lastDdlTime'='1543026723');
        
        CREATE TABLE `lineitem`(
          `l_orderkey` bigint, 
          `l_partkey` bigint, 
          `l_suppkey` bigint, 
          `l_linenumber` int, 
          `l_quantity` double, 
          `l_extendedprice` double, 
          `l_discount` double, 
          `l_tax` double, 
          `l_returnflag` string, 
          `l_linestatus` string, 
          `l_shipdate` string, 
          `l_commitdate` string, 
          `l_receiptdate` string, 
          `l_shipinstruct` string, 
          `l_shipmode` string, 
          `l_comment` string)
        ROW FORMAT SERDE 
          'org.apache.hadoop.hive.ql.io.orc.OrcSerde' 
        STORED AS INPUTFORMAT 
          'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' 
        OUTPUTFORMAT 
          'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
        TBLPROPERTIES (
          'bucketing_version'='2', 
          'transactional'='true', 
          'transactional_properties'='default', 
          'transient_lastDdlTime'='1543027179');
        
        CREATE TABLE `orders`(
          `o_orderkey` bigint, 
          `o_custkey` bigint, 
          `o_orderstatus` string, 
          `o_totalprice` double, 
          `o_orderdate` string, 
          `o_orderpriority` string, 
          `o_clerk` string, 
          `o_shippriority` int, 
          `o_comment` string)
        ROW FORMAT SERDE 
          'org.apache.hadoop.hive.ql.io.orc.OrcSerde' 
        STORED AS INPUTFORMAT 
          'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' 
        OUTPUTFORMAT 
          'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
        TBLPROPERTIES (
          'bucketing_version'='2', 
          'transactional'='true', 
          'transactional_properties'='default', 
          'transient_lastDdlTime'='1543026824');
        
        alter table customer update statistics set('numRows'='150000000','rawDataSize'='8633707142');
        alter table lineitem update statistics set('numRows'='5999989709','rawDataSize'='184245066955');
        alter table orders update statistics set('numRows'='1500000000','rawDataSize'='46741318253');
        
        
        create view q18_tmp_cached as
        select l_orderkey, sum(l_quantity) as t_sum_quantity
        from lineitem
        where l_orderkey is not null
        group by l_orderkey;
        
        -- Set bloom filter size to huge number so we get any possible semijoin reductions
        
        set hive.tez.min.bloom.filter.entries=0;
        set hive.tez.max.bloom.filter.entries=1;
        
        
        create table q18_large_volume_customer_cached stored as orc tblproperties ('transactional'='true', 'transactional_properties'='default') as
        select c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice, sum(l_quantity)
        from customer, orders, q18_tmp_cached t, lineitem l
        where
          c_custkey = o_custkey and o_orderkey = t.l_orderkey
          and o_orderkey is not null and t.t_sum_quantity > 300
          and o_orderkey = l.l_orderkey and l.l_orderkey is not null
        group by c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
        order by o_totalprice desc, o_orderdate
        limit 100;
        
        drop database tpch_test cascade;
        

      To reproduce run the above as TestMiniLlapLocalCliDriver test

      Attachments

        1. HIVE-21132.4.patch
          36 kB
          Vineet Garg
        2. HIVE-21132.3.patch
          36 kB
          Vineet Garg
        3. HIVE-21132.2.patch
          36 kB
          Vineet Garg
        4. HIVE-21132.1.patch
          36 kB
          Vineet Garg

        Activity

          People

            vgarg Vineet Garg
            vgarg Vineet Garg
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: