Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-31705

Push more possible predicates through Join via CNF

    XMLWordPrintableJSON

    Details

    • Type: Improvement
    • Status: Resolved
    • Priority: Major
    • Resolution: Fixed
    • Affects Version/s: 3.1.0
    • Fix Version/s: 3.1.0
    • Component/s: SQL
    • Labels:
      None

      Description

      Rewrite join condition to conjunctive normal form to push more conditions to filter.

      PostgreSQL:

      CREATE TABLE lineitem (l_orderkey BIGINT, l_partkey BIGINT, l_suppkey BIGINT,   
      l_linenumber INT, l_quantity DECIMAL(10,0), l_extendedprice DECIMAL(10,0),        
      l_discount DECIMAL(10,0), l_tax DECIMAL(10,0), l_returnflag varchar(255),               
      l_linestatus varchar(255), l_shipdate DATE, l_commitdate DATE, l_receiptdate DATE,    
      l_shipinstruct varchar(255), l_shipmode varchar(255), l_comment varchar(255));
        
      CREATE TABLE orders (                                                    
      o_orderkey BIGINT, o_custkey BIGINT, o_orderstatus varchar(255),           
      o_totalprice DECIMAL(10,0), o_orderdate DATE, o_orderpriority varchar(255),
      o_clerk varchar(255), o_shippriority INT, o_comment varchar(255));  
      
      EXPLAIN
      SELECT Count(*)
      FROM   lineitem,
             orders
      WHERE  l_orderkey = o_orderkey
             AND ( ( l_suppkey > 3
                     AND o_custkey > 13 )
                    OR ( l_suppkey > 1
                         AND o_custkey > 11 ) )
             AND l_partkey > 19;
      
      EXPLAIN
      SELECT Count(*)
      FROM   lineitem
             JOIN orders
               ON l_orderkey = o_orderkey
                  AND ( ( l_suppkey > 3
                          AND o_custkey > 13 )
                         OR ( l_suppkey > 1
                              AND o_custkey > 11 ) )
                  AND l_partkey > 19;
      
      EXPLAIN
      SELECT Count(*) 
      FROM   lineitem, 
             orders 
      WHERE  l_orderkey = o_orderkey 
             AND NOT ( ( l_suppkey > 3 
                         AND ( l_suppkey > 2 
                                OR o_custkey > 13 ) ) 
                        OR ( l_suppkey > 1 
                             AND o_custkey > 11 ) ) 
             AND l_partkey > 19;
      
      postgres=# EXPLAIN
      postgres-# SELECT Count(*)
      postgres-# FROM   lineitem,
      postgres-#        orders
      postgres-# WHERE  l_orderkey = o_orderkey
      postgres-#        AND ( ( l_suppkey > 3
      postgres(#                AND o_custkey > 13 )
      postgres(#               OR ( l_suppkey > 1
      postgres(#                    AND o_custkey > 11 ) )
      postgres-#        AND l_partkey > 19;
                                                                     QUERY PLAN
      -----------------------------------------------------------------------------------------------------------------------------------------
       Aggregate  (cost=21.18..21.19 rows=1 width=8)
         ->  Hash Join  (cost=10.60..21.17 rows=2 width=0)
               Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
               Join Filter: (((lineitem.l_suppkey > 3) AND (orders.o_custkey > 13)) OR ((lineitem.l_suppkey > 1) AND (orders.o_custkey > 11)))
               ->  Seq Scan on orders  (cost=0.00..10.45 rows=17 width=16)
                     Filter: ((o_custkey > 13) OR (o_custkey > 11))
               ->  Hash  (cost=10.53..10.53 rows=6 width=16)
                     ->  Seq Scan on lineitem  (cost=0.00..10.53 rows=6 width=16)
                           Filter: ((l_partkey > 19) AND ((l_suppkey > 3) OR (l_suppkey > 1)))
      (9 rows)
      
      postgres=# EXPLAIN
      postgres-# SELECT Count(*)
      postgres-# FROM   lineitem
      postgres-#        JOIN orders
      postgres-#          ON l_orderkey = o_orderkey
      postgres-#             AND ( ( l_suppkey > 3
      postgres(#                     AND o_custkey > 13 )
      postgres(#                    OR ( l_suppkey > 1
      postgres(#                         AND o_custkey > 11 ) )
      postgres-#             AND l_partkey > 19;
                                                                     QUERY PLAN
      -----------------------------------------------------------------------------------------------------------------------------------------
       Aggregate  (cost=21.18..21.19 rows=1 width=8)
         ->  Hash Join  (cost=10.60..21.17 rows=2 width=0)
               Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
               Join Filter: (((lineitem.l_suppkey > 3) AND (orders.o_custkey > 13)) OR ((lineitem.l_suppkey > 1) AND (orders.o_custkey > 11)))
               ->  Seq Scan on orders  (cost=0.00..10.45 rows=17 width=16)
                     Filter: ((o_custkey > 13) OR (o_custkey > 11))
               ->  Hash  (cost=10.53..10.53 rows=6 width=16)
                     ->  Seq Scan on lineitem  (cost=0.00..10.53 rows=6 width=16)
                           Filter: ((l_partkey > 19) AND ((l_suppkey > 3) OR (l_suppkey > 1)))
      (9 rows)
      
      postgres=# EXPLAIN
      postgres-# SELECT Count(*)
      postgres-# FROM   lineitem,
      postgres-#        orders
      postgres-# WHERE  l_orderkey = o_orderkey
      postgres-#        AND NOT ( ( l_suppkey > 3
      postgres(#                    AND ( l_suppkey > 2
      postgres(#                           OR o_custkey > 13 ) )
      postgres(#                   OR ( l_suppkey > 1
      postgres(#                        AND o_custkey > 11 ) )
      postgres-#        AND l_partkey > 19;
                                                                                       QUERY PLAN
      ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
       Aggregate  (cost=21.15..21.16 rows=1 width=8)
         ->  Hash Join  (cost=10.60..21.15 rows=2 width=0)
               Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
               Join Filter: (((lineitem.l_suppkey <= 1) OR (orders.o_custkey <= 11)) AND ((lineitem.l_suppkey <= 3) OR ((lineitem.l_suppkey <= 2) AND (orders.o_custkey <= 13))))
               ->  Seq Scan on orders  (cost=0.00..10.30 rows=30 width=16)
               ->  Hash  (cost=10.53..10.53 rows=6 width=16)
                     ->  Seq Scan on lineitem  (cost=0.00..10.53 rows=6 width=16)
                           Filter: ((l_partkey > 19) AND ((l_suppkey <= 3) OR (l_suppkey <= 2)))
      (8 rows)
      

      https://docs.teradata.com/reader/i_VlYHwN0b8knh6AEWrv1Q/Bh~37Qcc2~24P_jn2~0w6w

        Attachments

          Issue Links

            Activity

              People

              • Assignee:
                Gengliang.Wang Gengliang Wang
                Reporter:
                yumwang Yuming Wang
              • Votes:
                0 Vote for this issue
                Watchers:
                6 Start watching this issue

                Dates

                • Created:
                  Updated:
                  Resolved: