Uploaded image for project: 'IMPALA'
  1. IMPALA
  2. IMPALA-5125

Check failed: tuple_desc_map_.back() != __null

    Details

      Description

      The following query seems to produce this dcheck reliably:

      USE tpch;
      
      SELECT
      IF(True, 514, COUNT(a1.ps_partkey)) AS int_col
      FROM partsupp a1
      WHERE
      (a1.ps_availqty) IN (SELECT
      (a2.ps_availqty) * (((a2.ps_partkey) + (538.4658339348)) - (-445.1448857941)) AS decimal_col
      FROM partsupp a2
      WHERE
      (False) AND ((a2.ps_partkey) < (a2.ps_suppkey)))
      
      #6  0x00000000028c316e in google::LogMessageFatal::~LogMessageFatal() ()
      #7  0x00000000013f1d1a in impala::RowDescriptor::RowDescriptor (this=0x93bfe48, desc_tbl=..., row_tuples=..., nullable_tuples=...) at /home/mikeb/Impala/be/src/runtime/descriptors.cc:364
      #8  0x00000000016a94a1 in impala::ExecNode::ExecNode (this=0x93bfe00, pool=0x9752b40, tnode=..., descs=...) at /home/mikeb/Impala/be/src/exec/exec-node.cc:134
      #9  0x000000000179c27a in impala::PartitionedAggregationNode::PartitionedAggregationNode (this=0x93bfe00, pool=0x9752b40, tnode=..., descs=...) at /home/mikeb/Impala/be/src/exec/partitioned-aggregation-node.cc:137
      #10 0x00000000016ab599 in impala::ExecNode::CreateNode (pool=0x9752b40, tnode=..., descs=..., node=0x7fcd18fa4878, state=0x9817500) at /home/mikeb/Impala/be/src/exec/exec-node.cc:303
      #11 0x00000000016aab70 in impala::ExecNode::CreateTreeHelper (state=0x9817500, tnodes=..., descs=..., parent=0x0, node_idx=0x7fcd18fa494c, root=0xad690d0) at /home/mikeb/Impala/be/src/exec/exec-node.cc:239
      #12 0x00000000016aa82e in impala::ExecNode::CreateTree (state=0x9817500, plan=..., descs=..., root=0xad690d0) at /home/mikeb/Impala/be/src/exec/exec-node.cc:217
      #13 0x0000000001a6fe26 in impala::PlanFragmentExecutor::PrepareInternal (this=0xad690d0, qs=0x9183c00, tdesc_tbl=..., fragment_ctx=..., instance_ctx=...) at /home/mikeb/Impala/be/src/runtime/plan-fragment-executor.cc:177
      #14 0x0000000001a6eaff in impala::PlanFragmentExecutor::Prepare (this=0xad690d0, query_state=0x9183c00, desc_tbl=..., fragment_ctx=..., instance_ctx=...) at /home/mikeb/Impala/be/src/runtime/plan-fragment-executor.cc:99
      #15 0x0000000001a6ba7b in impala::FragmentInstanceState::Exec (this=0xad68e00) at /home/mikeb/Impala/be/src/runtime/fragment-instance-state.cc:64
      #16 0x0000000001a77167 in impala::QueryExecMgr::ExecFInstance (this=0x9a46ba0, fis=0xad68e00) at /home/mikeb/Impala/be/src/runtime/query-exec-mgr.cc:110
      #17 0x0000000001a79f90 in boost::_mfi::mf1<void, impala::QueryExecMgr, impala::FragmentInstanceState*>::operator() (this=0x9752ac0, p=0x9a46ba0, a1=0xad68e00)
          at /home/mikeb/Impala/toolchain/boost-1.57.0-p1/include/boost/bind/mem_fn_template.hpp:165
      #18 0x0000000001a79e19 in boost::_bi::list2<boost::_bi::value<impala::QueryExecMgr*>, boost::_bi::value<impala::FragmentInstanceState*> >::operator()<boost::_mfi::mf1<void, impala::QueryExecMgr, impala::FragmentInstanceState*>, boost::_bi::list0> (this=0x9752ad0, f=..., a=...) at /home/mikeb/Impala/toolchain/boost-1.57.0-p1/include/boost/bind/bind.hpp:313
      #19 0x0000000001a7994d in boost::_bi::bind_t<void, boost::_mfi::mf1<void, impala::QueryExecMgr, impala::FragmentInstanceState*>, boost::_bi::list2<boost::_bi::value<impala::QueryExecMgr*>, boost::_bi::value<impala::FragmentInstanceState*> > >::operator() (this=0x9752ac0) at /home/mikeb/Impala/toolchain/boost-1.57.0-p1/include/boost/bind/bind_template.hpp:20
      #20 0x0000000001a79558 in boost::detail::function::void_function_obj_invoker0<boost::_bi::bind_t<void, boost::_mfi::mf1<void, impala::QueryExecMgr, impala::FragmentInstanceState*>, boost::_bi::list2<boost::_bi::value<impala::QueryExecMgr*>, boost::_bi::value<impala::FragmentInstanceState*> > >, void>::invoke (function_obj_ptr=...) at /home/mikeb/Impala/toolchain/boost-1.57.0-p1/include/boost/function/function_template.hpp:153
      #21 0x000000000137cb70 in boost::function0<void>::operator() (this=0x7fcd18fa5d20) at /home/mikeb/Impala/toolchain/boost-1.57.0-p1/include/boost/function/function_template.hpp:767
      #22 0x000000000162f1c7 in impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::Promise<long>*) (name=..., category=..., functor=..., thread_started=0x7fcd1a7aea80)
          at /home/mikeb/Impala/be/src/util/thread.cc:325
      #23 0x0000000001637ba2 in boost::_bi::list4<boost::_bi::value<std::string>, boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >, boost::_bi::value<impala::Promise<long>*> >::operator()<void (*)(std::string const&, std::string const&, boost::function<void ()>, impala::Promise<long>*), boost::_bi::list0>(boost::_bi::type<void>, void (*&)(std::string const&, std::string const&, boost::function<void ()>, impala::Promise<long>*), boost::_bi::list0&, int) (this=0x8a2c5c0, f=@0x8a2c5b8: 0x162ef02 <impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::Promise<long>*)>, a=...)
          at /home/mikeb/Impala/toolchain/boost-1.57.0-p1/include/boost/bind/bind.hpp:457
      #24 0x0000000001637ae5 in boost::_bi::bind_t<void, void (*)(std::string const&, std::string const&, boost::function<void ()>, impala::Promise<long>*), boost::_bi::list4<boost::_bi::value<std::string>, boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >, boost::_bi::value<impala::Promise<long>*> > >::operator()() (this=0x8a2c5b8) at /home/mikeb/Impala/toolchain/boost-1.57.0-p1/include/boost/bind/bind_template.hpp:20
      #25 0x0000000001637aa8 in boost::detail::thread_data<boost::_bi::bind_t<void, void (*)(std::string const&, std::string const&, boost::function<void ()>, impala::Promise<long>*), boost::_bi::list4<boost::_bi::value<std::string>, boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >, boost::_bi::value<impala::Promise<long>*> > > >::run() (this=0x8a2c400) at /home/mikeb/Impala/toolchain/boost-1.57.0-p1/include/boost/thread/detail/thread.hpp:116
      #26 0x0000000001af148a in thread_proxy ()
      #27 0x00007fcda3de6184 in start_thread (arg=0x7fcd18fa6700) at pthread_create.c:312
      #28 0x00007fcda3b1337d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
      

      Commit: IMPALA-5113: fix dirty unpinned invariant

      Workaround

      SET ENABLE_EXPR_REWRITES=FALSE;
      

        Issue Links

          Activity

          Hide
          twmarshall Thomas Tauber-Marshall added a comment -

          commit 915a16345c9325f29cad2a4c113d960e434b4ba7
          Author: Thomas Tauber-Marshall <tmarshall@cloudera.com>
          Date: Fri Apr 14 12:36:46 2017 -0700

          IMPALA-5125: SimplifyConditionalsRule incorrectly handles aggregates

          This patch addresses 3 issues:

          • SelectList.reset() didn't properly reset some of its members, though
            they're documented as needing to be reset. This was causing a crash
            when the Planner attempted to make an aggregation node for an agg
            function that had been eliminated by expr rewriting. While I'm here,
            I added resetting of all of SelectList's members that need to be
            reset, and fixed the documentation of one member that shouldn't be
            reset.
          • SimplifyConditionalsRule was changing the meaning of queries that
            contain agg functions, e.g. because "select if(true, 0, sum(id))"
            is not equivalent to "select 0". The fix is to not return the
            simplified expr if it removes all aggregates.
          • ExprRewriteRulesTest was performing rewrites on the result exprs of
            the SelectStmt, which causes problems if the result exprs have been
            substituted. In normal query execution, we don't rewrite the result
            exprs anyway, so the fix is to match normal query execution and
            rewrite the select list exprs.

          Testing:

          • Added e2e test to exprs.test.
          • Added unit test to ExprRewriteRulesTest.

          Change-Id: Ic20b1621753980b47a612e0885804363b733f6da
          Reviewed-on: http://gerrit.cloudera.org:8080/6653
          Reviewed-by: Thomas Tauber-Marshall <tmarshall@cloudera.com>
          Tested-by: Impala Public Jenkins

          Show
          twmarshall Thomas Tauber-Marshall added a comment - commit 915a16345c9325f29cad2a4c113d960e434b4ba7 Author: Thomas Tauber-Marshall <tmarshall@cloudera.com> Date: Fri Apr 14 12:36:46 2017 -0700 IMPALA-5125 : SimplifyConditionalsRule incorrectly handles aggregates This patch addresses 3 issues: SelectList.reset() didn't properly reset some of its members, though they're documented as needing to be reset. This was causing a crash when the Planner attempted to make an aggregation node for an agg function that had been eliminated by expr rewriting. While I'm here, I added resetting of all of SelectList's members that need to be reset, and fixed the documentation of one member that shouldn't be reset. SimplifyConditionalsRule was changing the meaning of queries that contain agg functions, e.g. because "select if(true, 0, sum(id))" is not equivalent to "select 0". The fix is to not return the simplified expr if it removes all aggregates. ExprRewriteRulesTest was performing rewrites on the result exprs of the SelectStmt, which causes problems if the result exprs have been substituted. In normal query execution, we don't rewrite the result exprs anyway, so the fix is to match normal query execution and rewrite the select list exprs. Testing: Added e2e test to exprs.test. Added unit test to ExprRewriteRulesTest. Change-Id: Ic20b1621753980b47a612e0885804363b733f6da Reviewed-on: http://gerrit.cloudera.org:8080/6653 Reviewed-by: Thomas Tauber-Marshall <tmarshall@cloudera.com> Tested-by: Impala Public Jenkins
          Hide
          alex.behm Alexander Behm added a comment -

          Thomas Tauber-Marshall good thoughts. I'm thinking that options 2-4 are too complicated and the gain is questionable.

          How about the following more refined variant of option 1? Before simplification we check if the expr contains an aggregate expr. If the simplified expr does not contain an aggregate expr anymore, then return the original expr.
          In particular, I'm thinking that these cases can still be simplified. You can think of similar cases with the other conditionals.

          IF(true, count(int_col), sum(bigint_col));
          IF(true, count(int_col), bool_col);
          
          Show
          alex.behm Alexander Behm added a comment - Thomas Tauber-Marshall good thoughts. I'm thinking that options 2-4 are too complicated and the gain is questionable. How about the following more refined variant of option 1? Before simplification we check if the expr contains an aggregate expr. If the simplified expr does not contain an aggregate expr anymore, then return the original expr. In particular, I'm thinking that these cases can still be simplified. You can think of similar cases with the other conditionals. IF( true , count(int_col), sum(bigint_col)); IF( true , count(int_col), bool_col);
          Hide
          mikesbrown Michael Brown added a comment -

          Note IMPALA-5206, which looks very similar and could even be a dupe, though the effect is different (NPE, not DCHECK).

          Show
          mikesbrown Michael Brown added a comment - Note IMPALA-5206 , which looks very similar and could even be a dupe, though the effect is different (NPE, not DCHECK).
          Hide
          twmarshall Thomas Tauber-Marshall added a comment -

          So the immediate cause of the crash is that SelectList doesn't properly reset some of its members, aggInfo_ in this case, which causes the planner to try to create an aggregation node for an agg function that's no longer part of the query. Setting 'aggInfo_ = null' in SelectStmt.reset() solves this and allows the query to pass.

          However, we still have a problem of incorrect results. Consider the query:
          "select if(true, 0, sum(id)) from functional.alltypes"
          the "if(true, 0, sum(id))" will be interpreted as an agg function over the whole table and the output should be a single "0". But if the query gets simplified to:
          "select 0 from functional.alltypes" we lose the info that it was an agg and the output will be a "0" for each row of the table.

          I see a couple of ideas for solving this:
          1. Disallow simplifying conditionals that contain agg functions, on the premise that "if(true, 0, sum(id))" isn't actually equivalent to "0". This potentially loses some of the benefit of SimplifyConditionalsRule.
          2. Allow the simplification, but instead of returning just the NumericLiteral of "0" in the example return something that knows it's an agg, e.g. by adding an Expr.setIsAgg function that is called on the NumericLiteral, or a WrapAggExpr with the NumericLiteral as its child.
          3. Have the SelectStmt remember its select list after being reset so that it can look at the original list to determine which of its elements should be treated as agg functions.
          4. Something else.

          1. is easy to implement, not sure about 2. and 3.
          Also queries with a "group by" don't have this problem, so I think it only affects agg functions over entire tables, which simplifies what we have to do here.

          Show
          twmarshall Thomas Tauber-Marshall added a comment - So the immediate cause of the crash is that SelectList doesn't properly reset some of its members, aggInfo_ in this case, which causes the planner to try to create an aggregation node for an agg function that's no longer part of the query. Setting 'aggInfo_ = null' in SelectStmt.reset() solves this and allows the query to pass. However, we still have a problem of incorrect results. Consider the query: "select if(true, 0, sum(id)) from functional.alltypes" the "if(true, 0, sum(id))" will be interpreted as an agg function over the whole table and the output should be a single "0". But if the query gets simplified to: "select 0 from functional.alltypes" we lose the info that it was an agg and the output will be a "0" for each row of the table. I see a couple of ideas for solving this: 1. Disallow simplifying conditionals that contain agg functions, on the premise that "if(true, 0, sum(id))" isn't actually equivalent to "0". This potentially loses some of the benefit of SimplifyConditionalsRule. 2. Allow the simplification, but instead of returning just the NumericLiteral of "0" in the example return something that knows it's an agg, e.g. by adding an Expr.setIsAgg function that is called on the NumericLiteral, or a WrapAggExpr with the NumericLiteral as its child. 3. Have the SelectStmt remember its select list after being reset so that it can look at the original list to determine which of its elements should be treated as agg functions. 4. Something else. 1. is easy to implement, not sure about 2. and 3. Also queries with a "group by" don't have this problem, so I think it only affects agg functions over entire tables, which simplifies what we have to do here.
          Hide
          alex.behm Alexander Behm added a comment -

          JIRA that caused the regression:
          https://issues.apache.org/jira/browse/IMPALA-1861

          Show
          alex.behm Alexander Behm added a comment - JIRA that caused the regression: https://issues.apache.org/jira/browse/IMPALA-1861
          Hide
          mikesbrown Michael Brown added a comment -

          This is definitely a regression introduced by the conditional simplification expr rewrite.

          Alexander Behm Is there a bug associated with this rewrite? We should mention in that bug that it caused this regression, if we can.

          Show
          mikesbrown Michael Brown added a comment - This is definitely a regression introduced by the conditional simplification expr rewrite. Alexander Behm Is there a bug associated with this rewrite? We should mention in that bug that it caused this regression, if we can.
          Hide
          alex.behm Alexander Behm added a comment -

          The problem is that we generate an empty aggregation tree because "IF(True, 514, COUNT(a1.ps_partkey)) AS int_col" gets simplified to "514". Using "set enable_expr_rewrites=false" makes the query work.

          +----------------------------------------------------------------------------------------------------------------------+
          | Explain String                                                                                                       |
          +----------------------------------------------------------------------------------------------------------------------+
          | Estimated Per-Host Requirements: Memory=186.00MB VCores=1                                                            |
          |                                                                                                                      |
          | PLAN-ROOT SINK                                                                                                       |
          | |                                                                                                                    |
          | 06:AGGREGATE [FINALIZE]                                                                                              |
          | |  hosts=1 per-host-mem=unavailable                                                                                  |
          | |  tuple-ids=2 row-size=0B cardinality=1                                                                             |
          | |                                                                                                                    |
          | 05:EXCHANGE [UNPARTITIONED]                                                                                          |
          | |  hosts=1 per-host-mem=unavailable                                                                                  |
          | |  tuple-ids=2 row-size=0B cardinality=1                                                                             |
          | |                                                                                                                    |
          | 03:AGGREGATE                                                                                                         |
          | |  hosts=1 per-host-mem=10.00MB                                                                                      |
          | |  tuple-ids=2 row-size=0B cardinality=1                                                                             |
          | |                                                                                                                    |
          | 02:HASH JOIN [LEFT SEMI JOIN, BROADCAST]                                                                             |
          | |  hash predicates: (a1.ps_availqty) = (a2.ps_availqty) * (((a2.ps_partkey) + (538.4658339348)) - (-445.1448857941)) |
          | |  runtime filters: RF000 <- (a2.ps_availqty) * (((a2.ps_partkey) + (538.4658339348)) - (-445.1448857941))           |
          | |  hosts=1 per-host-mem=0B                                                                                           |
          | |  tuple-ids=0 row-size=4B cardinality=800000                                                                        |
          | |                                                                                                                    |
          | |--04:EXCHANGE [BROADCAST]                                                                                           |
          | |  |  hosts=1 per-host-mem=0B                                                                                        |
          | |  |  tuple-ids=1 row-size=0B cardinality=0                                                                          |
          | |  |                                                                                                                 |
          | |  01:EMPTYSET                                                                                                       |
          | |     hosts=1 per-host-mem=0B                                                                                        |
          | |     tuple-ids=1 row-size=0B cardinality=0                                                                          |
          | |                                                                                                                    |
          | 00:SCAN HDFS [tpch.partsupp a1, RANDOM]                                                                              |
          |    partitions=1/1 files=1 size=112.71MB                                                                              |
          |    runtime filters: RF000 -> (a1.ps_availqty)                                                                        |
          |    table stats: 800000 rows total                                                                                    |
          |    column stats: all                                                                                                 |
          |    hosts=1 per-host-mem=176.00MB                                                                                     |
          |    tuple-ids=0 row-size=4B cardinality=800000                                                                        |
          +----------------------------------------------------------------------------------------------------------------------+
          

          This is definitely a regression introduced by the conditional simplification expr rewrite.

          Show
          alex.behm Alexander Behm added a comment - The problem is that we generate an empty aggregation tree because "IF(True, 514, COUNT(a1.ps_partkey)) AS int_col" gets simplified to "514". Using "set enable_expr_rewrites=false" makes the query work. +----------------------------------------------------------------------------------------------------------------------+ | Explain String | +----------------------------------------------------------------------------------------------------------------------+ | Estimated Per-Host Requirements: Memory=186.00MB VCores=1 | | | | PLAN-ROOT SINK | | | | | 06:AGGREGATE [FINALIZE] | | | hosts=1 per-host-mem=unavailable | | | tuple-ids=2 row-size=0B cardinality=1 | | | | | 05:EXCHANGE [UNPARTITIONED] | | | hosts=1 per-host-mem=unavailable | | | tuple-ids=2 row-size=0B cardinality=1 | | | | | 03:AGGREGATE | | | hosts=1 per-host-mem=10.00MB | | | tuple-ids=2 row-size=0B cardinality=1 | | | | | 02:HASH JOIN [LEFT SEMI JOIN, BROADCAST] | | | hash predicates: (a1.ps_availqty) = (a2.ps_availqty) * (((a2.ps_partkey) + (538.4658339348)) - (-445.1448857941)) | | | runtime filters: RF000 <- (a2.ps_availqty) * (((a2.ps_partkey) + (538.4658339348)) - (-445.1448857941)) | | | hosts=1 per-host-mem=0B | | | tuple-ids=0 row-size=4B cardinality=800000 | | | | | |--04:EXCHANGE [BROADCAST] | | | | hosts=1 per-host-mem=0B | | | | tuple-ids=1 row-size=0B cardinality=0 | | | | | | | 01:EMPTYSET | | | hosts=1 per-host-mem=0B | | | tuple-ids=1 row-size=0B cardinality=0 | | | | | 00:SCAN HDFS [tpch.partsupp a1, RANDOM] | | partitions=1/1 files=1 size=112.71MB | | runtime filters: RF000 -> (a1.ps_availqty) | | table stats: 800000 rows total | | column stats: all | | hosts=1 per-host-mem=176.00MB | | tuple-ids=0 row-size=4B cardinality=800000 | +----------------------------------------------------------------------------------------------------------------------+ This is definitely a regression introduced by the conditional simplification expr 
rewrite.
          Hide
          alex.behm Alexander Behm added a comment -

          Dan Hecht, yes I'll triage this one.

          Show
          alex.behm Alexander Behm added a comment - Dan Hecht , yes I'll triage this one.
          Hide
          mikesbrown Michael Brown added a comment -

          Lars Volker, Dan Hecht is correct: the hash is simply the commit corresponding to the backtrace, so that line numbers are absolutely correct. Also, I would only claim a particular commit is the culprit explicitly.

          Show
          mikesbrown Michael Brown added a comment - Lars Volker , Dan Hecht is correct: the hash is simply the commit corresponding to the backtrace, so that line numbers are absolutely correct. Also, I would only claim a particular commit is the culprit explicitly.
          Hide
          dhecht Dan Hecht added a comment -

          Dimitris Tsirogiannis - No, AFAIK, we don't know yet whether this is a regression. Just thought I'd check if you thought of something recent, in case.

          Lars Volker - The change in the description is just the githash that this was reproduced at. We don't know if this is a regression, and that change couldn't have been the culprit.

          Alexander Behm - do you have time to triage this one?

          Show
          dhecht Dan Hecht added a comment - Dimitris Tsirogiannis - No, AFAIK, we don't know yet whether this is a regression. Just thought I'd check if you thought of something recent, in case. Lars Volker - The change in the description is just the githash that this was reproduced at. We don't know if this is a regression, and that change couldn't have been the culprit. Alexander Behm - do you have time to triage this one?
          Hide
          lv Lars Volker added a comment -

          Tim Armstrong - The JIRA pointed out by Michael Brown in the description was assigned to you. Could you have a look at this?

          Michael Brown - Does that comment under the stack trace imply that your script identified that commit as the likely culprit?

          Show
          lv Lars Volker added a comment - Tim Armstrong - The JIRA pointed out by Michael Brown in the description was assigned to you. Could you have a look at this? Michael Brown - Does that comment under the stack trace imply that your script identified that commit as the likely culprit?
          Hide
          dtsirogiannis Dimitris Tsirogiannis added a comment -

          Dan Hecht, I don't see any recent changes in the FE that could cause this. Do we know for sure that this was recently introduced? This looks like a query produced by the random query generator.

          Show
          dtsirogiannis Dimitris Tsirogiannis added a comment - Dan Hecht , I don't see any recent changes in the FE that could cause this. Do we know for sure that this was recently introduced? This looks like a query produced by the random query generator.
          Hide
          dhecht Dan Hecht added a comment -

          Looks like invalid descriptors passed from the frontend. Dimitris Tsirogiannis, any recent changes ring a bell?

          Show
          dhecht Dan Hecht added a comment - Looks like invalid descriptors passed from the frontend. Dimitris Tsirogiannis , any recent changes ring a bell?

            People

            • Assignee:
              twmarshall Thomas Tauber-Marshall
              Reporter:
              mikesbrown Michael Brown
            • Votes:
              0 Vote for this issue
              Watchers:
              7 Start watching this issue

              Dates

              • Created:
                Updated:
                Resolved:

                Development