Uploaded image for project: 'IMPALA'
  1. IMPALA
  2. IMPALA-2168

SEGV in BufferedTupleStream::num_rows() in a query with very large, spilling ROJ

    XMLWordPrintableJSON

Details

    Description

      In a query where the optimizer wrongly chose a plan containing ROJs with a huge right side (see IMPALA-2165), Impalad segfaulted after 2.5h of execution with the following stack trace:

       (gdb) bt
      #0  0x00007f1ab994c885 in raise () from /lib64/libc.so.6
      #1  0x00007f1ab994de61 in abort () from /lib64/libc.so.6
      #2  0x00007f1abb867155 in os::abort(bool) () from /usr/java/jdk1.7.0_45-cloudera/jre/lib/amd64/server/libjvm.so
      #3  0x00007f1abb9e6087 in VMError::report_and_die() () from /usr/java/jdk1.7.0_45-cloudera/jre/lib/amd64/server/libjvm.so
      #4  0x00007f1abb86badf in JVM_handle_linux_signal () from /usr/java/jdk1.7.0_45-cloudera/jre/lib/amd64/server/libjvm.so
      #5  <signal handler called>
      #6  0x0000000001470d7e in impala::BufferedTupleStream::num_rows() const ()
      #7  0x000000000154dbe0 in impala::PartitionedHashJoinNode::NextSpilledProbeRowBatch(impala::RuntimeState*, impala::RowBatch*) ()
      #8  0x000000000154f484 in impala::PartitionedHashJoinNode::GetNext(impala::RuntimeState*, impala::RowBatch*, bool*) ()
      #9  0x000000000145dfa5 in impala::PlanFragmentExecutor::GetNextInternal(impala::RowBatch**) ()
      #10 0x000000000145c88e in impala::PlanFragmentExecutor::OpenInternal() ()
      #11 0x000000000145c4e6 in impala::PlanFragmentExecutor::Open() ()
      #12 0x00000000010fd270 in impala::FragmentMgr::FragmentExecState::Exec() ()
      #13 0x00000000010f4b92 in impala::FragmentMgr::FragmentExecThread(impala::FragmentMgr::FragmentExecState*) ()
      #14 0x00000000010f8ddf in boost::_mfi::mf1<void, impala::FragmentMgr, impala::FragmentMgr::FragmentExecState*>::operator()(impala::FragmentMgr*, impala::FragmentMgr::FragmentExecState*) const ()
      #15 0x00000000010f8bab in void boost::_bi::list2<boost::_bi::value<impala::FragmentMgr*>, boost::_bi::value<impala::FragmentMgr::FragmentExecState*> >::operator()<boost::_mfi::mf1<void, impala::FragmentMgr, impala::FragmentMgr::FragmentExecState*>, boost::_bi::list0>(boost::_bi::type<void>, boost::_mfi::mf1<void, impala::FragmentMgr, impala::FragmentMgr::FragmentExecState*>&, boost::_bi::list0&, int) ()
      #16 0x00000000010f84b9 in boost::_bi::bind_t<void, boost::_mfi::mf1<void, impala::FragmentMgr, impala::FragmentMgr::FragmentExecState*>, boost::_bi::list2<boost::_bi::value<impala::FragmentMgr*>, boost::_bi::value<impala::FragmentMgr::FragmentExecState*> > >::operator()() ()
      #17 0x00000000010f7e38 in boost::detail::function::void_function_obj_invoker0<boost::_bi::bind_t<void, boost::_mfi::mf1<void, impala::FragmentMgr, impala::FragmentMgr::FragmentExecState*>, boost::_bi::list2<boost::_bi::value<impala::FragmentMgr*>, boost::_bi::value<impala::FragmentMgr::FragmentExecState*> > >, void>::invoke(boost::detail::function::function_buffer&) ()
      #18 0x0000000000f9e448 in boost::function0<void>::operator()() const ()
      #19 0x00000000011c0d67 in impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::Promise<long>*) ()
      #20 0x00000000011ca3b8 in void boost::_bi::list4<boost::_bi::value<std::string>, boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >, boost::_bi::value<impala::Promise<long>*> >::operator()<void (*)(std::string const&, std::string const&, boost::function<void ()>, impala::Promise<long>*), boost::_bi::list0>(boost::_bi::type<void>, void (*&)(std::string const&, std::string const&, boost::function<void ()>, impala::Promise<long>*), boost::_bi::list0&, int) ()
      #21 0x00000000011ca301 in boost::_bi::bind_t<void, void (*)(std::string const&, std::string const&, boost::function<void ()>, impala::Promise<long>*), boost::_bi::list4<boost::_bi::value<std::string>, boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >, boost::_bi::value<impala::Promise<long>*> > >::operator()() ()
      #22 0x00000000011ca2c6 in boost::detail::thread_data<boost::_bi::bind_t<void, void (*)(std::string const&, std::string const&, boost::function<void ()>, impala::Promise<long>*), boost::_bi::list4<boost::_bi::value<std::string>, boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >, boost::_bi::value<impala::Promise<long>*> > > >::run() ()
      #23 0x0000000001602f83 in ?? ()
      #24 0x00007f1abaa24806 in start_thread () from /lib64/libpthread.so.0
      #25 0x00007f1ab99f5e8d in clone () from /lib64/libc.so.6
      #26 0x0000000000000000 in ?? ()
      

      The right side of the ROJ was very large. A single node had to put 85M tuples of 2KB each (i.e. about 170GB of data) into the partitioned hash tables.

      Below is the summary of the failed run that produced the segv.

      Operator                #Hosts   Avg Time   Max Time   #Rows  Est. #Rows   Peak Mem  Est. Peak Mem  Detail                         
      -----------------------------------------------------------------------------------------------------------------------------------
      10:HASH JOIN                 1        0ns        0ns       0         274   10.27 GB       10.73 KB  RIGHT OUTER JOIN, PARTITIONED  
      |--18:EXCHANGE               1        0ns        0ns   7.91M           5          0              0  HASH(jobx.Promotion_Channel... 
      |  09:HASH JOIN              1      2h19m      2h19m   7.91M           5   23.11 GB              0  RIGHT OUTER JOIN, PARTITIONED  
      |  |--16:EXCHANGE            1   49s658ms   49s658ms  84.22M           0          0              0  HASH(trim(rsp.Client_Code),... 
      |  |  08:CROSS JOIN          4   12s708ms   14s488ms  84.22M           0    9.84 MB              0                                 
      |  |  |--14:EXCHANGE         4   49.468us   82.629us       1           0          0              0  BROADCAST                      
      |  |  |  04:SCAN HDFS        1  225.715ms  225.715ms       1           0  142.00 KB       32.00 MB  default.sbt_2650617_attribu... 
      |  |  07:HASH JOIN           4      1h14m      1h29m  84.22M     747.15M    3.47 GB        1.98 GB  RIGHT OUTER JOIN, PARTITIONED  
      |  |  |--13:EXCHANGE         4    4s740ms    5s171ms  13.39M      13.39M          0              0  HASH(rsp.MC_Address)           
      |  |  |  00:SCAN HDFS        4  410.557ms  442.982ms  13.39M      13.39M  724.68 MB        1.03 GB  default.sbt_2650617_respons... 
      |  |  12:EXCHANGE            4       6m8s       7m2s   1.05B     521.80M          0              0  HASH(prm.mc_address)           
      |  |  03:HASH JOIN           4      2m22s      2m40s   1.05B     521.80M   13.95 MB      482.01 KB  INNER JOIN, BROADCAST          
      |  |  |--11:EXCHANGE         4    1.909ms    2.748ms   7.82K       7.82K          0              0  BROADCAST                      
      |  |  |  02:SCAN HDFS        1  481.483ms  481.483ms   7.82K       7.82K    1.14 MB       32.00 MB  default.sbt_2650617_infojob... 
      |  |  01:SCAN HDFS           4    4s097ms    4s566ms   1.05B     351.09M    1.28 GB        7.50 GB  default.sbt_2650617_promoti... 
      |  15:EXCHANGE               1  408.401us  408.401us       7           5          0              0  HASH(trim(cw.Client_Code),c... 
      |  05:SCAN HDFS              1  844.743ms  844.743ms       7           5   94.00 KB       32.00 MB  default.sbt_2650617_attribu... 
      17:EXCHANGE                  1    4.553us    4.553us       0         274          0              0  HASH(rb.Channel_Code,rb.RB_... 
      06:SCAN HDFS                 1  797.315ms  797.315ms     274         274  101.00 KB       32.00 MB  default.sbt_2650617_attribu... 
      

      Attachments

        1. gdb-session.txt
          6 kB
          Ippokratis Pandis

        Issue Links

          Activity

            People

              ippokratis Ippokratis Pandis
              ippokratis Ippokratis Pandis
              Votes:
              0 Vote for this issue
              Watchers:
              5 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: