Uploaded image for project: 'Kudu'
  1. Kudu
  2. KUDU-2194

Kudu crashes when started with more than 34 data directories

    XML | Word | Printable | JSON

    Details

    • Type: Bug
    • Status: Resolved
    • Priority: Major
    • Resolution: Fixed
    • Affects Version/s: 1.6.0
    • Fix Version/s: 1.6.0
    • Component/s: fs
    • Labels:
      None

      Description

      Each data directory creates its own uniquely named thread pool. Because trace metric names are interned during pool construction, each pool interns its own unique set of three trace metric names. The global intern map is limited to 100 entries, so 34 data directories (34 × 3 = 102 names) are enough to exceed that limit.

      Here's the relevant code from the thread pool constructor:

        std::string prefix = !builder.trace_metric_prefix_.empty() ?
            builder.trace_metric_prefix_ : builder.name_;
      
        queue_time_trace_metric_name_ = TraceMetrics::InternName(
            prefix + ".queue_time_us");
        run_wall_time_trace_metric_name_ = TraceMetrics::InternName(
            prefix + ".run_wall_time_us");
        run_cpu_time_trace_metric_name_ = TraceMetrics::InternName(
            prefix + ".run_cpu_time_us");
      

      And here's what the crash looks like reproduced in a unit test:

      F1019 14:34:53.684109  5117 trace_metrics.cc:68] Check failed: g_intern_map->size() < 100 (100 vs. 100) Too many interned strings: (data dir 0.queue_time_us, data dir 0.queue_time_us) (data dir 0.run_cpu_time_us, data dir 0.run_cpu_time_us) (data dir 0.run_wall_time_us, data dir 0.run_wall_time_us) (data dir 1.queue_time_us, data dir 1.queue_time_us) (data dir 1.run_cpu_time_us, data dir 1.run_cpu_time_us) (data dir 1.run_wall_time_us, data dir 1.run_wall_time_us) (data dir 10.queue_time_us, data dir 10.queue_time_us) (data dir 10.run_cpu_time_us, data dir 10.run_cpu_time_us) (data dir 10.run_wall_time_us, data dir 10.run_wall_time_us) (data dir 11.queue_time_us, data dir 11.queue_time_us) (data dir 11.run_cpu_time_us, data dir 11.run_cpu_time_us) (data dir 11.run_wall_time_us, data dir 11.run_wall_time_us) (data dir 12.queue_time_us, data dir 12.queue_time_us) (data dir 12.run_cpu_time_us, data dir 12.run_cpu_time_us) (data dir 12.run_wall_time_us, data dir 12.run_wall_time_us) (data dir 13.queue_time_us, data dir 13.queue_time_us) (data dir 13.run_cpu_time_us, data dir 13.run_cpu_time_us) (data dir 13.run_wall_time_us, data dir 13.run_wall_time_us) (data dir 14.queue_time_us, data dir 14.queue_time_us) (data dir 14.run_cpu_time_us, data dir 14.run_cpu_time_us) (data dir 14.run_wall_time_us, data dir 14.run_wall_time_us) (data dir 15.queue_time_us, data dir 15.queue_time_us) (data dir 15.run_cpu_time_us, data dir 15.run_cpu_time_us) (data dir 15.run_wall_time_us, data dir 15.run_wall_time_us) (data dir 16.queue_time_us, data dir 16.queue_time_us) (data dir 16.run_cpu_time_us, data dir 16.run_cpu_time_us) (data dir 16.run_wall_time_us, data dir 16.run_wall_time_us) (data dir 17.queue_time_us, data dir 17.queue_time_us) (data dir 17.run_cpu_time_us, data dir 17.run_cpu_time_us) (data dir 17.run_wall_time_us, data dir 17.run_wall_time_us) (data dir 18.queue_time_us, data dir 18.queue_time_us) (data dir 18.run_cpu_time_us, data dir 18.run_cpu_time_us) (data dir 
18.run_wall_time_us, data dir 18.run_wall_time_us) (data dir 19.queue_time_us, data dir 19.queue_time_us) (data dir 19.run_cpu_time_us, data dir 19.run_cpu_time_us) (data dir 19.run_wall_time_us, data dir 19.run_wall_time_us) (data dir 2.queue_time_us, data dir 2.queue_time_us) (data dir 2.run_cpu_time_us, data dir 2.run_cpu_time_us) (data dir 2.run_wall_time_us, data dir 2.run_wall_time_us) (data dir 20.queue_time_us, data dir 20.queue_time_us) (data dir 20.run_cpu_time_us, data dir 20.run_cpu_time_us) (data dir 20.run_wall_time_us, data dir 20.run_wall_time_us) (data dir 21.queue_time_us, data dir 21.queue_time_us) (data dir 21.run_cpu_time_us, data dir 21.run_cpu_time_us) (data dir 21.run_wall_time_us, data dir 21.run_wall_time_us) (data dir 22.queue_time_us, data dir 22.queue_time_us) (data dir 22.run_cpu_time_us, data dir 22.run_cpu_time_us) (data dir 22.run_wall_time_us, data dir 22.run_wall_time_us) (data dir 23.queue_time_us, data dir 23.queue_time_us) (data dir 23.run_cpu_time_us, data dir 23.run_cpu_time_us) (data dir 23.run_wall_time_us, data dir 23.run_wall_time_us) (data dir 24.queue_time_us, data dir 24.queue_time_us) (data dir 24.run_cpu_time_us, data dir 24.run_cpu_time_us) (data dir 24.run_wall_time_us, data dir 24.run_wall_time_us) (data dir 25.queue_time_us, data dir 25.queue_time_us) (data dir 25.run_cpu_time_us, data dir 25.run_cpu_time_us) (data dir 25.run_wall_time_us, data dir 25.run_wall_time_us) (data dir 26.queue_time_us, data dir 26.queue_time_us) (data dir 26.run_cpu_time_us, data dir 26.run_cpu_time_us) (data dir 26.run_wall_time_us, data dir 26.run_wall_time_us) (data dir 27.queue_time_us, data dir 27.queue_time_us) (data dir 27.run_cpu_time_us, data dir 27.run_cpu_time_us) (data dir 27.run_wall_time_us, data dir 27.run_wall_time_us) (data dir 28.queue_time_us, data dir 28.queue_time_us) (data dir 28.run_cpu_time_us, data dir 28.run_cpu_time_us) (data dir 28.run_wall_time_us, data dir 28.run_wall_time_us) (data dir 29.queue_time_us, 
data dir 29.queue_time_us) (data dir 29.run_cpu_time_us, data dir 29.run_cpu_time_us) (data dir 29.run_wall_time_us, data dir 29.run_wall_time_us) (data dir 3.queue_time_us, data dir 3.queue_time_us) (data dir 3.run_cpu_time_us, data dir 3.run_cpu_time_us) (data dir 3.run_wall_time_us, data dir 3.run_wall_time_us) (data dir 30.queue_time_us, data dir 30.queue_time_us) (data dir 30.run_cpu_time_us, data dir 30.run_cpu_time_us) (data dir 30.run_wall_time_us, data dir 30.run_wall_time_us) (data dir 31.queue_time_us, data dir 31.queue_time_us) (data dir 31.run_cpu_time_us, data dir 31.run_cpu_time_us) (data dir 31.run_wall_time_us, data dir 31.run_wall_time_us) (data dir 32.queue_time_us, data dir 32.queue_time_us) (data dir 32.run_cpu_time_us, data dir 32.run_cpu_time_us) (data dir 32.run_wall_time_us, data dir 32.run_wall_time_us) (data dir 33.queue_time_us, data dir 33.queue_time_us) (data dir 4.queue_time_us, data dir 4.queue_time_us) (data dir 4.run_cpu_time_us, data dir 4.run_cpu_time_us) (data dir 4.run_wall_time_us, data dir 4.run_wall_time_us) (data dir 5.queue_time_us, data dir 5.queue_time_us) (data dir 5.run_cpu_time_us, data dir 5.run_cpu_time_us) (data dir 5.run_wall_time_us, data dir 5.run_wall_time_us) (data dir 6.queue_time_us, data dir 6.queue_time_us) (data dir 6.run_cpu_time_us, data dir 6.run_cpu_time_us) (data dir 6.run_wall_time_us, data dir 6.run_wall_time_us) (data dir 7.queue_time_us, data dir 7.queue_time_us) (data dir 7.run_cpu_time_us, data dir 7.run_cpu_time_us) (data dir 7.run_wall_time_us, data dir 7.run_wall_time_us) (data dir 8.queue_time_us, data dir 8.queue_time_us) (data dir 8.run_cpu_time_us, data dir 8.run_cpu_time_us) (data dir 8.run_wall_time_us, data dir 8.run_wall_time_us) (data dir 9.queue_time_us, data dir 9.queue_time_us) (data dir 9.run_cpu_time_us, data dir 9.run_cpu_time_us) (data dir 9.run_wall_time_us, data dir 9.run_wall_time_us)
      *** Check failure stack trace: ***
      *** Aborted at 1508448893 (unix time) try "date -d @1508448893" if you are using GNU date ***
      PC: @     0x7fa0e2fa3428 gsignal
      *** SIGABRT (@0x3e8000013fd) received by PID 5117 (TID 0x7fa0e4f26800) from PID 5117; stack trace: ***
          @     0x7fa0e3349390 (unknown)
          @     0x7fa0e2fa3428 gsignal
          @     0x7fa0e2fa502a abort
          @     0x7fa0e406cd99 google::logging_fail()
          @     0x7fa0e406e6dd google::LogMessage::Fail()
          @     0x7fa0e40705c3 google::LogMessage::SendToLog()
          @     0x7fa0e406e23a google::LogMessage::Flush()
          @     0x7fa0e4070f8f google::LogMessageFatal::~LogMessageFatal()
          @     0x7fa0e4ad9586 kudu::TraceMetrics::InternName()
          @     0x7fa0e4ac2a48 kudu::ThreadPool::ThreadPool()
          @     0x7fa0e4ac149a kudu::ThreadPoolBuilder::Build()
          @     0x7fa0e4db11bc kudu::fs::DataDirManager::Open()
          @     0x7fa0e4daed74 kudu::fs::DataDirManager::CreateNew()
          @     0x7fa0e4daeb3e kudu::fs::DataDirManager::CreateNewForTests()
          @           0x43ed42 kudu::fs::TooManyDataDirManagerTest_TestTooManyInternedStrings_Test::TestBody()
          @     0x7fa0e3e3ce77 testing::internal::HandleExceptionsInMethodIfSupported<>()
          @     0x7fa0e3e32262 testing::Test::Run()
          @     0x7fa0e3e323a8 testing::TestInfo::Run()
          @     0x7fa0e3e32485 testing::TestCase::Run()
          @     0x7fa0e3e33158 testing::internal::UnitTestImpl::RunAllTests()
          @     0x7fa0e3e3d387 testing::internal::HandleExceptionsInMethodIfSupported<>()
          @     0x7fa0e3e3255a testing::UnitTest::Run()
          @     0x7fa0e50e509a RUN_ALL_TESTS()
          @     0x7fa0e50e2d88 main
          @     0x7fa0e2f8e830 __libc_start_main
          @           0x434d89 _start
          @                0x0 (unknown)
      Aborted (core dumped)
      

        Attachments

          Activity

            People

            • Assignee:
              adar Adar Dembo
              Reporter:
              adar Adar Dembo
            • Votes:
              0 Vote for this issue
              Watchers:
              1 Start watching this issue

              Dates

              • Created:
                Updated:
                Resolved: