Description
We found several zombie executors on a cluster. The Thermos logs show the executor hitting a system limit ("can't start new thread") while trying to shut down. The Mesos agent is unable to get the status of this container from the Docker daemon (docker inspect fails). Shouldn't Thermos exit in such a case?
22 WARNING: Your kernel does not support swap limit capabilities, memory limited without swap. 23 twitter.common.app debug: Initializing: twitter.common.log (Logging subsystem.) 24 Writing log files to disk in /mnt/mesos/sandbox 25 I1023 19:04:32.261165 7 exec.cpp:162] Version: 1.2.0 26 I1023 19:04:32.264870 42 exec.cpp:237] Executor registered on agent b4fff262-c925-4edf-a2ef-2a5bbe89c42b-S3295 27 Writing log files to disk in /mnt/mesos/sandbox 28 Traceback (most recent call last): 29 File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 126, in _excepting_run 30 self.__real_run(*args, **kw) 31 File "apache/thermos/monitoring/resource.py", line 243, in run 32 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/event_muxer.py", line 79, in wait 33 thread.start() 34 File "/usr/lib/python2.7/threading.py", line 745, in start 35 _start_new_thread(self.__bootstrap, ()) 36 thread.error: can't start new thread 37 ERROR] Failed to stop health checkers: 38 ERROR] Traceback (most recent call last): 39 File "apache/aurora/executor/aurora_executor.py", line 209, in _shutdown 40 propagate_deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT) 41 File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline 42 return deadline(*args, daemon=True, propagate=True, **kw) 43 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 61, in deadline 44 AnonymousThread().start() 45 File "/usr/lib/python2.7/threading.py", line 745, in start 46 _start_new_thread(self.__bootstrap, ()) 47 error: can't start new thread 48 49 
ERROR] Failed to stop runner: 50 ERROR] Traceback (most recent call last): 51 File "apache/aurora/executor/aurora_executor.py", line 217, in _shutdown 52 propagate_deadline(self._runner.stop, timeout=self.STOP_TIMEOUT) 53 File "apache/aurora/executor/aurora_executor.py", line 35, in propagate_deadline 54 return deadline(*args, daemon=True, propagate=True, **kw) 55 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py", line 61, in deadline 56 AnonymousThread().start() 57 File "/usr/lib/python2.7/threading.py", line 745, in start 58 _start_new_thread(self.__bootstrap, ()) 59 error: can't start new thread 60 61 Traceback (most recent call last): 62 File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 126, in _excepting_run 63 self.__real_run(*args, **kw) 64 File "apache/aurora/executor/status_manager.py", line 62, in run 65 File "apache/aurora/executor/aurora_executor.py", line 235, in _shutdown 66 File "/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deferred.py", line 56, in defer 67 deferred.start() 68 File "/usr/lib/python2.7/threading.py", line 745, in start 69 _start_new_thread(self.__bootstrap, ()) 70 thread.error: can't start new thread