Uploaded image for project: 'Cassandra'
  1. Cassandra
  2. CASSANDRA-15307

Fix flakey test_remote_query - cql_test.TestCQLSlowQuery test

    XMLWordPrintableJSON

Details

    Description

      Example failure: https://circleci.com/gh/jolynch/cassandra/554#tests/containers/61

       

      Your job ran 959 tests with 1 failure
      - test_remote_query cql_test.TestCQLSlowQuerycql_test.py
      
      ccmlib.node.TimeoutError: 05 Sep 2019 23:05:07 [node2] Missing: ['operations were slow', 'SELECT \\* FROM ks.test2 WHERE id = 1']: DEBUG [BatchlogTasks:1] 2019-09-05 23:04:24,437 Ba..... See debug.log for remainder
      self = <cql_test.TestCQLSlowQuery object at 0x7f4309528898>
      
          def test_remote_query(self):
              """
                  Check that a query running on a node other than the coordinator is reported as slow:
          
                  - populate the cluster with 2 nodes
                  - start one node without having it join the ring
                  - start the other one node with slow_query_log_timeout_in_ms set to a small value
                    and the read request timeouts set to a large value (to ensure the query is not aborted) and
                    read_iteration_delay set to a value big enough for the query to exceed slow_query_log_timeout_in_ms
                    (this will cause read queries to take longer than the slow query timeout)
                  - CREATE a table
                  - INSERT 5000 rows on a session on the node that is not a member of the ring
                  - run SELECT statements and check that the slow query messages are present in the debug logs
                    (we cannot check the logs at info level because the no spam logger has unpredictable results)
          
                  @jira_ticket CASSANDRA-12403
                  """
              cluster = self.cluster
              cluster.set_configuration_options(values={'slow_query_log_timeout_in_ms': 10,
                                                        'request_timeout_in_ms': 120000,
                                                        'read_request_timeout_in_ms': 120000,
                                                        'range_request_timeout_in_ms': 120000})
          
              cluster.populate(2)
              node1, node2 = cluster.nodelist()
          
              node1.start(wait_for_binary_proto=True, join_ring=False)  # ensure other node executes queries
              node2.start(wait_for_binary_proto=True,
                          jvm_args=["-Dcassandra.monitoring_report_interval_ms=10",
                                    "-Dcassandra.test.read_iteration_delay_ms=1"])  # see above for explanation
          
              session = self.patient_exclusive_cql_connection(node1)
          
              create_ks(session, 'ks', 1)
              session.execute("""
                      CREATE TABLE test2 (
                          id int,
                          col int,
                          val text,
                          PRIMARY KEY(id, col)
                      );
                  """)
          
              for i, j in itertools.product(list(range(100)), list(range(10))):
                  session.execute("INSERT INTO test2 (id, col, val) VALUES ({}, {}, 'foo')".format(i, j))
          
              # only check debug logs because at INFO level the no-spam logger has unpredictable results
              mark = node2.mark_log(filename='debug.log')
              session.execute(SimpleStatement("SELECT * from test2",
                                              consistency_level=ConsistencyLevel.ONE,
                                              retry_policy=FallthroughRetryPolicy()))
              node2.watch_log_for(["operations were slow", "SELECT \* FROM ks.test2"],
                                  from_mark=mark, filename='debug.log', timeout=60)
          
          
              mark = node2.mark_log(filename='debug.log')
              session.execute(SimpleStatement("SELECT * from test2 where id = 1",
                                              consistency_level=ConsistencyLevel.ONE,
                                              retry_policy=FallthroughRetryPolicy()))
              node2.watch_log_for(["operations were slow", "SELECT \* FROM ks.test2 WHERE id = 1"],
      >                           from_mark=mark, filename='debug.log', timeout=60)
      
      cql_test.py:1150: 
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
      
      self = <ccmlib.node.Node object at 0x7f430b0cfcc0>
      exprs = ['operations were slow', 'SELECT \\* FROM ks.test2 WHERE id = 1']
      from_mark = 166214, timeout = 60, process = None, verbose = False
      filename = 'debug.log'
      
          def watch_log_for(self, exprs, from_mark=None, timeout=600, process=None, verbose=False, filename='system.log'):
              """
                  Watch the log until one or more (regular) expression are found.
                  This methods when all the expressions have been found or the method
                  timeouts (a TimeoutError is then raised). On successful completion,
                  a list of pair (line matched, match object) is returned.
                  """
              start = time.time()
              tofind = [exprs] if isinstance(exprs, string_types) else exprs
              tofind = [re.compile(e) for e in tofind]
              matchings = []
              reads = ""
              if len(tofind) == 0:
                  return None
          
              log_file = os.path.join(self.get_path(), 'logs', filename)
              output_read = False
              while not os.path.exists(log_file):
                  time.sleep(.5)
                  if start + timeout < time.time():
                      raise TimeoutError(time.strftime("%d %b %Y %H:%M:%S", time.gmtime()) + " [" + self.name + "] Timed out waiting for {} to be created.".format(log_file))
                  if process and not output_read:
                      process.poll()
                      if process.returncode is not None:
                          self.print_process_output(self.name, process, verbose)
                          output_read = True
                          if process.returncode != 0:
                              raise RuntimeError()  # Shouldn't reuse RuntimeError but I'm lazy
          
              with open(log_file) as f:
                  if from_mark:
                      f.seek(from_mark)
          
                  while True:
                      # First, if we have a process to check, then check it.
                      # Skip on Windows - stdout/stderr is cassandra.bat
                      if not common.is_win() and not output_read:
                          if process:
                              process.poll()
                              if process.returncode is not None:
                                  self.print_process_output(self.name, process, verbose)
                                  output_read = True
                                  if process.returncode != 0:
                                      raise RuntimeError()  # Shouldn't reuse RuntimeError but I'm lazy
          
                      line = f.readline()
                      if line:
                          reads = reads + line
                          for e in tofind:
                              m = e.search(line)
                              if m:
                                  matchings.append((line, m))
                                  tofind.remove(e)
                                  if len(tofind) == 0:
                                      return matchings[0] if isinstance(exprs, string_types) else matchings
                      else:
                          # yep, it's ugly
                          time.sleep(1)
                          if start + timeout < time.time():
      >                       raise TimeoutError(time.strftime("%d %b %Y %H:%M:%S", time.gmtime()) + " [" + self.name + "] Missing: " + str([e.pattern for e in tofind]) + ":\n" + reads[:50] + ".....\nSee {} for remainder".format(filename))
      E                       ccmlib.node.TimeoutError: 05 Sep 2019 23:05:07 [node2] Missing: ['operations were slow', 'SELECT \\* FROM ks.test2 WHERE id = 1']:
      E                       DEBUG [BatchlogTasks:1] 2019-09-05 23:04:24,437 Ba.....
      E                       See debug.log for remainder
      
      ../env/src/ccm/ccmlib/node.py:536: TimeoutError 

      Attachments

        1. CASSANDRA-15307.txt
          130 kB
          Ekaterina Dimitrova
        2. CASSANDRA-15307-fixed.txt
          393 kB
          Ekaterina Dimitrova

        Activity

          People

            e.dimitrova Ekaterina Dimitrova
            jolynch Joey Lynch
            Ekaterina Dimitrova
            Brandon Williams
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: