Uploaded image for project: 'Cassandra'
  1. Cassandra
  2. CASSANDRA-19975

TopologyMixupTestBase does not fix replication factor for Keyspaces after reaching rf=3

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Resolved
    • Low
    • Resolution: Fixed
    • NA
    • None
    • None

    Description

      This issue was found by the HarryTopologyMixupTest. In the cep-15-accord branch we added stopping nodes as well as restarting nodes (now that Accord supports it), and this looks to break TCM if the down node is a CMS voting member.

      Here is the test that shows it

      /*
       * Licensed to the Apache Software Foundation (ASF) under one
       * or more contributor license agreements.  See the NOTICE file
       * distributed with this work for additional information
       * regarding copyright ownership.  The ASF licenses this file
       * to you under the Apache License, Version 2.0 (the
       * "License"); you may not use this file except in compliance
       * with the License.  You may obtain a copy of the License at
       *
       *     http://www.apache.org/licenses/LICENSE-2.0
       *
       * Unless required by applicable law or agreed to in writing, software
       * distributed under the License is distributed on an "AS IS" BASIS,
       * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
       * See the License for the specific language governing permissions and
       * limitations under the License.
       */
      
      package org.apache.cassandra.distributed.test.tcm;
      
      import accord.utils.Invariants;
      import accord.utils.async.TimeoutUtils;
      import org.agrona.collections.Long2LongHashMap;
      import org.apache.cassandra.distributed.Cluster;
      import org.apache.cassandra.distributed.api.Feature;
      import org.apache.cassandra.distributed.api.IInvokableInstance;
      import org.apache.cassandra.distributed.impl.INodeProvisionStrategy;
      import org.apache.cassandra.distributed.shared.ClusterUtils;
      import org.apache.cassandra.distributed.test.TestBaseImpl;
      import org.junit.Test;
      
      import java.io.IOException;
      import java.time.Duration;
      import java.util.concurrent.ExecutionException;
      import java.util.concurrent.TimeoutException;
      import java.util.concurrent.atomic.AtomicInteger;
      
      /**
       * Reproduction test attached to CASSANDRA-19975: replays a sequence of topology changes
       * (add node, reconfigure CMS, remove node, stop node, add node) against an in-JVM cluster.
       */
      public class RepoTest extends TestBaseImpl
      {
          /**
           * Replays the history reported from HarryTopologyMixupTest:
           *
           * <pre>
           * History:
           *   2: Add Node3; epoch=18, cms=[1, 2]
           *
           *   // hidden - reconfigure to rf=3
           *   3: Waiting for CMS to Quiesce; epoch=18, cms=[1, 2]
           *
           *   5: Harry Validate All; epoch=31, cms=[1, 2, 3]
           *   6: Harry Insert; epoch=31, cms=[1, 2, 3]
           *
           *   8: Add Node4; epoch=31, cms=[1, 2, 3]
           *   9: Waiting for CMS to Quiesce; epoch=31, cms=[1, 2, 3]
           *
           *   10: Harry Validate All; epoch=38, cms=[1, 2, 3]
           *   11: nodetool repair harry tbl_0 from node2; epoch=38, cms=[1, 2, 3]
           *
           *   12: Stop Node3 for nodetool removenode; epoch=38, cms=[1, 2, 3]
           *   13: nodetool removenode node3 from node1; epoch=38, cms=[1, 2, 3]
           *   14: nodetool repair harry tbl_0 from node1; epoch=49, cms=[1, 2, 3]
           *   15: Waiting for CMS to Quiesce; epoch=49, cms=[1, 2, 3]
           *
           *   16: Stop Node1 for Normal Stop; epoch=49, cms=[1, 2, 4]
           *
           *   18: Add Node5; epoch=49, cms=[1, 2, 4]
           * </pre>
           *
           * Only the topology steps are replayed here; the Harry validate/insert and repair steps
           * from the history are not part of this test.
           */
          @Test
          public void test() throws IOException, ExecutionException, InterruptedException, TimeoutException
          {
              // Fixed token per node number; 0 is the value returned for a node number not in the map.
              Long2LongHashMap nodeToToken = new Long2LongHashMap(0);
              nodeToToken.put(1, -1799911656L);
              nodeToToken.put(2, -1005197310L);
              nodeToToken.put(3, -834315596L);
              nodeToToken.put(4, 335272232L);
              nodeToToken.put(5, -1829188286L);
              // Counts seedNodeNum() invocations; shared by every strategy instance created below.
              final AtomicInteger counter = new AtomicInteger(0);
              try (Cluster cluster = Cluster.build(2)
                                            .withTokenSupplier(i -> nodeToToken.get(i))
                                            .withConfig(c -> c.with(Feature.values()))
                                            .withNodeProvisionStrategy((subnet, portMap) -> new INodeProvisionStrategy.AbstractNodeProvisionStrategy(portMap)
                                            {
                                                {
                                                    Invariants.checkArgument(subnet == 0, "Unexpected subnet detected: %d", subnet);
                                                }

                                                private final String ipPrefix = "127.0." + subnet + '.';

                                                /**
                                                 * Returns node 1 for the first two calls and node 2 afterwards.
                                                 * NOTE(review): presumably so later joiners seed from node2, since node1
                                                 * is stopped before node5 joins; confirm against the provisioning code.
                                                 */
                                                @Override
                                                public int seedNodeNum()
                                                {
                                                    switch (counter.getAndIncrement())
                                                    {
                                                        case 0:
                                                        case 1:
                                                            return 1;
                                                        default:
                                                            return 2;
                                                    }
                                                }

                                                @Override
                                                public String ipAddress(int nodeNum)
                                                {
                                                    return ipPrefix + nodeNum;
                                                }
                                            })
                                            .start())
              {
                  fixDistributedSchemas(cluster);

                  IInvokableInstance node1 = cluster.get(1);
                  // History starts with cms=[1, 2].
                  node1.nodetoolResult("cms", "reconfigure", "2").asserts().success();

                  // History step 2: add node3.
                  IInvokableInstance node3 = ClusterUtils.addInstance(cluster, node1.config(), c -> c.set("auto_bootstrap", true));
                  node3.startup(cluster);

                  // History step 3 (hidden reconfigure): grow the CMS to three members and wait for it to settle.
                  node1.nodetoolResult("cms", "reconfigure", "3").asserts().success();
                  ClusterUtils.waitForCMSToQuiesce(cluster, new int[]{1, 2, 3});

                  // History step 8: add node4.
                  IInvokableInstance node4 = ClusterUtils.addInstance(cluster, node1.config(), c -> c.set("auto_bootstrap", true));
                  node4.startup(cluster);

                  // History steps 12-13: stop node3, then remove it via node1.
                  ClusterUtils.stopUnchecked(node3);
                  node1.nodetoolResult("removenode", "3").asserts().success();

                  // History step 16: stop node1, which the history lists as a CMS member.
                  ClusterUtils.stopUnchecked(node1);

                  // expected CMS Voting Group: [1, 2, 4]
                  // History step 18: node5 joins while node1 is down; the join is bounded to two minutes.
                  TimeoutUtils.runBlocking(Duration.ofMinutes(2), "node5 join", () -> {
                      IInvokableInstance node5 = ClusterUtils.addInstance(cluster, node1.config(), c -> c.set("auto_bootstrap", true));
                      node5.startup(cluster);
                  });
              }
          }
      }
      

      Attachments

        Activity

          People

            dcapwell David Capwell
            dcapwell David Capwell
            David Capwell
            Alex Petrov
            Votes:
            0 Vote for this issue
            Watchers:
            2 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved:

              Time Tracking

                Estimated:
                Original Estimate - Not Specified
                Not Specified
                Remaining:
                Remaining Estimate - 0h
                0h
                Logged:
                Time Spent - 10m
                10m