Details
-
Bug
-
Status: Resolved
-
Low
-
Resolution: Fixed
-
None
-
None
-
Correctness - Test Failure
-
Low
-
Normal
-
Fuzz Test
-
All
-
None
-
Description
This issue was found by HarryTopologyMixupTest. In the cep-15-accord branch we added stopping and restarting nodes to that test (now that Accord supports it), and this appears to break TCM when the node that is down is a CMS voting member.
Here is a test that reproduces it:
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cassandra.distributed.test.tcm; import accord.utils.Invariants; import accord.utils.async.TimeoutUtils; import org.agrona.collections.Long2LongHashMap; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.Feature; import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.impl.INodeProvisionStrategy; import org.apache.cassandra.distributed.shared.ClusterUtils; import org.apache.cassandra.distributed.test.TestBaseImpl; import org.junit.Test; import java.io.IOException; import java.time.Duration; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; public class RepoTest extends TestBaseImpl { /** * This is the history reported from HarryTopologyMixupTest * History: 2: Add Node3; epoch=18, cms=[1, 2] // hidden - reconfigure to rf=3 3: Waiting for CMS to Quiesce; epoch=18, cms=[1, 2] 5: Harry Validate All; epoch=31, cms=[1, 2, 3] 6: Harry Insert; epoch=31, cms=[1, 2, 3] 8: Add Node4; epoch=31, cms=[1, 2, 3] 9: Waiting for CMS to Quiesce; epoch=31, cms=[1, 2, 3] 10: Harry Validate All; epoch=38, cms=[1, 2, 3] 
11: nodetool repair harry tbl_0 from node2; epoch=38, cms=[1, 2, 3] 12: Stop Node3 for nodetool removenode; epoch=38, cms=[1, 2, 3] 13: nodetool removenode node3 from node1; epoch=38, cms=[1, 2, 3] 14: nodetool repair harry tbl_0 from node1; epoch=49, cms=[1, 2, 3] 15: Waiting for CMS to Quiesce; epoch=49, cms=[1, 2, 3] 16: Stop Node1 for Normal Stop; epoch=49, cms=[1, 2, 4] 18: Add Node5; epoch=49, cms=[1, 2, 4] */ @Test public void test() throws IOException, ExecutionException, InterruptedException, TimeoutException { Long2LongHashMap nodeToToken = new Long2LongHashMap(-0); nodeToToken.put(1, -1799911656L); nodeToToken.put(2, -1005197310L); nodeToToken.put(3, -834315596L); nodeToToken.put(4, 335272232L); nodeToToken.put(5, -1829188286L); final AtomicInteger counter = new AtomicInteger(0); try (Cluster cluster = Cluster.build(2) .withTokenSupplier(i -> nodeToToken.get(i)) .withConfig(c -> c.with(Feature.values())) .withNodeProvisionStrategy((subnet, portMap) -> new INodeProvisionStrategy.AbstractNodeProvisionStrategy(portMap) { { Invariants.checkArgument(subnet == 0, "Unexpected subnet detected: %d", subnet); } private final String ipPrefix = "127.0." 
+ subnet + '.'; @Override public int seedNodeNum() { switch (counter.getAndIncrement()) { case 0: case 1: return 1; default: return 2; } } @Override public String ipAddress(int nodeNum) { return ipPrefix + nodeNum; } }) .start()) { fixDistributedSchemas(cluster); IInvokableInstance node1 = cluster.get(1); IInvokableInstance node2 = cluster.get(2); node1.nodetoolResult("cms", "reconfigure", "2").asserts().success(); IInvokableInstance node3 = ClusterUtils.addInstance(cluster, node1.config(), c -> c.set("auto_bootstrap", true)); node3.startup(cluster); node1.nodetoolResult("cms", "reconfigure", Integer.toString(3)).asserts().success(); ClusterUtils.waitForCMSToQuiesce(cluster, new int[]{1, 2, 3}); IInvokableInstance node4 = ClusterUtils.addInstance(cluster, node1.config(), c -> c.set("auto_bootstrap", true)); node4.startup(cluster); ClusterUtils.stopUnchecked(node3); node1.nodetoolResult("removenode", "3").asserts().success(); ClusterUtils.stopUnchecked(node1); // expected CMS Voting Group: [1, 2, 4] TimeoutUtils.runBlocking(Duration.ofMinutes(2), "node5 join", () -> { IInvokableInstance node5 = ClusterUtils.addInstance(cluster, node1.config(), c -> c.set("auto_bootstrap", true)); node5.startup(cluster); }); } } }