diff --git a/core/src/main/scala/kafka/api/LeaderAndIsrRequest.scala b/core/src/main/scala/kafka/api/LeaderAndIsrRequest.scala
index 0311737..3e40817 100644
--- a/core/src/main/scala/kafka/api/LeaderAndIsrRequest.scala
+++ b/core/src/main/scala/kafka/api/LeaderAndIsrRequest.scala
@@ -32,6 +32,8 @@ import collection.Set
 object LeaderAndIsr {
   val initialLeaderEpoch: Int = 0
   val initialZKVersion: Int = 0
+  val NoLeader = -1
+  val LeaderDuringDelete = -2
 }
 
 case class LeaderAndIsr(var leader: Int, var leaderEpoch: Int, var isr: List[Int], var zkVersion: Int) {
diff --git a/core/src/main/scala/kafka/controller/ControllerChannelManager.scala b/core/src/main/scala/kafka/controller/ControllerChannelManager.scala
index f17d976..f79c1dc 100644
--- a/core/src/main/scala/kafka/controller/ControllerChannelManager.scala
+++ b/core/src/main/scala/kafka/controller/ControllerChannelManager.scala
@@ -211,7 +211,8 @@ class ControllerBrokerRequestBatch(controller: KafkaController) extends Logging
       leaderAndIsrRequestMap(brokerId).put((topic, partition),
         PartitionStateInfo(leaderIsrAndControllerEpoch, replicas.toSet))
     }
-    addUpdateMetadataRequestForBrokers(controllerContext.liveOrShuttingDownBrokerIds.toSeq)
+    addUpdateMetadataRequestForBrokers(controllerContext.liveOrShuttingDownBrokerIds.toSeq,
+                                       Set(TopicAndPartition(topic, partition)))
   }
 
   def addStopReplicaRequestForBrokers(brokerIds: Seq[Int], topic: String, partition: Int, deletePartition: Boolean,
@@ -232,34 +233,40 @@ class ControllerBrokerRequestBatch(controller: KafkaController) extends Logging
   *
   */
  def addUpdateMetadataRequestForBrokers(brokerIds: Seq[Int],
+                                        partitions: collection.Set[TopicAndPartition] = Set.empty[TopicAndPartition],
                                         callback: (RequestOrResponse) => Unit = null) {
-    val partitionList = controllerContext.partitionLeadershipInfo.keySet.dropWhile(
-      p => controller.deleteTopicManager.isTopicQueuedUpForDeletion(p.topic))
-    if(partitionList.size > 0) {
-      partitionList.foreach { partition =>
-        val leaderIsrAndControllerEpochOpt = controllerContext.partitionLeadershipInfo.get(partition)
-        leaderIsrAndControllerEpochOpt match {
-          case Some(leaderIsrAndControllerEpoch) =>
-            val replicas = controllerContext.partitionReplicaAssignment(partition).toSet
-            val partitionStateInfo = PartitionStateInfo(leaderIsrAndControllerEpoch, replicas)
-            brokerIds.filter(b => b >= 0).foreach { brokerId =>
-              updateMetadataRequestMap.getOrElseUpdate(brokerId, new mutable.HashMap[TopicAndPartition, PartitionStateInfo])
-              updateMetadataRequestMap(brokerId).put(partition, partitionStateInfo)
-            }
-          case None =>
-            info("Leader not assigned yet for partition %s. Skip sending udpate metadata request".format(partition))
-        }
-      }
-    } else {
-      if(controllerContext.partitionLeadershipInfo.keySet.size > 0) {
-        // last set of topics are being deleted
-        controllerContext.partitionLeadershipInfo.foreach { case(partition, leaderIsrAndControllerEpoch) =>
+    def updateMetadataRequestMapFor(partition: TopicAndPartition, beingDeleted: Boolean) {
+      val leaderIsrAndControllerEpochOpt = controllerContext.partitionLeadershipInfo.get(partition)
+      leaderIsrAndControllerEpochOpt match {
+        case Some(leaderIsrAndControllerEpoch) =>
+          val replicas = controllerContext.partitionReplicaAssignment(partition).toSet
+          val partitionStateInfo = if (beingDeleted) {
+            val leaderAndIsr = new LeaderAndIsr(LeaderAndIsr.LeaderDuringDelete, leaderIsrAndControllerEpoch.leaderAndIsr.isr)
+            PartitionStateInfo(LeaderIsrAndControllerEpoch(leaderAndIsr, leaderIsrAndControllerEpoch.controllerEpoch), replicas)
+          } else {
+            PartitionStateInfo(leaderIsrAndControllerEpoch, replicas)
+          }
           brokerIds.filter(b => b >= 0).foreach { brokerId =>
-            updateMetadataRequestMap.put(brokerId, new mutable.HashMap[TopicAndPartition, PartitionStateInfo])
+            updateMetadataRequestMap.getOrElseUpdate(brokerId, new mutable.HashMap[TopicAndPartition, PartitionStateInfo])
+            updateMetadataRequestMap(brokerId).put(partition, partitionStateInfo)
           }
-        }
+        case None =>
+          info("Leader not yet assigned for partition %s. Skip sending UpdateMetadataRequest.".format(partition))
       }
     }
+
+    val filteredPartitions = {
+      val givenPartitions = if (partitions.isEmpty)
+        controllerContext.partitionLeadershipInfo.keySet
+      else
+        partitions
+      if (controller.deleteTopicManager.partitionsToBeDeleted.isEmpty)
+        givenPartitions
+      else
+        givenPartitions -- controller.deleteTopicManager.partitionsToBeDeleted
+    }
+    filteredPartitions.foreach(partition => updateMetadataRequestMapFor(partition, beingDeleted = false))
+    controller.deleteTopicManager.partitionsToBeDeleted.foreach(partition => updateMetadataRequestMapFor(partition, beingDeleted = true))
   }
 
   def sendRequestsToBrokers(controllerEpoch: Int, correlationId: Int) {
diff --git a/core/src/main/scala/kafka/controller/KafkaController.scala b/core/src/main/scala/kafka/controller/KafkaController.scala
index e225226..7057b53 100644
--- a/core/src/main/scala/kafka/controller/KafkaController.scala
+++ b/core/src/main/scala/kafka/controller/KafkaController.scala
@@ -554,7 +554,7 @@ class KafkaController(val config : KafkaConfig, zkClient: ZkClient) extends Logg
     info("Removed partition %s from the list of reassigned partitions in zookeeper".format(topicAndPartition))
     controllerContext.partitionsBeingReassigned.remove(topicAndPartition)
     //12. After electing leader, the replicas and isr information changes, so resend the update metadata request to every broker
-    sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq)
+    sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq, Set(topicAndPartition))
     // signal delete topic thread if reassignment for some partitions belonging to topics being deleted just completed
     deleteTopicManager.resumeDeletionForTopics(Set(topicAndPartition.topic))
   }
@@ -935,9 +935,9 @@ class KafkaController(val config : KafkaConfig, zkClient: ZkClient) extends Logg
    * metadata requests
    * @param brokers The brokers that the update metadata request should be sent to
    */
-  def sendUpdateMetadataRequest(brokers: Seq[Int]) {
+  def sendUpdateMetadataRequest(brokers: Seq[Int], partitions: Set[TopicAndPartition] = Set.empty[TopicAndPartition]) {
     brokerRequestBatch.newBatch()
-    brokerRequestBatch.addUpdateMetadataRequestForBrokers(brokers)
+    brokerRequestBatch.addUpdateMetadataRequestForBrokers(brokers, partitions)
     brokerRequestBatch.sendRequestsToBrokers(epoch, controllerContext.correlationId.getAndIncrement)
   }
 
@@ -969,7 +969,7 @@ class KafkaController(val config : KafkaConfig, zkClient: ZkClient) extends Logg
             "controller was elected with epoch %d. Aborting state change by this controller".format(controllerEpoch))
         if (leaderAndIsr.isr.contains(replicaId)) {
           // if the replica to be removed from the ISR is also the leader, set the new leader value to -1
-          val newLeader = if(replicaId == leaderAndIsr.leader) -1 else leaderAndIsr.leader
+          val newLeader = if (replicaId == leaderAndIsr.leader) LeaderAndIsr.NoLeader else leaderAndIsr.leader
           val newLeaderAndIsr = new LeaderAndIsr(newLeader, leaderAndIsr.leaderEpoch + 1,
             leaderAndIsr.isr.filter(b => b != replicaId), leaderAndIsr.zkVersion + 1)
           // update the new leadership decision in zookeeper or retry
diff --git a/core/src/main/scala/kafka/controller/TopicDeletionManager.scala b/core/src/main/scala/kafka/controller/TopicDeletionManager.scala
index 40c4c57..c995ec0 100644
--- a/core/src/main/scala/kafka/controller/TopicDeletionManager.scala
+++ b/core/src/main/scala/kafka/controller/TopicDeletionManager.scala
@@ -72,12 +72,13 @@ class TopicDeletionManager(controller: KafkaController,
   val controllerContext = controller.controllerContext
   val partitionStateMachine = controller.partitionStateMachine
   val replicaStateMachine = controller.replicaStateMachine
-  var topicsToBeDeleted: mutable.Set[String] = mutable.Set.empty[String] ++ initialTopicsToBeDeleted
+  val topicsToBeDeleted: mutable.Set[String] = mutable.Set.empty[String] ++ initialTopicsToBeDeleted
+  val partitionsToBeDeleted: mutable.Set[TopicAndPartition] = topicsToBeDeleted.flatMap(controllerContext.partitionsForTopic)
   val deleteLock = new ReentrantLock()
-  var topicsIneligibleForDeletion: mutable.Set[String] = mutable.Set.empty[String] ++
+  val topicsIneligibleForDeletion: mutable.Set[String] = mutable.Set.empty[String] ++
     (initialTopicsIneligibleForDeletion & initialTopicsToBeDeleted)
   val deleteTopicsCond = deleteLock.newCondition()
-  var deleteTopicStateChanged: AtomicBoolean = new AtomicBoolean(false)
+  val deleteTopicStateChanged: AtomicBoolean = new AtomicBoolean(false)
   var deleteTopicsThread: DeleteTopicsThread = null
   val isDeleteTopicEnabled = controller.config.deleteTopicEnable
 
@@ -99,6 +100,7 @@ class TopicDeletionManager(controller: KafkaController,
     if(isDeleteTopicEnabled) {
       deleteTopicsThread.shutdown()
       topicsToBeDeleted.clear()
+      partitionsToBeDeleted.clear()
      topicsIneligibleForDeletion.clear()
     }
   }
@@ -112,6 +114,7 @@ class TopicDeletionManager(controller: KafkaController,
   def enqueueTopicsForDeletion(topics: Set[String]) {
     if(isDeleteTopicEnabled) {
       topicsToBeDeleted ++= topics
+      partitionsToBeDeleted ++= topics.flatMap(controllerContext.partitionsForTopic)
       resumeTopicDeletionThread()
     }
   }
@@ -264,6 +267,7 @@ class TopicDeletionManager(controller: KafkaController,
     partitionStateMachine.handleStateChanges(partitionsForDeletedTopic, OfflinePartition)
     partitionStateMachine.handleStateChanges(partitionsForDeletedTopic, NonExistentPartition)
     topicsToBeDeleted -= topic
+    partitionsToBeDeleted.retain(_.topic != topic)
     controllerContext.zkClient.deleteRecursive(ZkUtils.getTopicPath(topic))
     controllerContext.zkClient.deleteRecursive(ZkUtils.getTopicConfigPath(topic))
     controllerContext.zkClient.delete(ZkUtils.getDeleteTopicPath(topic))
@@ -277,7 +281,8 @@ class TopicDeletionManager(controller: KafkaController,
   private def onTopicDeletion(topics: Set[String]) {
     info("Topic deletion callback for %s".format(topics.mkString(",")))
     // send update metadata so that brokers stop serving data for topics to be deleted
-    controller.sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq)
+    val partitions = topics.flatMap(controllerContext.partitionsForTopic)
+    controller.sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq, partitions)
     val partitionReplicaAssignmentByTopic = controllerContext.partitionReplicaAssignment.groupBy(p => p._1.topic)
     topics.foreach { topic =>
       onPartitionDeletion(partitionReplicaAssignmentByTopic(topic).map(_._1).toSet)
@@ -322,8 +327,8 @@ class TopicDeletionManager(controller: KafkaController,
   /**
    * This callback is invoked by the delete topic callback with the list of partitions for topics to be deleted
    * It does the following -
-   * 1. Send UpdateMetadataRequest to all live brokers (that are not shutting down) with all partitions except those for
-   *    which the topics are being deleted. The brokers start rejecting all client requests with UnknownTopicOrPartitionException
+   * 1. Send UpdateMetadataRequest to all live brokers (that are not shutting down) for partitions that are being
+   *    deleted. The brokers start rejecting all client requests with UnknownTopicOrPartitionException
    * 2. Move all replicas for the partitions to OfflineReplica state. This will send StopReplicaRequest to the replicas
    *    and LeaderAndIsrRequest to the leader with the shrunk ISR. When the leader replica itself is moved to OfflineReplica state,
    *    it will skip sending the LeaderAndIsrRequest since the leader will be updated to -1
diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala
index b9d2260..2d4946f 100644
--- a/core/src/main/scala/kafka/server/KafkaApis.scala
+++ b/core/src/main/scala/kafka/server/KafkaApis.scala
@@ -27,7 +27,6 @@ import scala.collection._
 import java.util.concurrent.TimeUnit
 import java.util.concurrent.atomic._
 import kafka.metrics.KafkaMetricsGroup
-import org.I0Itec.zkclient.ZkClient
 import kafka.common._
 import kafka.utils.{ZkUtils, Pool, SystemTime, Logging}
 import kafka.network.RequestChannel.Response
@@ -144,23 +143,25 @@ class KafkaApis(val requestChannel: RequestChannel,
                                        stateChangeLogger: StateChangeLogger) {
     inLock(partitionMetadataLock.writeLock()) {
       updateMetadataRequest.aliveBrokers.foreach(b => aliveBrokers.put(b.id, b))
+      val topicsToDelete = mutable.Set[String]()
       updateMetadataRequest.partitionStateInfos.foreach { partitionState =>
-        addPartitionInfoInternal(partitionState._1.topic, partitionState._1.partition, partitionState._2)
-        stateChangeLogger.trace(("Broker %d cached leader info %s for partition %s in response to UpdateMetadata request " +
-          "sent by controller %d epoch %d with correlation id %d").format(brokerId, partitionState._2, partitionState._1,
-          updateMetadataRequest.controllerId, updateMetadataRequest.controllerEpoch, updateMetadataRequest.correlationId))
+        if (partitionState._2.leaderIsrAndControllerEpoch.leaderAndIsr.leader == LeaderAndIsr.LeaderDuringDelete) {
+          topicsToDelete.add(partitionState._1.topic)
+        } else {
+          addPartitionInfoInternal(partitionState._1.topic, partitionState._1.partition, partitionState._2)
+          stateChangeLogger.trace(("Broker %d cached leader info %s for partition %s in response to " +
+            "UpdateMetadata request sent by controller %d epoch %d with correlation id %d")
+            .format(brokerId, partitionState._2, partitionState._1, updateMetadataRequest.controllerId,
+              updateMetadataRequest.controllerEpoch, updateMetadataRequest.correlationId))
+        }
       }
-      // remove the topics that don't exist in the UpdateMetadata request since those are the topics that are
-      // currently being deleted by the controller
-      val topicsKnownToThisBroker = cache.keySet
-      val topicsKnownToTheController = updateMetadataRequest.partitionStateInfos.map {
-        case(topicAndPartition, partitionStateInfo) => topicAndPartition.topic }.toSet
-      val deletedTopics = topicsKnownToThisBroker -- topicsKnownToTheController
-      deletedTopics.foreach { topic =>
+
+      topicsToDelete.foreach { topic =>
         cache.remove(topic)
-        stateChangeLogger.trace(("Broker %d deleted partitions for topic %s from metadata cache in response to UpdateMetadata request " +
-          "sent by controller %d epoch %d with correlation id %d").format(brokerId, topic,
-          updateMetadataRequest.controllerId, updateMetadataRequest.controllerEpoch, updateMetadataRequest.correlationId))
+        stateChangeLogger.trace(("Broker %d deleted partitions for topic %s from metadata cache in response to " +
+          "UpdateMetadata request sent by controller %d epoch %d with correlation id %d")
+          .format(brokerId, topic, updateMetadataRequest.controllerId,
+            updateMetadataRequest.controllerEpoch, updateMetadataRequest.correlationId))
       }
     }
   }
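
For reviewers skimming the patch, the broker-side change in KafkaApis.scala is the crux: partitions of topics being deleted now arrive in the UpdateMetadataRequest carrying the sentinel leader LeaderDuringDelete (-2), and the broker drops those topics from its metadata cache instead of inferring deletions by diffing its cache against the controller's full topic list. The following is a minimal, self-contained sketch of that idea only; the object and class names (MetadataCacheSketch, PartitionState) are hypothetical and are not the real KafkaApis/metadata-cache types.

// Sketch of sentinel-driven metadata cache updates, assuming simplified types.
object MetadataCacheSketch {
  val LeaderDuringDelete: Int = -2   // same sentinel value the patch adds to LeaderAndIsr

  case class TopicAndPartition(topic: String, partition: Int)
  case class PartitionState(leader: Int, isr: List[Int])

  // topic -> (partition -> state), standing in for the broker's metadata cache
  private val cache = scala.collection.mutable.Map[String, scala.collection.mutable.Map[Int, PartitionState]]()

  def updateCache(partitionStates: Map[TopicAndPartition, PartitionState]): Unit = {
    val topicsToDelete = scala.collection.mutable.Set[String]()
    partitionStates.foreach { case (tp, state) =>
      if (state.leader == LeaderDuringDelete) {
        // topic is being deleted: remember it instead of caching the partition
        topicsToDelete += tp.topic
      } else {
        val topicCache = cache.getOrElseUpdate(tp.topic, scala.collection.mutable.Map[Int, PartitionState]())
        topicCache(tp.partition) = state
      }
    }
    // stop serving metadata for topics marked as being deleted
    topicsToDelete.foreach(cache.remove)
  }

  def main(args: Array[String]): Unit = {
    updateCache(Map(TopicAndPartition("a", 0) -> PartitionState(1, List(1, 2))))
    updateCache(Map(TopicAndPartition("a", 0) -> PartitionState(LeaderDuringDelete, List(1, 2))))
    println(cache)   // Map() -- topic "a" was removed once the delete marker arrived
  }
}

The design point this illustrates: deletion is now signaled explicitly per partition, so the controller can send UpdateMetadataRequests scoped to the affected partitions rather than the full partition list, and brokers no longer misinterpret a partial request as "every other topic was deleted".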